## German

In [1]:
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta

# Initialize Faker once
fake = Faker("de_CH")
Faker.seed(42)
random.seed(42)

# Canton and District Mapping
canton_districts = {
    "Argovie (AG)": ["Aarau", "Baden", "Bremgarten", "Brugg", "Kulm", "Laufenburg", "Lenzburg", "Muri", "Rheinfelden", "Zofingue", "Zurzach"],
    "Appenzell": ["Appenzell", "Gonten", "Oberegg"],
    "Bâle-Campagne": ["Arlesheim", "Laufen", "Liestal", "Sissach", "Waldenburg"],
    "Bâle-Ville (BS)": ["Bâle-Ville"],
    "Berne (BE)": ["Berne-Mittelland", "Bienne", "Emmental", "Frutigen-Niedersimmental", "Interlaken-Oberhasli", "Jura bernois", "Oberaargau", "Obersimmental-Saanen", "Seeland", "Thoune"],
    "Fribourg (FR)": ["Broye", "Glâne", "Gruyère", "Sarine", "Lac", "Singine", "Veveyse"],
    "Genève (GE)": ["Genève"],
    "Glaris (GL)": ["Glaris"],
    "Grisons (GR)": ["Albula", "Bernina", "Engadine Basse/Val Müstair", "Imboden", "Landquart", "Maloja", "Moesa", "Plessur", "Prättigau/Davos", "Surselva", "Viamala"],
    "Jura (JU)": ["Delémont", "Porrentruy", "Franches-Montagnes"],
    "Lucerne (LU)": ["Entlebuch", "Hochdorf", "Lucerne-Campagne", "Lucerne-Ville", "Sursee", "Willisau"],
    "Neuchâtel (NE)": ["Boudry", "La Chaux-de-Fonds", "Le Locle", "Neuchâtel", "Val-de-Ruz", "Val-de-Travers"],
    "Nidwald (NW)": ["Nidwald"],
    "Obwald (OW)": ["Obwald"],
    "Schaffhouse (SH)": ["Stein", "Schaffhouse", "Schleitheim"],
    "Schwytz (SZ)": ["Einsiedeln", "Gersau", "Höfe", "Küssnacht", "March", "Schwytz"],
    "Soleure (SO)": ["Wasseramt-Bucheggberg", "Dorneck-Thierstein", "Thal-Gäu", "Olten-Gösgen", "Soleure-Lebern"],
    "Saint-Gall (SG)": ["Rheintal", "Rorschach", "Sarganserland", "See-Gaster", "Saint-Gall", "Toggenburg", "Werdenberg", "Wil"],
    "Thurgovie (TG)": ["Arbon", "Frauenfeld", "Kreuzlingen", "Münchwilen", "Weinfelden"],
    "Tessin (TI)": ["Bellinzone", "Blenio", "Léventine", "Locarno", "Lugano", "Mendrisio", "Riviera", "Vallemaggia"],
    "Uri (UR)": ["Uri"],
    "Valais (VS)": ["Brigue", "Conthey", "Entremont", "Goms", "Hérens", "Loèche", "Martigny", "Monthey", "Saint-Maurice", "Sierre", "Sion", "Viège", "Rarogne"],
    "Vaud (VD)": ["Aigle", "Broye-Vully", "Gros-de-Vaud", "Jura-Nord vaudois", "Lausanne", "Lavaux-Oron", "Morges", "Nyon", "Riviera-Pays-d’Enhaut", "Ouest Lausannois"],
    "Zoug (ZG)": ["Zoug"],
    "Zurich (ZH)": ["Andelfingen", "Affoltern", "Bülach", "Dielsdorf", "Dietikon", "Hinwil", "Horgen", "Meilen", "Pfäffikon", "Uster", "Winterthour", "Zurich"]
}

# Relative sampling weight of each canton for the German data set; the values
# are passed to random.choices as weights when a unique record is generated.
# NOTE(review): "Argovie (AG)" is a key of canton_districts but has no weight
# here, so it can never be drawn — confirm this is intended.
kanton_distribution = {
    "Zurich (ZH)": 15, "Berne (BE)": 12, "Vaud (VD)": 10, "Bâle-Ville (BS)": 7,
    "Fribourg (FR)": 5, "Saint-Gall (SG)": 5, "Valais (VS)": 6, "Genève (GE)": 5,
    "Tessin (TI)": 4, "Grisons (GR)": 4, "Lucerne (LU)": 4, "Appenzell": 3,
    "Bâle-Campagne": 3, "Schwytz (SZ)": 3, "Schaffhouse (SH)": 2, "Thurgovie (TG)": 2,
    "Jura (JU)": 2, "Soleure (SO)": 2, "Neuchâtel (NE)": 1, "Glaris (GL)": 1,
    "Uri (UR)": 1, "Nidwald (NW)": 1, "Obwald (OW)": 1, "Zoug (ZG)": 1
}

# Category values written to the German columns (Geschlecht, Altersgruppe,
# Leistung, Unterstützung). Their sampling weights are given positionally
# where the records are generated, so the order of these lists matters.
genders = ["Weiblich", "Männlich", "Andere"]
ages = ["16–22", "23–28", "29–34", "35–40", "41–46", "47–52", "53–58", "59–64", "65–70", ">70"]
schemes = [
    "Alters- und Hinterlassenenversicherung (AHV)", "Invalidenversicherung (IV)",
    "Arbeitslosenversicherung (ALV)", "Familienzulagen (FA)",
    "Kranken-/Unfallversicherung (KVG/UVG)", "Berufliche Vorsorge (BVG)"
]
assistance_issues = [
    "Zahlung ausstehend", "Problem mit Leistung", "Behindertenhilfe",
    "Bestechung verlangt", "Hilfe bei Antrag", "Website-/App-Problem", "Anderer Grund",
    "SMS zur Genehmigung fehlt", "Formularproblem", "Dokument-/Bankproblem"
]

# Every calendar day from start_date to end_date, both inclusive.
start_date = datetime.strptime("10.10.2024", "%d.%m.%Y")
end_date = datetime.strptime("31.01.2025", "%d.%m.%Y")
date_list = [(start_date + timedelta(days=x)) for x in range((end_date - start_date).days + 1)]

# Base settings with daily variance
base_records_per_day = 15000
# One record count per day, drawn within +/-5 % of the base value.
daily_record_counts = [random.randint(int(base_records_per_day * 0.95), int(base_records_per_day * 1.05)) for _ in date_list]
total_records = sum(daily_record_counts)
# Share of rows that reuse an already generated person (repeat contacts).
repeat_percentage = 0.35
repeat_records = int(total_records * repeat_percentage)
unique_records = total_records - repeat_records
# Share of rows that get trailing fields blanked to simulate abandoned chats.
incomplete_percentage = 0.40
incomplete_records = int(total_records * incomplete_percentage)

# Columns in the order they are considered when blanking an incomplete chat:
# everything from a random cutoff position onwards is emptied.
sequential_fields = ["Sprach", "Kanton", "Bezirk", "Geschlecht", "Altersgruppe", "Leistung", "Unterstützung"]

data = []
unique_persons = []

# Prepare per-date list for unique record assignment: each day appears once
# per record planned for it in daily_record_counts, then the list is shuffled.
record_dates = []
for date, count in zip(date_list, daily_record_counts):
    record_dates.extend([date] * count)

random.shuffle(record_dates)

# Sampling tables never change between records, so build them once instead of
# re-creating the literal lists on every iteration.
kanton_list = list(kanton_distribution.keys())
kanton_weights = [kanton_distribution[k] for k in kanton_list]
gender_weights = [38, 61, 1]
age_weights = [5, 10, 15, 20, 15, 10, 10, 10, 3, 2]
scheme_weights = [30, 20, 15, 10, 15, 10]
issue_weights = [25, 15, 10, 2, 20, 15, 5, 3, 3, 2]


def _german_row(name, mobile, canton, district, gender, age, date):
    """Build one chat row for a person.

    The person attributes are taken as given; the scheme, the issue and the
    time of day within ``date`` are drawn at random for every row.
    """
    scheme = random.choices(schemes, weights=scheme_weights)[0]
    issue = random.choices(assistance_issues, weights=issue_weights)[0]
    time = timedelta(seconds=random.randint(0, 86399))
    return {
        "Name": name,
        "Telefonnummer": mobile,
        "Sprach": "Deutsch",
        "Kanton": canton,
        "Bezirk": district,
        "Geschlecht": gender,
        "Altersgruppe": age,
        "Leistung": scheme,
        "Unterstützung": issue,
        "Aktualisierungszeitpunkt": (date + time).strftime("%d.%m.%Y %H:%M:%S")
    }


# Step 1: Generate unique records
for date in record_dates[:unique_records]:
    name = fake.name()
    # One Faker call per person: the four digit pairs are sliced from a single
    # MSISDN instead of generating four separate numbers and using a fragment
    # of each.
    msisdn = fake.msisdn()
    mobile = f"+41 {msisdn[3:5]} {msisdn[5:7]} {msisdn[7:9]} {msisdn[9:11]}"

    canton = random.choices(kanton_list, weights=kanton_weights, k=1)[0]
    # Uniform pick. The previous per-record random weights were regenerated on
    # every draw and therefore averaged out to a uniform choice anyway.
    district = random.choice(canton_districts[canton])

    gender = random.choices(genders, weights=gender_weights)[0]
    age = random.choices(ages, weights=age_weights)[0]

    data.append(_german_row(name, mobile, canton, district, gender, age, date))
    unique_persons.append((name, mobile, canton, district, gender, age))

# Step 2: Generate repeat records
# NOTE(review): repeat dates are drawn uniformly from date_list, so the unused
# tail of record_dates (and with it daily_record_counts) does not shape these
# rows — confirm that this is intended.
repeat_dates = random.choices(date_list, k=repeat_records)

for date in repeat_dates:
    # Reuse the identity and demographics of an existing person.
    name, mobile, canton, district, gender, age = random.choice(unique_persons)
    data.append(_german_row(name, mobile, canton, district, gender, age, date))
# Shuffle the data
random.shuffle(data)

# Convert to DataFrame
df = pd.DataFrame(data)

# Step 3: Simulate incomplete chats
# Each selected row keeps the fields before a random cutoff and loses the rest.
indices_to_blank = random.sample(range(len(df)), k=incomplete_records)
for idx in indices_to_blank:
    # randrange excludes len(sequential_fields), so at least the last field is
    # always blanked. The former randint(0, len(...)) was inclusive and left
    # one in eight "incomplete" rows fully populated.
    cutoff = random.randrange(len(sequential_fields))
    for field in sequential_fields[cutoff:]:
        df.at[idx, field] = ""

# Save to Excel
output_path = r"C:\Users\hp\Desktop\Github\Power-BI\1 ChatBot Journey Insights and Performance Dashboard\1 RAW DATA\german_dummy_data.xlsx"


# A worksheet holds at most 1,048,576 rows and to_excel also writes a header
# row, so each sheet can take one data row less than the sheet limit.
excel_sheet_row_limit = 1048576
max_rows = excel_sheet_row_limit - 1
# Ceiling division avoids an empty trailing sheet when len(df) is an exact
# multiple of max_rows; an empty frame still gets one header-only sheet.
num_chunks = max(1, -(-len(df) // max_rows))

with pd.ExcelWriter(output_path) as writer:
    for i in range(num_chunks):
        start_row = i * max_rows
        end_row = min((i + 1) * max_rows, len(df))
        df.iloc[start_row:end_row].to_excel(writer, sheet_name=f'Sheet{i+1}', index=False)


## French

In [2]:
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta

# Initialize Faker
fake = Faker("fr_CH")
Faker.seed(42)
random.seed(42)

# Canton and District Mapping
canton_districts = {
    "Argovie (AG)": ["Aarau", "Baden", "Bremgarten", "Brugg", "Kulm", "Laufenburg", "Lenzburg", "Muri", "Rheinfelden", "Zofingue", "Zurzach"],
    "Appenzell": ["Appenzell", "Gonten", "Oberegg"],
    "Bâle-Campagne": ["Arlesheim", "Laufen", "Liestal", "Sissach", "Waldenburg"],
    "Bâle-Ville (BS)": ["Bâle-Ville"],
    "Berne (BE)": ["Berne-Mittelland", "Bienne", "Emmental", "Frutigen-Niedersimmental", "Interlaken-Oberhasli", "Jura bernois", "Oberaargau", "Obersimmental-Saanen", "Seeland", "Thoune"],
    "Fribourg (FR)": ["Broye", "Glâne", "Gruyère", "Sarine", "Lac", "Singine", "Veveyse"],
    "Genève (GE)": ["Genève"],
    "Glaris (GL)": ["Glaris"],
    "Grisons (GR)": ["Albula", "Bernina", "Engadine Basse/Val Müstair", "Imboden", "Landquart", "Maloja", "Moesa", "Plessur", "Prättigau/Davos", "Surselva", "Viamala"],
    "Jura (JU)": ["Delémont", "Porrentruy", "Franches-Montagnes"],
    "Lucerne (LU)": ["Entlebuch", "Hochdorf", "Lucerne-Campagne", "Lucerne-Ville", "Sursee", "Willisau"],
    "Neuchâtel (NE)": ["Boudry", "La Chaux-de-Fonds", "Le Locle", "Neuchâtel", "Val-de-Ruz", "Val-de-Travers"],
    "Nidwald (NW)": ["Nidwald"],
    "Obwald (OW)": ["Obwald"],
    "Schaffhouse (SH)": ["Stein", "Schaffhouse", "Schleitheim"],
    "Schwytz (SZ)": ["Einsiedeln", "Gersau", "Höfe", "Küssnacht", "March", "Schwytz"],
    "Soleure (SO)": ["Wasseramt-Bucheggberg", "Dorneck-Thierstein", "Thal-Gäu", "Olten-Gösgen", "Soleure-Lebern"],
    "Saint-Gall (SG)": ["Rheintal", "Rorschach", "Sarganserland", "See-Gaster", "Saint-Gall", "Toggenburg", "Werdenberg", "Wil"],
    "Thurgovie (TG)": ["Arbon", "Frauenfeld", "Kreuzlingen", "Münchwilen", "Weinfelden"],
    "Tessin (TI)": ["Bellinzone", "Blenio", "Léventine", "Locarno", "Lugano", "Mendrisio", "Riviera", "Vallemaggia"],
    "Uri (UR)": ["Uri"],
    "Valais (VS)": ["Brigue", "Conthey", "Entremont", "Goms", "Hérens", "Loèche", "Martigny", "Monthey", "Saint-Maurice", "Sierre", "Sion", "Viège", "Rarogne"],
    "Vaud (VD)": ["Aigle", "Broye-Vully", "Gros-de-Vaud", "Jura-Nord vaudois", "Lausanne", "Lavaux-Oron", "Morges", "Nyon", "Riviera-Pays-d’Enhaut", "Ouest Lausannois"],
    "Zoug (ZG)": ["Zoug"],
    "Zurich (ZH)": ["Andelfingen", "Affoltern", "Bülach", "Dielsdorf", "Dietikon", "Hinwil", "Horgen", "Meilen", "Pfäffikon", "Uster", "Winterthour", "Zurich"]
}

# Custom relative weights for cantons (they sum to 1.06, not 1.0; random.choices only needs relative weights)
canton_weights = {
    "Argovie (AG)": 0.07,
    "Appenzell": 0.02,
    "Bâle-Campagne": 0.04,
    "Bâle-Ville (BS)": 0.03,
    "Berne (BE)": 0.12,
    "Fribourg (FR)": 0.05,
    "Genève (GE)": 0.06,
    "Glaris (GL)": 0.01,
    "Grisons (GR)": 0.03,
    "Jura (JU)": 0.02,
    "Lucerne (LU)": 0.06,
    "Neuchâtel (NE)": 0.03,
    "Nidwald (NW)": 0.01,
    "Obwald (OW)": 0.01,
    "Schaffhouse (SH)": 0.01,
    "Schwytz (SZ)": 0.03,
    "Soleure (SO)": 0.03,
    "Saint-Gall (SG)": 0.06,
    "Thurgovie (TG)": 0.04,
    "Tessin (TI)": 0.05,
    "Uri (UR)": 0.01,
    "Valais (VS)": 0.05,
    "Vaud (VD)": 0.10,
    "Zoug (ZG)": 0.02,
    "Zurich (ZH)": 0.1
}

# Parameters
# Category values written to the French columns; each *_weights list further
# down is positionally aligned with the list it belongs to.
genders = ["Femme", "Homme", "Autre"]
ages = ["16–22", "23–28", "29–34", "35–40", "41–46", "47–52", "53–58", "59–64", "65–70", ">70"]
schemes = [
    "Assurance vieillesse et survivants (AVS)", "Assurance invalidité (AI)",
    "Assurance chômage (AC)", "Allocations familiales (AF)",
    "Assurance maladie/accident (LAMal/LAA)", "Prévoyance professionnelle (LPP)"
]
assistance_issues = [
    "Paiement en attente", "Problème de prestation", "Aide handicapée",
    "Demande de pot-de-vin", "Assistance pour postuler", "Problème site/application", "Autre raison",
    "SMS d'approbation manquant", "Problème de formulaire", "Problème de document/compte"
]

# Every calendar day from start_date to end_date, both inclusive.
start_date = datetime.strptime("10.10.2024", "%d.%m.%Y")
end_date = datetime.strptime("31.01.2025", "%d.%m.%Y")
date_list = [(start_date + timedelta(days=x)) for x in range((end_date - start_date).days + 1)]

total_records = 419420
# Share of rows that reuse an already generated person (repeat contacts).
repeat_percentage = 0.35
repeat_records = int(total_records * repeat_percentage)
unique_records = total_records - repeat_records
# Share of rows that get trailing fields blanked to simulate abandoned chats.
incomplete_percentage = 0.40
incomplete_records = int(total_records * incomplete_percentage)

gender_weights = [0.42, 0.57, 0.01]
age_weights = [0.10, 0.15, 0.15, 0.15, 0.12, 0.10, 0.08, 0.05, 0.05, 0.05]
scheme_weights = [0.30, 0.20, 0.10, 0.10, 0.20, 0.10]
# The last four entries used to be 0.5, 0.3, 0.3, 0.2 — ten times too large —
# which made the four rarest issues the most frequent ones. The corrected list
# sums to 1.0 and matches the 25/15/10/2/20/15/5/3/3/2 split of the German data.
issue_weights = [0.25, 0.15, 0.10, 0.02, 0.20, 0.15, 0.05, 0.03, 0.03, 0.02]

# Columns in the order they are considered when blanking an incomplete chat.
sequential_fields = ["Langue", "Canton", "District", "Sexe", "Tranche d'âge", "Prestation", "Problème"]

# Generate data
data = []
unique_persons = []

# The canton sampling tables are loop-invariant; build them once instead of
# converting the dict views to lists for every single record.
canton_choices = list(canton_weights)
canton_choice_weights = list(canton_weights.values())


def _french_row(name, mobile, canton, district, gender, age):
    """Build one chat row for a person.

    The person attributes are taken as given; the scheme, the issue, the day
    and the time of day are drawn at random for every row.
    """
    scheme = random.choices(schemes, weights=scheme_weights, k=1)[0]
    issue = random.choices(assistance_issues, weights=issue_weights, k=1)[0]
    date = random.choice(date_list)
    time = timedelta(seconds=random.randint(0, 86399))
    return {
        "Nom": name,
        "Téléphone": mobile,
        # Repeat rows used to draw the language with weights [90, 0, 0], which
        # can only ever yield "Français"; the constant states that directly.
        "Langue": "Français",
        "Canton": canton,
        "District": district,
        "Sexe": gender,
        "Tranche d'âge": age,
        "Prestation": scheme,
        "Problème": issue,
        "Horodatage": (date + time).strftime("%d.%m.%Y %H:%M:%S")
    }


# Unique records
for _ in range(unique_records):
    name = fake.name()
    # One Faker call per person: the four digit pairs are sliced from a single
    # MSISDN instead of generating four separate numbers and using a fragment
    # of each.
    msisdn = fake.msisdn()
    mobile = f"+41 {msisdn[3:5]} {msisdn[5:7]} {msisdn[7:9]} {msisdn[9:11]}"
    canton = random.choices(canton_choices, weights=canton_choice_weights, k=1)[0]
    district = random.choice(canton_districts[canton])
    gender = random.choices(genders, weights=gender_weights, k=1)[0]
    age = random.choices(ages, weights=age_weights, k=1)[0]

    data.append(_french_row(name, mobile, canton, district, gender, age))
    unique_persons.append((name, mobile, canton, district, gender, age))

# Repeated records
for _ in range(repeat_records):
    # Reuse the identity and demographics of an existing person.
    name, mobile, canton, district, gender, age = random.choice(unique_persons)
    data.append(_french_row(name, mobile, canton, district, gender, age))

# Shuffle and blank some fields for incomplete records
random.shuffle(data)
df = pd.DataFrame(data)

# Each selected row keeps the fields before a random cutoff and loses the rest.
indices_to_blank = random.sample(range(len(df)), k=incomplete_records)
for idx in indices_to_blank:
    # randrange excludes len(sequential_fields), so at least the last field is
    # always blanked. The former randint(0, len(...)) was inclusive and left
    # one in eight "incomplete" rows fully populated.
    cutoff = random.randrange(len(sequential_fields))
    for field in sequential_fields[cutoff:]:
        df.at[idx, field] = ""

# Save to Excel
output_path = r"C:\Users\hp\Desktop\Github\Power-BI\1 ChatBot Journey Insights and Performance Dashboard\1 RAW DATA\french_dummy_data.xlsx"
df.to_excel(output_path, sheet_name="Données", index=False)


## English 

In [3]:
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta

# Initialize Faker
fake = Faker("en")
Faker.seed(42)
random.seed(42)

# Canton and District Mapping
canton_districts = {
    "Aargau (AG)": ["Aarau", "Baden", "Bremgarten", "Brugg", "Kulm", "Laufenburg", "Lenzburg", "Muri", "Rheinfelden", "Zofingen", "Zurzach"],
    "Appenzell": ["Appenzell", "Gonten", "Oberegg"],
    "Basel-Landschaft": ["Arlesheim", "Laufen", "Liestal", "Sissach", "Waldenburg"],
    "Basel-Stadt (BS)": ["Basel-Stadt"],
    "Bern (BE)": ["Bern-Mittelland", "Biel/Bienne", "Emmental", "Frutigen-Niedersimmental", "Interlaken-Oberhasli", "Jura bernois", "Oberaargau", "Obersimmental-Saanen", "Seeland", "Thun"],
    "Fribourg (FR)": ["Broye", "Glâne", "Gruyère", "Sarine", "Lac", "Singine", "Veveyse"],
    "Geneva (GE)": ["Geneva"],
    "Glarus (GL)": ["Glarus"],
    "Grisons (GR)": ["Albula", "Bernina", "Engadine Basse/Val Müstair", "Imboden", "Landquart", "Maloja", "Moesa", "Plessur", "Prättigau/Davos", "Surselva", "Viamala"],
    "Jura (JU)": ["Delémont", "Porrentruy", "Franches-Montagnes"],
    "Lucerne (LU)": ["Entlebuch", "Hochdorf", "Lucerne-Countryside", "Lucerne-City", "Sursee", "Willisau"],
    "Neuchâtel (NE)": ["Boudry", "La Chaux-de-Fonds", "Le Locle", "Neuchâtel", "Val-de-Ruz", "Val-de-Travers"],
    "Nidwalden (NW)": ["Nidwalden"],
    "Obwalden (OW)": ["Obwalden"],
    "Schaffhausen (SH)": ["Stein", "Schaffhausen", "Schleitheim"],
    "Schwyz (SZ)": ["Einsiedeln", "Gersau", "Höfe", "Küssnacht", "March", "Schwyz"],
    "Solothurn (SO)": ["Wasseramt-Bucheggberg", "Dorneck-Thierstein", "Thal-Gäu", "Olten-Gösgen", "Solothurn-Lebern"],
    "St. Gallen (SG)": ["Rheintal", "Rorschach", "Sarganserland", "See-Gaster", "St. Gallen", "Toggenburg", "Werdenberg", "Wil"],
    "Thurgau (TG)": ["Arbon", "Frauenfeld", "Kreuzlingen", "Münchwilen", "Weinfelden"],
    "Ticino (TI)": ["Bellinzona", "Blenio", "Leventina", "Locarno", "Lugano", "Mendrisio", "Riviera", "Vallemaggia"],
    "Uri (UR)": ["Uri"],
    "Valais (VS)": ["Brig", "Conthey", "Entremont", "Goms", "Hérens", "Leuk", "Martigny", "Monthey", "Saint-Maurice", "Sierre", "Sion", "Visp", "Raron"],
    "Vaud (VD)": ["Aigle", "Broye-Vully", "Gros-de-Vaud", "Jura-Nord vaudois", "Lausanne", "Lavaux-Oron", "Morges", "Nyon", "Riviera-Pays-d’Enhaut", "West Lausanne"],
    "Zug (ZG)": ["Zug"],
    "Zurich (ZH)": ["Andelfingen", "Affoltern", "Bülach", "Dielsdorf", "Dietikon", "Hinwil", "Horgen", "Meilen", "Pfäffikon", "Uster", "Winterthur", "Zurich"]
}

# Weights
# Each *_weights list is positionally aligned with the category list above it
# and is passed to random.choices when records are generated.
genders = ["Female", "Male", "Other"]
gender_weights = [0.45, 0.54, 0.01]

ages = ["16–22", "23–28", "29–34", "35–40", "41–46", "47–52", "53–58", "59–64", "65–70", ">70"]
age_weights = [0.05, 0.1, 0.15, 0.15, 0.1, 0.1, 0.1, 0.1, 0.05, 0.1]

schemes = [
    "Old-age and Survivors Insurance (OASI)", "Disability Insurance (DI)",
    "Unemployment Insurance (UI)", "Family Allowances (FA)",
    "Health/Accident Insurance (LAMal/LAA)", "Occupational Pension Plan (BVG)"
]
scheme_weights = [0.3, 0.2, 0.15, 0.1, 0.15, 0.1]

assistance_issues = [
    "Pending payment", "Benefit issue", "Disabled assistance",
    "Demand for bribe", "Help applying", "Website/Application trouble", "Other reason",
    "Missing approval SMS", "Form issue", "Document/account issue"
]
issue_weights = [0.15, 0.12, 0.1, 0.05, 0.1, 0.1, 0.08, 0.1, 0.1, 0.1]

# One random relative weight per canton, drawn once after random.seed(42) so
# the canton mix is the same on every run of this cell.
canton_names = list(canton_districts.keys())
canton_weights = [random.uniform(0.5, 2) for _ in canton_names]

# Date range
# Every calendar day from start_date to end_date, both inclusive.
start_date = datetime.strptime("10.10.2024", "%d.%m.%Y")
end_date = datetime.strptime("31.01.2025", "%d.%m.%Y")
date_list = [start_date + timedelta(days=i) for i in range((end_date - start_date).days + 1)]

# Configuration
total_records = 218828
# Share of rows that reuse an already generated person (repeat contacts).
repeat_percentage = 0.35
repeat_records = int(total_records * repeat_percentage)
unique_records = total_records - repeat_records
# Share of rows that get trailing fields blanked to simulate abandoned chats.
incomplete_percentage = 0.40
incomplete_records = int(total_records * incomplete_percentage)

# Columns in the order they are considered when blanking an incomplete chat.
sequential_fields = ["Language", "Canton", "District", "Gender", "Age Group", "Scheme", "Issue"]

# Data generation
data = []
unique_persons = []


def _english_row(name, mobile, canton, district, gender, age):
    """Build one chat row for a person.

    The person attributes are taken as given; the scheme, the issue, the day
    and the time of day are drawn at random for every row.
    """
    scheme = random.choices(schemes, weights=scheme_weights, k=1)[0]
    issue = random.choices(assistance_issues, weights=issue_weights, k=1)[0]
    datetime_value = random.choice(date_list) + timedelta(seconds=random.randint(0, 86399))
    return {
        "Name": name,
        "Phone": mobile,
        "Language": "English",
        "Canton": canton,
        "District": district,
        "Gender": gender,
        "Age Group": age,
        "Scheme": scheme,
        "Issue": issue,
        "Timestamp": datetime_value.strftime("%d.%m.%Y %H:%M:%S")
    }


# Unique records
for _ in range(unique_records):
    name = fake.name()
    # One Faker call per person: the four digit pairs are sliced from a single
    # MSISDN instead of generating four separate numbers and using a fragment
    # of each.
    msisdn = fake.msisdn()
    mobile = f"+41 {msisdn[3:5]} {msisdn[5:7]} {msisdn[7:9]} {msisdn[9:11]}"
    canton = random.choices(canton_names, weights=canton_weights, k=1)[0]
    district = random.choice(canton_districts[canton])
    gender = random.choices(genders, weights=gender_weights, k=1)[0]
    age = random.choices(ages, weights=age_weights, k=1)[0]

    data.append(_english_row(name, mobile, canton, district, gender, age))
    unique_persons.append((name, mobile, canton, district, gender, age))

# Repeated records using faster reuse logic
for _ in range(repeat_records):
    # Reuse the identity and demographics of an existing person.
    name, mobile, canton, district, gender, age = random.choice(unique_persons)
    data.append(_english_row(name, mobile, canton, district, gender, age))

# Shuffle and blank some fields for incomplete records
random.shuffle(data)
df = pd.DataFrame(data)

# Each selected row keeps the fields before a random cutoff and loses the rest.
indices_to_blank = random.sample(range(len(df)), k=incomplete_records)
for idx in indices_to_blank:
    # randrange excludes len(sequential_fields), so at least the last field is
    # always blanked. The former randint(0, len(...)) was inclusive and left
    # one in eight "incomplete" rows fully populated.
    cutoff = random.randrange(len(sequential_fields))
    for field in sequential_fields[cutoff:]:
        df.at[idx, field] = ""

# Save to Excel
output_path = r"C:\Users\hp\Desktop\Github\Power-BI\1 ChatBot Journey Insights and Performance Dashboard\1 RAW DATA\english_dummy_data.xlsx"
df.to_excel(output_path, sheet_name="Data", index=False)

print(f"Data successfully saved to {output_path}")


Data successfully saved to C:\Users\hp\Desktop\Github\Power-BI\1 ChatBot Journey Insights and Performance Dashboard\1 RAW DATA\english_dummy_data.xlsx


## Missed call data

In [6]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# --- PARAMETERS ---
# Seed both generators used in this cell so the output is reproducible, as in
# the other data-generation cells: the random module drives the synthetic
# columns, and pandas' sample() falls back to NumPy's global generator when no
# random_state is passed.
random.seed(42)
np.random.seed(42)

total_records = 985623
# Share of rows whose caller number is sampled from the existing chat data;
# the remaining rows get freshly generated numbers.
percent_from_files = 0.68
records_from_files = int(total_records * percent_from_files)
records_random = total_records - records_from_files

# --- Excel file path ---
multi_sheet_path = r"C:\Users\hp\Desktop\Github\Power-BI\1 ChatBot Journey Insights and Performance Dashboard\2 DAILY DATA CLEAN PROCESS CHAT BOT DATA\output_file_final.xlsx"

# --- Helper functions ---
def random_time(start_time="00:00:00", end_time="23:59:59"):
    """Return a random clock time between the two bounds, both inclusive.

    Both bounds and the result use the ``HH:MM:SS`` format. The offset from
    ``start_time`` is drawn in whole seconds with ``random.randint``.
    """
    clock_format = "%H:%M:%S"
    lower = datetime.strptime(start_time, clock_format)
    upper = datetime.strptime(end_time, clock_format)
    span_seconds = int((upper - lower).total_seconds())
    picked = lower + timedelta(seconds=random.randint(0, span_seconds))
    return picked.strftime(clock_format)

def random_date(start_date="10.10.2024", end_date="31.01.2025"):
    """Return a random day between the two bounds, both inclusive.

    The bounds are parsed as ``DD.MM.YYYY`` while the result is rendered as
    ``DD/MM/YYYY``, i.e. with slashes instead of dots.
    """
    input_format = "%d.%m.%Y"
    first_day = datetime.strptime(start_date, input_format)
    last_day = datetime.strptime(end_date, input_format)
    day_span = (last_day - first_day).days
    offset = timedelta(days=random.randint(0, day_span))
    return (first_day + offset).strftime("%d/%m/%Y")

def format_swiss_number(num_str):
    """Format a phone number as ``+41 XX XX XX XX`` from its last 8 digits.

    Non-digit characters are ignored. Values that are not strings are
    converted with ``str`` first.

    NOTE(review): inputs with fewer than 8 digits still yield a shortened,
    malformed number, exactly as before — confirm whether they can occur.
    """
    text = str(num_str).strip()
    # A numeric column that pandas read as float is stringified as
    # "41791234567.0"; without this guard the trailing "0" would be taken for
    # the last digit of the phone number and shift all digit pairs.
    if text.endswith(".0"):
        text = text[:-2]
    digits = ''.join(filter(str.isdigit, text))[-8:]  # Keep the last 8 digits
    return f"+41 {digits[:2]} {digits[2:4]} {digits[4:6]} {digits[6:]}"

# --- Load only 'Mobile Number' column from all sheets ---
# sheet_name=None returns a dict with one DataFrame per sheet; sheets without
# a 'Mobile Number' column are skipped.
sheets_dict = pd.read_excel(multi_sheet_path, sheet_name=None)
caller_dataframes = [
    sheet[["Mobile Number"]].dropna()
    for sheet in sheets_dict.values()
    if "Mobile Number" in sheet.columns
]

if len(caller_dataframes) == 0:
    raise ValueError("No 'Mobile Number' column found in any sheets.")

# Combine all mobile numbers into one DataFrame
all_callers_df = pd.concat(caller_dataframes, ignore_index=True)

# --- Sample 68% from existing mobile numbers (with duplicates allowed) ---
caller_sampled = (
    all_callers_df["Mobile Number"]
    .sample(n=records_from_files, replace=True)
    .reset_index(drop=True)
)

# The columns are generated one after another, in this order, so the sequence
# of random draws stays the same.
caller_column = caller_sampled.astype(str).apply(format_swiss_number)
dtmf_column = random.choices(range(10), k=records_from_files)
status_column = random.choices(["Answered", "NotAnswered", "Received"], k=records_from_files)
virtual_column = [f"+41-84556474{random.randint(1000, 9999)}" for _ in range(records_from_files)]
duration_column = [f"00:00:{random.randint(5, 59):02d}" for _ in range(records_from_files)]
date_column = [random_date() for _ in range(records_from_files)]
time_column = [random_time() for _ in range(records_from_files)]
pulse_column = [round(random.uniform(1.0, 3.0), 2) for _ in range(records_from_files)]

file_sampled = pd.DataFrame({
    "CALLER": caller_column,
    "DTMF CODE": dtmf_column,
    "CALL STATUS": status_column,
    "VIRTUAL NUMBER": virtual_column,
    "DURATION": duration_column,
    "DATE": date_column,
    "TIME": time_column,
    "PULSE COUNT": pulse_column,
})

# --- Generate 32% with unique numbers not in file_sampled ---
# Formatted numbers that are already taken; newly generated callers are
# checked against this set and added to it.
existing_callers_set = set(file_sampled["CALLER"])

def generate_unique_formatted_caller(existing_set):
    """Return a new ``+41 XX XX XX XX`` number that is not in ``existing_set``.

    Random 8-digit numbers are drawn until one is found whose formatted form
    is unused. That form is added to ``existing_set`` (the set is mutated in
    place) before it is returned.
    """
    candidate = None
    while candidate is None or candidate in existing_set:
        raw = str(random.randint(10000000, 99999999))  # 8-digit number
        candidate = "+41 " + " ".join(raw[pos:pos + 2] for pos in range(0, 8, 2))
    existing_set.add(candidate)
    return candidate

# Fresh caller numbers that collide neither with the sampled ones nor with
# each other (the helper records every number it hands out).
unique_callers = []
for _ in range(records_random):
    unique_callers.append(generate_unique_formatted_caller(existing_callers_set))

# Same synthetic columns as for the sampled callers, generated in the same
# order so the sequence of random draws stays the same.
dtmf_codes = random.choices(range(10), k=records_random)
call_statuses = random.choices(["Answered", "NotAnswered", "Received"], k=records_random)
virtual_numbers = [f"+41-84556474{random.randint(1000, 9999)}" for _ in range(records_random)]
durations = [f"00:00:{random.randint(5, 59):02d}" for _ in range(records_random)]
call_dates = [random_date() for _ in range(records_random)]
call_times = [random_time() for _ in range(records_random)]
pulse_counts = [round(random.uniform(1.0, 3.0), 2) for _ in range(records_random)]

random_data = pd.DataFrame({
    "CALLER": unique_callers,
    "DTMF CODE": dtmf_codes,
    "CALL STATUS": call_statuses,
    "VIRTUAL NUMBER": virtual_numbers,
    "DURATION": durations,
    "DATE": call_dates,
    "TIME": call_times,
    "PULSE COUNT": pulse_counts,
})

# --- Combine both datasets ---
final_df = pd.concat([file_sampled, random_data], ignore_index=True)

# --- Save to Excel ---
output_path = r"C:\Users\hp\Desktop\Github\Power-BI\1 ChatBot Journey Insights and Performance Dashboard\1 RAW DATA\missed_call_data.xlsx"
final_df.to_excel(output_path, index=False)

print("✅ Dummy data saved to missed_call_data.xlsx")


✅ Dummy data saved to missed_call_data.xlsx


## Missed call data filling

In [10]:
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta
import os

# Initialize Faker
fake = Faker("en")
Faker.seed(42)
random.seed(42)

# Canton and District Mapping
canton_districts = {
    "Aargau (AG)": ["Aarau", "Baden", "Bremgarten", "Brugg", "Kulm", "Laufenburg", "Lenzburg", "Muri", "Rheinfelden", "Zofingen", "Zurzach"],
    "Appenzell": ["Appenzell", "Gonten", "Oberegg"],
    "Basel-Landschaft": ["Arlesheim", "Laufen", "Liestal", "Sissach", "Waldenburg"],
    "Basel-Stadt (BS)": ["Basel-Stadt"],
    "Bern (BE)": ["Bern-Mittelland", "Biel/Bienne", "Emmental", "Frutigen-Niedersimmental", "Interlaken-Oberhasli", "Jura bernois", "Oberaargau", "Obersimmental-Saanen", "Seeland", "Thun"],
    "Fribourg (FR)": ["Broye", "Glâne", "Gruyère", "Sarine", "Lac", "Singine", "Veveyse"],
    "Geneva (GE)": ["Geneva"],
    "Glarus (GL)": ["Glarus"],
    "Grisons (GR)": ["Albula", "Bernina", "Engadine Basse/Val Müstair", "Imboden", "Landquart", "Maloja", "Moesa", "Plessur", "Prättigau/Davos", "Surselva", "Viamala"],
    "Jura (JU)": ["Delémont", "Porrentruy", "Franches-Montagnes"],
    "Lucerne (LU)": ["Entlebuch", "Hochdorf", "Lucerne-Countryside", "Lucerne-City", "Sursee", "Willisau"],
    "Neuchâtel (NE)": ["Boudry", "La Chaux-de-Fonds", "Le Locle", "Neuchâtel", "Val-de-Ruz", "Val-de-Travers"],
    "Nidwalden (NW)": ["Nidwalden"],
    "Obwalden (OW)": ["Obwalden"],
    "Schaffhausen (SH)": ["Stein", "Schaffhausen", "Schleitheim"],
    "Schwyz (SZ)": ["Einsiedeln", "Gersau", "Höfe", "Küssnacht", "March", "Schwyz"],
    "Solothurn (SO)": ["Wasseramt-Bucheggberg", "Dorneck-Thierstein", "Thal-Gäu", "Olten-Gösgen", "Solothurn-Lebern"],
    "St. Gallen (SG)": ["Rheintal", "Rorschach", "Sarganserland", "See-Gaster", "St. Gallen", "Toggenburg", "Werdenberg", "Wil"],
    "Thurgau (TG)": ["Arbon", "Frauenfeld", "Kreuzlingen", "Münchwilen", "Weinfelden"],
    "Ticino (TI)": ["Bellinzona", "Blenio", "Leventina", "Locarno", "Lugano", "Mendrisio", "Riviera", "Vallemaggia"],
    "Uri (UR)": ["Uri"],
    "Valais (VS)": ["Brig", "Conthey", "Entremont", "Goms", "Hérens", "Leuk", "Martigny", "Monthey", "Saint-Maurice", "Sierre", "Sion", "Visp", "Raron"],
    "Vaud (VD)": ["Aigle", "Broye-Vully", "Gros-de-Vaud", "Jura-Nord vaudois", "Lausanne", "Lavaux-Oron", "Morges", "Nyon", "Riviera-Pays-d’Enhaut", "West Lausanne"],
    "Zug (ZG)": ["Zug"],
    "Zurich (ZH)": ["Andelfingen", "Affoltern", "Bülach", "Dielsdorf", "Dietikon", "Hinwil", "Horgen", "Meilen", "Pfäffikon", "Uster", "Winterthur", "Zurich"]
}

# Category definitions
# Each *_weights list is positionally aligned with the category list above it
# and is passed to random.choices when the detail rows are generated.
genders = ["Female", "Male", "Other"]
gender_weights = [0.56, 0.42, 0.02]

ages = ["16–22", "23–28", "29–34", "35–40", "41–46", "47–52", "53–58", "59–64", "65–70", ">70"]
age_weights = [0.05, 0.1, 0.15, 0.15, 0.1, 0.1, 0.1, 0.1, 0.05, 0.1]

schemes = [
    "Old-age and Survivors Insurance (OASI)", "Disability Insurance (DI)",
    "Unemployment Insurance (UI)", "Family Allowances (FA)",
    "Health/Accident Insurance (LAMal/LAA)", "Occupational Pension Plan (BVG)"
]
scheme_weights = [0.3, 0.2, 0.15, 0.1, 0.15, 0.1]

# Values for the "Issue" column of the generated rows.
issues = [
    "Pending payment", "Benefit issue", "Disabled assistance", "Demand for bribe",
    "Help applying", "Website/Application trouble", "Other reason",
    "Missing approval SMS", "Form issue", "Document/account issue"
]
issue_weights = [0.15, 0.12, 0.1, 0.05, 0.1, 0.1, 0.08, 0.1, 0.1, 0.1]

# Load missed call data
input_file = r"C:\Users\hp\Desktop\Github\Power-BI\1 ChatBot Journey Insights and Performance Dashboard\1 RAW DATA\missed_call_data.xlsx"
call_data = pd.read_excel(input_file)

# Prepare phone numbers and timestamps
phone_numbers = call_data["CALLER"].tolist()
timestamps = pd.to_datetime(call_data["DATE"].astype(str) + " " + call_data["TIME"].astype(str), format="%d/%m/%Y %H:%M:%S")
# Plain list of the same timestamps for positional access and random.choice.
timestamp_pool = timestamps.tolist()

# The loops below cycle through the call log with a modulo, which would fail
# with an opaque ZeroDivisionError on an empty file.
if not phone_numbers:
    raise ValueError("Missed call data contains no rows.")


# Define total records to generate
total_records = 218828
# Share of rows that reuse an already generated person (repeat contacts).
repeat_percentage = 0.35
repeat_records = int(total_records * repeat_percentage)
unique_records = total_records - repeat_records

# Cantons weighted
canton_names = list(canton_districts.keys())
canton_weights = [random.uniform(0.5, 2) for _ in canton_names]


def _detail_row(name, phone, canton, district, gender, age, timestamp):
    """Build one detail row for a caller.

    The person attributes and the timestamp are taken as given; the scheme
    and the issue are drawn at random for every row.
    """
    scheme = random.choices(schemes, weights=scheme_weights, k=1)[0]
    issue = random.choices(issues, weights=issue_weights, k=1)[0]
    return {
        "Name": name,
        "Phone": phone,
        "Language": "English",
        "Canton": canton,
        "District": district,
        "Gender": gender,
        "Age Group": age,
        "Scheme": scheme,
        "Issue": issue,
        "Timestamp": timestamp.strftime("%d.%m.%Y %H:%M:%S")
    }


# Generate unique records
data = []
unique_persons = []
for i in range(unique_records):
    name = fake.name()
    # Phone and timestamp come from the same call-log row, cycling through the
    # log if it is shorter than the number of unique records.
    phone = phone_numbers[i % len(phone_numbers)]
    timestamp = timestamp_pool[i % len(timestamp_pool)]
    canton = random.choices(canton_names, weights=canton_weights, k=1)[0]
    district = random.choice(canton_districts[canton])
    gender = random.choices(genders, weights=gender_weights, k=1)[0]
    age = random.choices(ages, weights=age_weights, k=1)[0]

    data.append(_detail_row(name, phone, canton, district, gender, age, timestamp))
    # Keep the full profile with the person so repeat rows can copy it
    # directly. Looking it up again by scanning all of `data` for every repeat
    # row made this step quadratic in the number of rows.
    unique_persons.append((name, phone, canton, district, gender, age))

# Generate repeat records
for i in range(repeat_records):
    name, phone, canton, district, gender, age = random.choice(unique_persons)
    # Repeat contacts get the timestamp of a random missed call.
    timestamp = random.choice(timestamp_pool)
    data.append(_detail_row(name, phone, canton, district, gender, age, timestamp))

# Shuffle and create DataFrame
random.shuffle(data)
df = pd.DataFrame(data)

# Export to Excel
base_path = r"C:\Users\hp\Desktop\Github\Power-BI\1 ChatBot Journey Insights and Performance Dashboard\1 RAW DATA"
output_path = os.path.join(base_path, "missed_call_data_with_details.xlsx")
df.to_excel(output_path, sheet_name="Data", index=False)

print(f"Data successfully saved to {output_path}")


Data successfully saved to C:\Users\hp\Desktop\Github\Power-BI\1 ChatBot Journey Insights and Performance Dashboard\1 RAW DATA\missed_call_data_with_details.xlsx


## Internal call center

In [13]:
import pandas as pd
import random
from datetime import datetime, timedelta
from faker import Faker

fake = Faker()

# === File Paths ===
input_file = r"C:\Users\hp\Desktop\Github\Power-BI\1 ChatBot Journey Insights and Performance Dashboard\2 DAILY DATA CLEAN PROCESS CHAT BOT DATA\output_file_final.xlsx"
missed_call_data_file = r"C:\Users\hp\Desktop\Github\Power-BI\1 ChatBot Journey Insights and Performance Dashboard\1 RAW DATA\missed_call_data_with_details.xlsx"
output_file = r"C:\Users\hp\Desktop\Github\Power-BI\1 ChatBot Journey Insights and Performance Dashboard\1 RAW DATA\internal_call_center_data.xlsx"

# === Load and Validate Data ===
# sheet_name=None loads every sheet of the workbook; they are stacked into one frame.
all_sheets = pd.read_excel(input_file, sheet_name=None)
combined_df = pd.concat(all_sheets.values(), ignore_index=True)

required_columns = [
    'Name',
    'Mobile Number',
    'Age Group',
    'Language',
    'Gender',
    'Canton',
    'District',
    'Scheme',
    'Issue',
    'Chat Date',
]
# Collect every absent column so the error names all of them at once.
missing_cols = [name for name in required_columns if name not in combined_df.columns]
if missing_cols:
    raise ValueError(f"Missing required columns: {', '.join(missing_cols)}")
if 'Call center' not in combined_df.columns:
    raise ValueError("Missing 'Call center' column")

# === Filter and Sample ===
# Keep only rows routed to the internal call center (case-insensitive match).
is_internal = combined_df['Call center'].str.lower() == 'our call center'
filtered_df = combined_df[is_internal].copy()
if filtered_df.shape[0] < 159126:
    raise ValueError("Not enough rows to sample 159,126 calls.")

# Fixed random_state keeps the sampled rows the same from run to run.
sampled_df = filtered_df.sample(n=159126, random_state=42)
sampled_df = sampled_df.reset_index(drop=True)
sampled_df['Data From'] = 'chat data'

# Load the missed-call workbook and align its phone column with the chat data.
missed_call_df = pd.read_excel(missed_call_data_file)
missed_call_df.rename(columns={'Phone': 'Mobile Number'}, inplace=True)

# NOTE(review): only 'Phone' is renamed; a variant that also mapped
# 'Timestamp' to 'Chat Date' was disabled. If this workbook has no
# 'Chat Date' column, the missed-call rows will hold NaN there after the
# concat below -- confirm that this is intended.

if missed_call_df.shape[0] < 75448:
    raise ValueError("Not enough rows in missed call data to sample 75,448 calls.")
missed_sampled_df = missed_call_df.sample(n=75448, random_state=42)
missed_sampled_df = missed_sampled_df.reset_index(drop=True)
missed_sampled_df['Data From'] = 'missed call'

# === Combine Datasets ===
# Chat rows first, missed-call rows after, with a fresh 0..n-1 index.
combined_sampled_df = pd.concat(
    [sampled_df, missed_sampled_df],
    ignore_index=True,
)

# === Agent Names (Weighted Assignment) ===
# 25 agent ids SWZ001..SWZ025, each suffixed with a Faker first name.
agents = [f"SWZ{agent_no:03d}_{fake.first_name()}" for agent_no in range(1, 26)]
# One random weight per agent so the call volume differs between agents.
agent_weights = [random.uniform(0.5, 1.5) for _ in agents]
combined_sampled_df['Agent Name'] = random.choices(
    agents,
    weights=agent_weights,
    k=len(combined_sampled_df),
)

# === Define Questions and Comments ===
# Text pools that the call-population step draws from.

# Questions asked on enquiry-style calls.
enquiry_questions = [
    "How can I get help applying for OASI or DI benefits?",
    "Can I get assistance filling out the form for family allowance?",
    "What if I don’t have all the required documents to apply for disability insurance?",
    "How can I track the status of my unemployment benefits?",
    "Can someone help me apply if I don’t speak a Swiss national language?",
    "How can I correct a bank account error for my health insurance subsidy payment?",
    "What are the steps to apply for accident insurance under LAA?",
    "Is it possible to reapply for DI if my previous claim was rejected due to documentation issues?",
    "My account details changed—how do I update them across all my registered schemes?",
    "I’m confused between LAMal and LAA—how can I get advice on which applies to me?",
    "Is there an online portal where I can apply for multiple Swiss social schemes at once?",
    "What happens if I miss a submission deadline for updating my OASI documents?",
    "Can I apply for unemployment benefits if I quit my job voluntarily?",
    "Is it possible to get retroactive payments if my claim was approved late?",
    "What support exists for people with disabilities to complete their applications?",
    "My occupational pension shows a different balance than expected—how can I dispute this?",
    "Are there regional offices to help with benefit applications and documentation?",
    "Can I still get OASI benefits if my payment was delayed due to a wrong IBAN?",
    "I received a rejection letter for family allowance—can I appeal or correct it?",
]
enquiry_resolutions = [
    "resolution provided",
    "follow-up required",
]
enquiry_comments = [
    "Complete solve",
    "Already resolved",
    "Lack of documents",
    "Money received",
    "Complaint solved",
    "Issue already solved",
    "Call back 2 PM",
    "Call back 5 PM",
    "Call back 6 PM",
    "Problem solved",
]

# Questions raised on complaint-style calls.
complaint_questions = [
    "What should I do if I uploaded the wrong documents while applying for unemployment insurance?",
    "Why is my OASI payment still pending even though I applied months ago?",
    "Why am I receiving a reduced amount from my occupational pension (BVG)?",
    "Who do I contact if my family allowance payment hasn’t arrived?",
    "I submitted all documents but still haven’t received any benefits—what could be the issue?",
    "How do I file a complaint about a delay in my DI benefit?",
]
complaint_comments = [
    "Call escalate",
    "Approval not received",
]

# Dialer comments for answered calls that ended quickly.
short_comments = [
    "Voice issue",
    "Customer busy",
    "Hold call back",
    "Driving",
    "Call back busy",
    "Not interested",
    "System issue",
    "Voice break",
    "Wrong number",
    "Call back voice issue",
    "Customer not available",
    "Call back no response",
    "Call back later",
]

# Dialer status of an unanswered call -> the comment recorded with it.
unanswered_comments_map = {
    'NOT-ANSWERED': 'No response',
    'MISSED': 'Call disconnected',
    'ABANDONED': 'Call back no response',
}
# Iterating a dict yields its keys in insertion order.
unanswered_statuses = list(unanswered_comments_map)

# === Helper Functions ===
def generate_daytime_time_weighted():
    """Return a random ``datetime.time`` between 09:00:00 and 18:59:59.

    The hour is picked from a pool in which each hour appears a fixed number
    of times, so hours with more entries are drawn more often. Minute and
    second are uniform over 0-59.
    """
    # (hour, number of entries in the pool)
    hour_repeats = (
        (9, 1), (10, 3), (11, 3), (12, 2), (13, 3),
        (14, 3), (15, 2), (16, 2), (17, 1), (18, 1),
    )
    hour_pool = [hour for hour, repeats in hour_repeats for _ in range(repeats)]

    hour = random.choice(hour_pool)
    minute = random.randint(0, 59)
    second = random.randint(0, 59)
    # Only the time-of-day part is returned; the date used here is a placeholder.
    return datetime(1900, 1, 1, hour, minute, second).time()

def format_duration(min_sec_range=(180, 720)):
    """Return a random call duration formatted as ``MM:SS``.

    Args:
        min_sec_range: ``(lowest, highest)`` duration in seconds; both ends
            are inclusive.

    Returns:
        The drawn duration with minutes and seconds zero-padded to two digits.
    """
    lowest, highest = min_sec_range
    minutes, seconds = divmod(random.randint(lowest, highest), 60)
    return f"{minutes:02d}:{seconds:02d}"

def generate_call_datetime_skewed(chat_date):
    """Return a call timestamp string placed some days after ``chat_date``.

    The time of day comes from ``generate_daytime_time_weighted``. The day
    offset is the absolute value of a Gaussian draw (mean 5, sigma 3),
    truncated to a whole number of days.

    Args:
        chat_date: Anything ``pd.to_datetime`` accepts; only its date part
            is used.

    Returns:
        The call moment formatted as ``DD.MM.YYYY HH:MM:SS``.
    """
    time_of_day = generate_daytime_time_weighted()
    chat_day = pd.to_datetime(chat_date).date()
    delay = timedelta(days=int(abs(random.gauss(mu=5, sigma=3))))
    call_moment = datetime.combine(chat_day, time_of_day) + delay
    return call_moment.strftime("%d.%m.%Y %H:%M:%S")

# === RANDOM CALL TYPE ASSIGNMENT ===
# 62% full-flow calls, 15% short calls, and whatever remains is unanswered,
# so the three counts always add up to the number of rows.
n_calls = len(combined_sampled_df)
n_full_flow = round(0.62 * n_calls)
n_short = round(0.15 * n_calls)
n_unanswered = n_calls - n_full_flow - n_short

call_types_pool = (
    ['full_flow'] * n_full_flow
    + ['short'] * n_short
    + ['unanswered'] * n_unanswered
)
random.shuffle(call_types_pool)
combined_sampled_df['Call Type'] = call_types_pool

# === Prepare for Assignment ===
# Start every detail column as an empty string; they are filled in afterwards.
for blank_column in ('Duration', 'Dialer Status', 'Dialer Comments', 'Question', 'Resolution'):
    combined_sampled_df[blank_column] = ''

# Weighted question pool
# Each question gets a random weight, then the weights are scaled to sum to 1.
all_questions = enquiry_questions + complaint_questions
weights = [random.uniform(0.5, 2.0) for _ in all_questions]
total_weight = sum(weights)
normalized_weights = [weight / total_weight for weight in weights]

# === Populate Call Data ===
# Build every column as a plain Python list and write it back once.
# The earlier version updated the frame cell-by-cell through `.at` while
# iterating it with `iterrows()`, which is slow on a frame of this size and
# modifies the object being iterated. The random draws below are made in
# exactly the same order, so the generated values are unchanged.
assigned_types = combined_sampled_df['Call Type'].tolist()
call_type_col = list(assigned_types)
duration_col = combined_sampled_df['Duration'].tolist()
status_col = combined_sampled_df['Dialer Status'].tolist()
comment_col = combined_sampled_df['Dialer Comments'].tolist()
question_col = combined_sampled_df['Question'].tolist()
resolution_col = combined_sampled_df['Resolution'].tolist()

for pos, call_type in enumerate(assigned_types):
    if call_type == 'full_flow':
        # A full-flow call becomes a Complaint with probability 0.23,
        # otherwise an Enquiry.
        is_complaint = random.random() < 0.23
        call_type_col[pos] = 'Complaint' if is_complaint else 'Enquiry'
        duration_col[pos] = format_duration((180, 720))
        comment_col[pos] = 'Call escalate' if is_complaint else random.choice(enquiry_comments)
        # The question is drawn from the combined pool for both call kinds.
        question_col[pos] = random.choices(all_questions, weights=normalized_weights, k=1)[0]
        resolution_col[pos] = 'Escalation needed' if is_complaint else random.choice(enquiry_resolutions)
        status_col[pos] = 'ANSWERED'
    elif call_type == 'short':
        # Answered, but at most two minutes long; no question or resolution.
        duration_col[pos] = format_duration((0, 120))
        comment_col[pos] = random.choice(short_comments)
        status_col[pos] = 'ANSWERED'
    elif call_type == 'unanswered':
        # Zero duration; the comment is determined by the dialer status.
        status = random.choice(unanswered_statuses)
        status_col[pos] = status
        duration_col[pos] = '00:00'
        comment_col[pos] = unanswered_comments_map[status]

# Rows with any other call type keep the values they already had.
combined_sampled_df['Call Type'] = call_type_col
combined_sampled_df['Duration'] = duration_col
combined_sampled_df['Dialer Status'] = status_col
combined_sampled_df['Dialer Comments'] = comment_col
combined_sampled_df['Question'] = question_col
combined_sampled_df['Resolution'] = resolution_col

# === Final Touches ===
# One call timestamp per row, derived from that row's chat date.
combined_sampled_df['Call DateTime'] = combined_sampled_df['Chat Date'].map(generate_call_datetime_skewed)

# Columns written to the workbook, in this order.
output_columns = [
    'Name', 'Mobile Number', 'Age Group', 'Gender', 'Language',
    'Canton', 'District', 'Scheme', 'Issue', 'Chat Date',
    'Dialer Status', 'Duration', 'Dialer Comments', 'Call Type',
    'Question', 'Resolution', 'Call DateTime', 'Agent Name', 'Data From',
]
final_df = combined_sampled_df[output_columns]

# === Export to Excel ===
final_df.to_excel(
    output_file,
    index=False,
)
print("✅ Output saved to:", output_file)


✅ Output saved to: C:\Users\hp\Desktop\Github\Power-BI\1 ChatBot Journey Insights and Performance Dashboard\1 RAW DATA\internal_call_center_data.xlsx


## MLA office

In [14]:
import pandas as pd
import random
from datetime import datetime, timedelta
from faker import Faker

# ----------------------------
# CONFIGURATION
# ----------------------------
INPUT_FILE = r"C:\Users\hp\Desktop\Github\Power-BI\1 ChatBot Journey Insights and Performance Dashboard\2 DAILY DATA CLEAN PROCESS CHAT BOT DATA\output_file_final.xlsx"
OUTPUT_FILE = r"C:\Users\hp\Desktop\Github\Power-BI\1 ChatBot Journey Insights and Performance Dashboard\1 RAW DATA\MLA_call_center_data.xlsx"
REQUIRED_COLUMNS = ['Name', 'Mobile Number', 'Age Group', 'Language', 'Gender', 'Canton', 'District', 'Scheme', 'Issue', 'Chat Date','Chat Time']
# Share of the matching rows that is kept for this data set.
SAMPLE_FRACTION = 0.48

# ----------------------------
# LOAD AND VALIDATE DATA
# ----------------------------
# sheet_name=None loads every sheet of the workbook; they are stacked into one frame.
all_sheets = pd.read_excel(INPUT_FILE, sheet_name=None)
combined_df = pd.concat(all_sheets.values(), ignore_index=True)

for col in REQUIRED_COLUMNS:
    if col not in combined_df.columns:
        raise ValueError(f"Missing required column: {col}")
# The filter below needs this column too; checking it here gives a clear
# ValueError instead of a KeyError, as the internal call center cell does.
if 'Call center' not in combined_df.columns:
    raise ValueError("Missing 'Call center' column")

# Keep only rows routed to the MLA/party office (case-insensitive match),
# then take a reproducible random sample of them.
filtered_df = combined_df[combined_df['Call center'].str.lower() == 'mla/party office']
sampled_df = filtered_df.sample(frac=SAMPLE_FRACTION, random_state=42).reset_index(drop=True)

# ----------------------------
# SPLIT DATA INTO CATEGORIES
# ----------------------------
# 36% long, 25% short, 21% other answered; the remainder is unanswered.
total = len(sampled_df)
long_count, short_count, other_count = (
    round(total * share) for share in (0.36, 0.25, 0.21)
)
unanswered_count = total - long_count - short_count - other_count

# Consecutive, non-overlapping positional slices of the sampled rows.
short_start = long_count
other_start = short_start + short_count
unanswered_start = other_start + other_count

long_df = sampled_df.iloc[:short_start].copy()
short_df = sampled_df.iloc[short_start:other_start].copy()
other_df = sampled_df.iloc[other_start:unanswered_start].copy()
unanswered_df = sampled_df.iloc[unanswered_start:].copy()

# ----------------------------
# HELPERS
# ----------------------------
def random_duration(min_sec, max_sec):
    """Return a random duration between the two bounds, formatted ``MM:SS``.

    Args:
        min_sec: Shortest duration in seconds (inclusive).
        max_sec: Longest duration in seconds (inclusive).
    """
    drawn_seconds = random.randint(min_sec, max_sec)
    return f"{drawn_seconds // 60:02d}:{drawn_seconds % 60:02d}"

def random_time_window():
    """Return a uniformly random time of day from 09:00:00 to 18:00:00 inclusive."""
    # Only the time-of-day part is returned; the date used here is a placeholder.
    opening = datetime(1900, 1, 1, 9, 0, 0)
    window_seconds = 9 * 60 * 60  # seconds between 09:00 and 18:00
    offset = timedelta(seconds=random.randint(0, window_seconds))
    return (opening + offset).time()

def generate_call_datetime(base_date):
    """Return a call timestamp string 1 to 10 days after ``base_date``.

    Args:
        base_date: Object with a ``.date()`` method; only its date part is
            used. The time of day comes from ``random_time_window``.

    Returns:
        The call moment formatted as ``DD.MM.YYYY HH:MM:SS``.
    """
    time_of_day = random_time_window()
    delay = timedelta(days=random.randint(1, 10))
    call_moment = datetime.combine(base_date.date(), time_of_day) + delay
    return call_moment.strftime("%d.%m.%Y %H:%M:%S")

# ----------------------------
# QUESTION POOLS
# ----------------------------
# Questions about getting help with an application.
long_questions = [
    "Can someone help me complete the application online?",
    "I’m disabled—can someone help me fill out the form?",
    "Is there a special scheme or priority support for disabled individuals?",
    "Can someone come to my home and help with the application?",
    "I can't read or type—how can I apply?",
    "I have a disability certificate—how do I use it for support?",
    "Are there accessible versions of the forms (large print, audio, etc)?",
    "Can my caretaker or relative apply on my behalf?",
    "Will I get any extra benefits or support because of my disability?",
    "Is there a toll-free number I can call for help with this?",
    "Do I need to upload my disability proof online, or can I submit it later?",
]
# Questions about website problems and pending applications.
short_questions = [
    "Why is the website not opening on my phone/computer?",
    "I can't log in to my account—what should I do?",
    "The form is not submitting even after I fill everything—why?",
    "I uploaded my documents but it says “error”—what does that mean?",
    "Is there a simpler version of the site for mobile users?",
    "The page keeps loading and nothing happens—what’s wrong?",
    "Why does the website keep logging me out automatically?",
    "I'm not receiving the OTP or confirmation email—what can I do?",
    "Why is my application still pending after so many days?",
    "I didn’t receive any approval SMS or update—what should I do?",
]
question_pool = long_questions + short_questions
# One weight per entry of question_pool, in pool order:
# five 10s, five 6s, six 4s and five 2s (21 in total).
question_weights = [
    weight
    for weight, repeats in ((10, 5), (6, 5), (4, 6), (2, 5))
    for _ in range(repeats)
]

# ----------------------------
# LONG CALLS
# ----------------------------
long_comments = [
    "Complete solve",
    "Already resolved",
    "Money received",
    "Lack of documents",
    "Complaint solved",
    "Issue already solved",
    "Call back 2 PM",
    "Call back 5 PM",
    "Call back 6 PM",
    "Problem solved",
]
n_long = len(long_df)
# Answered calls lasting 3 to 12 minutes that end with a resolution.
long_df['Dialer Status'] = 'ANSWERED'
long_df['Duration'] = [random_duration(180, 720) for _ in range(n_long)]
long_df['Dialer Comments'] = random.choices(long_comments, k=n_long)
long_df['Call Type'] = random.choices(["Enquiry", "Others"], k=n_long)
long_df['Question'] = random.choices(question_pool, weights=question_weights, k=n_long)
long_df['Resolution'] = ["Resolution Provided" for _ in range(n_long)]

# ----------------------------
# SHORT CALLS
# ----------------------------
n_short = len(short_df)
# Answered calls lasting 3 to 8 minutes; every one of them is escalated.
short_df['Dialer Status'] = 'ANSWERED'
short_df['Duration'] = [random_duration(180, 480) for _ in range(n_short)]
short_df['Dialer Comments'] = ["Call escalate" for _ in range(n_short)]
short_df['Call Type'] = random.choices(["Complaint", "Others"], k=n_short)
short_df['Question'] = random.choices(question_pool, weights=question_weights, k=n_short)
short_df['Resolution'] = ["Follow-up or Escalation Needed" for _ in range(n_short)]

# ----------------------------
# OTHER ANSWERED
# ----------------------------
other_comments = [
    "Voice issue",
    "Customer busy",
    "Hold call back",
    "Driving",
    "Call back busy",
    "Not interested",
    "System issue",
    "Voice break",
    "Wrong number",
    "Call back voice issue",
    "Customer not available",
    "Call back no response",
    "Call back later",
]
n_other = len(other_df)
# Answered calls of 1 to 2 minutes with no call type, question or resolution.
other_df['Dialer Status'] = 'ANSWERED'
other_df['Duration'] = [random_duration(60, 120) for _ in range(n_other)]
other_df['Dialer Comments'] = random.choices(other_comments, k=n_other)
for blank_column in ('Call Type', 'Question', 'Resolution'):
    other_df[blank_column] = ''

# ----------------------------
# UNANSWERED CALLS
# ----------------------------
unanswered_statuses = ['NOT-ANSWERED', 'MISSED', 'ABANDONED']
# Dialer status -> the comment recorded with it.
unanswered_comments = {
    'NOT-ANSWERED': 'No response',
    'MISSED': 'Call disconnected',
    'ABANDONED': 'Call back no response',
}
statuses = random.choices(unanswered_statuses, k=len(unanswered_df))
unanswered_df['Dialer Status'] = statuses
unanswered_df['Duration'] = '00:00'
# Assigned as a list (by position) so each comment lines up with its status.
unanswered_df['Dialer Comments'] = [unanswered_comments[drawn] for drawn in statuses]
for blank_column in ('Call Type', 'Question', 'Resolution'):
    unanswered_df[blank_column] = ''

# ----------------------------
# FINAL COMBINATION
# ----------------------------
# Stack the four categories back together with a fresh 0..n-1 index.
category_frames = [long_df, short_df, other_df, unanswered_df]
final_df = pd.concat(category_frames, ignore_index=True)

# Generate Timestamp and Call DateTime
# Chat date and chat time are joined as text and parsed into one timestamp;
# the call timestamp is then derived from it.
chat_moment_text = final_df['Chat Date'].astype(str) + ' ' + final_df['Chat Time'].astype(str)
final_df['Timestamp'] = pd.to_datetime(chat_moment_text)
final_df['Call DateTime'] = final_df['Timestamp'].apply(generate_call_datetime)

# Generate Agents
# Ten agent ids SWZ001..SWZ010, each suffixed with a Faker first name.
fake = Faker()
agents = [f"SWZ{agent_no:03d}_{fake.first_name()}" for agent_no in range(1, 11)]
# Descending weights 10..1: earlier agents receive more calls.
agent_weights = list(range(10, 0, -1))
final_df['Agent Name'] = random.choices(agents, weights=agent_weights, k=len(final_df))

# Final Columns
# Columns written to the workbook, in this order ('Timestamp' is not exported).
output_columns = [
    'Name', 'Mobile Number', 'Age Group', 'Gender', 'Language',
    'Canton', 'District', 'Scheme', 'Issue', 'Chat Date', 'Chat Time',
    'Dialer Status', 'Duration', 'Dialer Comments', 'Call Type',
    'Question', 'Resolution', 'Call DateTime', 'Agent Name',
]
final_df = final_df[output_columns]

# ----------------------------
# SAVE TO EXCEL
# ----------------------------
final_df.to_excel(
    OUTPUT_FILE,
    index=False,
)
print("✅ Output saved with varied call data, questions, agents, and timestamps.")


✅ Output saved with varied call data, questions, agents, and timestamps.


## Division office

In [15]:
import pandas as pd
import random
from datetime import datetime, timedelta
from faker import Faker

# ----------------------------
# CONFIGURATION
# ----------------------------
INPUT_FILE = r"C:\Users\hp\Desktop\Github\Power-BI\1 ChatBot Journey Insights and Performance Dashboard\2 DAILY DATA CLEAN PROCESS CHAT BOT DATA\output_file_final.xlsx"
OUTPUT_FILE = r"C:\Users\hp\Desktop\Github\Power-BI\1 ChatBot Journey Insights and Performance Dashboard\1 RAW DATA\Division_call_center_data.xlsx"
REQUIRED_COLUMNS = ['Name', 'Mobile Number', 'Age Group', 'Language', 'Gender', 'Canton', 'District', 'Scheme', 'Issue', 'Chat Date', 'Chat Time']
# Share of the matching rows that is kept for this data set.
SAMPLE_FRACTION = 0.56

# ----------------------------
# LOAD AND VALIDATE DATA
# ----------------------------
# sheet_name=None loads every sheet of the workbook; they are stacked into one frame.
all_sheets = pd.read_excel(INPUT_FILE, sheet_name=None)
combined_df = pd.concat(all_sheets.values(), ignore_index=True)

for col in REQUIRED_COLUMNS:
    if col not in combined_df.columns:
        raise ValueError(f"Missing required column: {col}")
# The filter below needs this column too; checking it here gives a clear
# ValueError instead of a KeyError, as the internal call center cell does.
if 'Call center' not in combined_df.columns:
    raise ValueError("Missing 'Call center' column")

# Keep only rows routed to the division office (case-insensitive match),
# then take a reproducible random sample of them.
filtered_df = combined_df[combined_df['Call center'].str.lower() == 'division office']
sampled_df = filtered_df.sample(frac=SAMPLE_FRACTION, random_state=42).reset_index(drop=True)

# ----------------------------
# SPLIT DATA INTO CATEGORIES
# ----------------------------
# 36% long, 25% short, 21% other answered; the remainder is unanswered.
actual_total = len(sampled_df)
long_count = round(actual_total * 0.36)
short_count = round(actual_total * 0.25)
other_answered_count = round(actual_total * 0.21)
answered_total = long_count + short_count + other_answered_count
unanswered_count = actual_total - answered_total

# Positions at which one category ends and the next begins.
long_end = long_count
short_end = long_end + short_count
other_end = short_end + other_answered_count

long_call_df = sampled_df.iloc[:long_end].copy()
short_call_df = sampled_df.iloc[long_end:short_end].copy()
other_call_df = sampled_df.iloc[short_end:other_end].copy()
unanswered_df = sampled_df.iloc[other_end:].copy()

# ----------------------------
# HELPERS
# ----------------------------
def random_duration(min_sec, max_sec):
    """Draw a duration in seconds from ``[min_sec, max_sec]`` and return it as ``MM:SS``."""
    picked = random.randint(min_sec, max_sec)
    whole_minutes = picked // 60
    leftover_seconds = picked - whole_minutes * 60
    return f"{whole_minutes:02d}:{leftover_seconds:02d}"

def random_time_window():
    """Return a uniformly random time of day from 09:00:00 to 18:00:00 inclusive."""
    window_start = 9 * 3600   # 09:00:00 as seconds after midnight
    window_end = 18 * 3600    # 18:00:00 as seconds after midnight
    since_midnight = window_start + random.randint(0, window_end - window_start)
    hours, remainder = divmod(since_midnight, 3600)
    minutes, seconds = divmod(remainder, 60)
    # Only the time-of-day part is returned; the date used here is a placeholder.
    return datetime(1900, 1, 1, hours, minutes, seconds).time()

def generate_call_datetime(date_str, time_str):
    """Return a call timestamp string 3 to 5 days after the chat date.

    Args:
        date_str: Chat date; joined with ``time_str`` and parsed by
            ``pd.to_datetime``.
        time_str: Chat time. It takes part in the parse, but only the date
            part of the parsed value is used.

    Returns:
        The call moment formatted as ``DD.MM.YYYY HH:MM:SS``, with a time of
        day taken from ``random_time_window``.
    """
    chat_moment = pd.to_datetime(f"{date_str} {time_str}")
    time_of_day = random_time_window()
    delay = timedelta(days=random.randint(3, 5))
    call_moment = datetime.combine(chat_moment.date(), time_of_day) + delay
    return call_moment.strftime("%d.%m.%Y %H:%M:%S")

# ----------------------------
# LONG CALLS
# ----------------------------
long_questions = [
    "I submitted the wrong form—can I reapply or edit it?",
    "I completed the process 10 days ago but haven’t received any SMS—what’s the delay?",
    "My neighbors got the approval message but I haven’t received anything—am I rejected?",
    "Is there a way to manually check my approval status if I didn’t get an SMS?",
    "Can the approval SMS be resent if I deleted it or didn’t get it?",
    "What number or sender name should I look for in the approval SMS?",
    "The online form is not opening—can you help me with an alternative?",
    "My form was rejected but I didn’t understand the reason—can you explain?",
    "Which documents are mandatory while submitting the form?",
    "The portal says ‘invalid format’—what should I change in the form?",
]
long_comments = [
    "Complete solve",
    "Already resolved",
    "Money received",
    "Lack of documents",
    "Complaint solved",
    "Issue already solved",
    "Call back 2 PM",
    "Call back 5 PM",
    "Call back 6 PM",
    "Problem solved",
]

n_long = len(long_call_df)
# Answered calls lasting 3 to 12 minutes that end with a resolution.
long_call_df['Dialer Status'] = 'ANSWERED'
long_call_df['Duration'] = [random_duration(180, 720) for _ in range(n_long)]
long_call_df['Dialer Comments'] = random.choices(long_comments, k=n_long)
long_call_df['Call Type'] = random.choices(["Enquiry", "Others"], k=n_long)

# Weighted random distribution for questions
# One weight per entry of long_questions, in list order.
long_question_weights = [0.1, 0.2, 0.1, 0.15, 0.05, 0.05, 0.1, 0.1, 0.05, 0.1]
long_call_df['Question'] = random.choices(long_questions, weights=long_question_weights, k=n_long)
long_call_df['Resolution'] = ["resolution provided" for _ in range(n_long)]

# ----------------------------
# SHORT CALLS
# ----------------------------
# Questions about being asked for bribes and how to report it.
short_questions = [
    "An official asked me for money to approve my application—what should I do?",
    "I was told I need to pay extra to get faster approval. Is that allowed?",
    "How can I report a staff member who demanded a bribe?",
    "Is there a helpline or email to report corruption confidentially?",
    "What action will be taken if someone is found asking for bribes?",
]
n_short = len(short_call_df)
# Answered calls lasting 3 to 12 minutes; every one of them is escalated.
short_call_df['Dialer Status'] = 'ANSWERED'
short_call_df['Duration'] = [random_duration(180, 720) for _ in range(n_short)]
short_call_df['Dialer Comments'] = ["Call escalate" for _ in range(n_short)]
short_call_df['Call Type'] = random.choices(["Complaint", "Others"], k=n_short)
short_call_df['Question'] = random.choices(short_questions, k=n_short)
short_call_df['Resolution'] = ["Follow-up or Escalation Needed" for _ in range(n_short)]

# ----------------------------
# OTHER ANSWERED CALLS
# ----------------------------
other_comments = [
    "Voice issue",
    "Customer busy",
    "Hold call back",
    "Driving",
    "Call back busy",
    "Not interested",
    "System issue",
    "Voice break",
    "Wrong number",
    "Call back voice issue",
    "Customer not available",
    "Call back no response",
    "Call back",
]
n_other = len(other_call_df)
# Answered calls of 30 seconds to 2 minutes with no call type, question or resolution.
other_call_df['Dialer Status'] = 'ANSWERED'
other_call_df['Duration'] = [random_duration(30, 120) for _ in range(n_other)]
other_call_df['Dialer Comments'] = random.choices(other_comments, k=n_other)
for blank_column in ('Call Type', 'Question', 'Resolution'):
    other_call_df[blank_column] = ''

# ----------------------------
# UNANSWERED CALLS
# ----------------------------
unanswered_statuses = ['NOT-ANSWERED', 'MISSED', 'ABANDONED']
# Dialer status -> the comment recorded with it.
unanswered_comments = {
    'NOT-ANSWERED': 'No response',
    'MISSED': 'Call disconnected',
    'ABANDONED': 'Call back no response',
}
statuses = random.choices(unanswered_statuses, k=len(unanswered_df))
unanswered_df['Dialer Status'] = statuses
unanswered_df['Duration'] = '00:00'
# Assigned as a list (by position) so each comment lines up with its status.
unanswered_df['Dialer Comments'] = [unanswered_comments[drawn] for drawn in statuses]
for blank_column in ('Call Type', 'Question', 'Resolution'):
    unanswered_df[blank_column] = ''

# ----------------------------
# AGENT DISTRIBUTION (Varied counts)
# ----------------------------
# Ten agent ids SWZ001..SWZ010, each suffixed with a Faker first name.
fake = Faker()
agents = [f"SWZ{agent_no:03d}_{fake.first_name()}" for agent_no in range(1, 11)]

# Randomly assign agents with varied counts
# Each agent appears 1 to 10 times in the pool, so a uniform draw from the
# pool favours agents with more entries.
agent_counts = [random.randint(1, 10) for _ in range(10)]
agent_pool = [
    agent
    for agent, copies in zip(agents, agent_counts)
    for _ in range(copies)
]

# Assign agents
# Same order as the categories are combined afterwards: long, short, other, unanswered.
for category_df in (long_call_df, short_call_df, other_call_df, unanswered_df):
    category_df['Agent Name'] = random.choices(agent_pool, k=len(category_df))

# ----------------------------
# COMBINE AND FINALIZE DATA
# ----------------------------
# Stack the four categories back together with a fresh 0..n-1 index.
category_frames = [long_call_df, short_call_df, other_call_df, unanswered_df]
final_df = pd.concat(category_frames, ignore_index=True)


def _call_datetime_for_row(record):
    # Derive the call timestamp from this row's chat date and chat time.
    return generate_call_datetime(record['Chat Date'], record['Chat Time'])


final_df['Call DateTime'] = final_df.apply(_call_datetime_for_row, axis=1)

# Add final column order
output_columns = [
    'Name', 'Mobile Number', 'Age Group', 'Gender', 'Language',
    'Canton', 'District', 'Scheme', 'Issue', 'Chat Date', 'Chat Time',
    'Dialer Status', 'Duration', 'Dialer Comments', 'Call Type',
    'Question', 'Resolution', 'Call DateTime', 'Agent Name',
]
final_df = final_df[output_columns]

# ----------------------------
# SAVE TO EXCEL
# ----------------------------
final_df.to_excel(
    OUTPUT_FILE,
    index=False,
)
print("✅ Output saved with categorized data and Agent Name column.")


✅ Output saved with categorized data and Agent Name column.
