In [13]:
!pip install faker==19.6.2 tqdm langcodes -q

In [14]:
# Standard library
import random
import re

# Third-party
import numpy as np
import pandas as pd
from faker import Faker
from tqdm.auto import tqdm

# Hook tqdm into pandas (provides the progress_* variants of apply/map).
tqdm.pandas()

# Relative path: resolved against the notebook's current working directory.
orig_path = "master_airline_dataset.csv"
df = pd.read_csv(filepath_or_buffer=orig_path)

In [15]:
def clean_text(text):
    """Return ``text`` as a string with every non-ASCII character removed.

    Parameters
    ----------
    text : object
        Value to clean. Non-string values are converted with ``str()``.
        Missing values (``None``, ``NaN``, ``pd.NA``, ``NaT``) are treated
        as "no text".

    Returns
    -------
    str
        The ASCII-only text, or ``""`` for a missing value.

    Notes
    -----
    Everything outside ``\\x00-\\x7F`` is dropped, so accented letters are
    deleted rather than transliterated (``"café"`` becomes ``"caf"``).
    """
    # Without this guard str(float("nan")) / str(None) would inject the
    # literal words "nan" / "None" into the column as if they were review text.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""
    return re.sub(r"[^\x00-\x7F]+", "", str(text))
# Strip non-ASCII characters from every review, element by element.
df["customer_review"] = df["customer_review"].map(clean_text)

# Sentence skeletons, keyed by two-letter language code. Each template is
# filled via str.format with the fields {airline}, {adj} and {extra}.
# NOTE(review): adjectives are inserted uninflected, so some combinations
# (e.g. "einen in Ordnung Service") may read unnaturally - confirm this is
# acceptable for the downstream use of the synthetic text.
templates = {
    "es": [
        "El vuelo con {airline} fue {adj}. {extra}",
        "Servicio {adj} a bordo de {airline}. {extra}",
    ],
    "fr": [
        "Le vol avec {airline} était {adj}. {extra}",
        "{airline} offre un service {adj}. {extra}",
    ],
    "nl": [
        "De vlucht met {airline} was {adj}. {extra}",
        "{airline} bood een {adj} ervaring. {extra}",
    ],
    "de": [
        "Der Flug mit {airline} war {adj}. {extra}",
        "{airline} bot einen {adj} Service. {extra}",
    ],
    "it": [
        "Il volo con {airline} è stato {adj}. {extra}",
        "{airline} offre un servizio {adj}. {extra}",
    ],
}

# Candidate values for {adj}; each list runs from most to least favourable.
adjs = {
    "es": [
        "excelente",
        "aceptable",
        "mediocre",
    ],
    "fr": [
        "excellent",
        "correct",
        "médiocre",
    ],
    "nl": [
        "uitstekend",
        "redelijk",
        "matig",
    ],
    "de": [
        "ausgezeichnet",
        "in Ordnung",
        "mittelmäßig",
    ],
    "it": [
        "eccellente",
        "discreto",
        "scarso",
    ],
}

# Candidate closing sentences for {extra}.
extras = {
    "es": [
        "Recomiendo esta aerolínea.",
        "Definitivamente volvería a volar.",
    ],
    "fr": [
        "Je la recommande.",
        "Je voyagerai de nouveau.",
    ],
    "nl": [
        "Ik vlieg graag opnieuw.",
        "Aanrader!",
    ],
    "de": [
        "Würde wieder fliegen.",
        "Kann ich empfehlen.",
    ],
    "it": [
        "La consiglio.",
        "Volerei di nuovo.",
    ],
}

# Airline names that synthetic rows are drawn from.
airlines_pool = [
    "Lufthansa",
    "Air France",
    "KLM",
    "Emirates",
    "Qatar Airways",
]

# How many synthetic rows to create per language (insertion order is the
# order in which the languages are generated).
lang_counts = {
    "es": 36_594,
    "fr": 22_389,
    "nl": 19_284,
    "de": 46_783,
    "it": 27_895,
}

# Seed both random sources so that re-running the notebook regenerates the
# same synthetic rows. The timestamps still vary between runs because their
# range ends at 'now'.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

fake = Faker()


def _make_synthetic_row(lang):
    """Build one synthetic review record for the language code ``lang``.

    Parameters
    ----------
    lang : str
        Key into ``templates``, ``adjs`` and ``extras``.

    Returns
    -------
    dict
        One record with the keys ``airline``, ``customer_review``,
        ``recommended``, ``sentiment``, ``language``, ``user_id``, ``gender``,
        ``age``, ``travel_class``, ``satisfaction``, ``date_time`` and
        ``is_booking``.

    Raises
    ------
    KeyError
        If ``lang`` is missing from ``templates``, ``adjs`` or ``extras``.
    """
    # Draw the airline once so the name inside the review text and the
    # `airline` column always agree.
    airline = random.choice(airlines_pool)
    adj = random.choice(adjs[lang])
    review = random.choice(templates[lang]).format(
        airline=airline,
        adj=adj,
        extra=random.choice(extras[lang]),
    )
    # The first adjective of every language is the positive one. Comparing the
    # chosen adjective directly (instead of searching the review for a few
    # hard-coded substrings) labels all languages correctly, including
    # "eccellente" (it) and "uitstekend" (nl).
    sentiment = 1 if adj == adjs[lang][0] else 0
    # NOTE(review): `recommended` and `satisfaction` are sampled independently
    # of the review text and of `sentiment`; confirm that is intended.
    return {
        "airline": airline,
        "customer_review": review,
        "recommended": random.choice(["yes", "no"]),
        "sentiment": sentiment,
        "language": lang,
        "user_id": f"synth_{lang}_{fake.uuid4()}",
        "gender": random.choice(["Male", "Female"]),
        "age": random.randint(18, 70),
        "travel_class": random.choice(["Economy", "Business", "Premium Economy"]),
        "satisfaction": random.choice(["Satisfied", "Neutral or Dissatisfied"]),
        "date_time": fake.date_time_between(start_date='-3y', end_date='now'),
        "is_booking": random.choices([0, 1], weights=[0.6, 0.4])[0],
    }


# Collect plain dicts and build the DataFrame once at the end; one progress
# bar is shown per language.
synthetic_rows = []
for lang, n_rows in lang_counts.items():
    for _ in tqdm(range(n_rows), desc=f"Generating {lang}"):
        synthetic_rows.append(_make_synthetic_row(lang))

synthetic_df = pd.DataFrame(synthetic_rows)

# Append the synthetic rows after the original ones and renumber the index.
augmented_df = pd.concat([df, synthetic_df], ignore_index=True)

Generating es:   0%|          | 0/36594 [00:00<?, ?it/s]

Generating fr:   0%|          | 0/22389 [00:00<?, ?it/s]

Generating nl:   0%|          | 0/19284 [00:00<?, ?it/s]

Generating de:   0%|          | 0/46783 [00:00<?, ?it/s]

Generating it:   0%|          | 0/27895 [00:00<?, ?it/s]

In [16]:
# Relative path, so the CSV is written into the notebook's current working
# directory. The DataFrame index is not written out.
save_path = "master_airline_dataset_augmented.csv"
augmented_df.to_csv(path_or_buf=save_path, encoding="utf-8", index=False)
print(" Saved at:", save_path)

 Saved at: master_airline_dataset_augmented.csv
