In [3]:
import numpy as np
import pandas as pd

# Status message shown once the imports above have completed.
print("Libraries ready for synthetic data generation!")


Libraries ready for synthetic data generation!


# Synthetic Air France dataset generation

## Design Logic

**1- Countries are grouped by region**

- France, Spain, Italy → Domestic

- UK, Germany, Netherlands → EU

- USA → always Intercontinental

**2- Distance is drawn from a realistic range per region; CO₂ and Ticket Price are then derived from that distance.**

**3- SAF Contribution (1/0) follows the behavioral rules listed below.**

SAF contribution depends on:
- loyalty
- spending
- region
- class
- family size
- past SAF record

Final SAF (1 or 0) depends on the combined weighted effect of:

loyalty tier + booking class + destination region + trip distance + add-on behavior + family size + past SAF record.

In [5]:
import numpy as np
import pandas as pd
import os
# Create the output folder up front; exist_ok=True makes this safe to re-run.
os.makedirs("../data", exist_ok=True)
# Export is currently disabled. NOTE: `data` is only built further down in this
# cell, so the line below has to be moved after that point before re-enabling it.
#data.to_csv("../data/synthetic_airfrance_customer_data.csv", index=False)


# Seed NumPy's global generator so every run draws the same dataset.
np.random.seed(42)
n = 10_000  # number of synthetic customers (rows)


# 1. Define categories and mapping

# Each categorical feature is declared as {value: sampling probability}.
# Dict insertion order is preserved, so it also fixes the category order.
country_share = {
    "France": 0.2,
    "Spain": 0.1,
    "Italy": 0.1,
    "UK": 0.15,
    "Germany": 0.15,
    "Netherlands": 0.1,
    "USA": 0.2,
}
loyalty_share = {"Silver": 0.5, "Gold": 0.3, "Platinum": 0.2}
class_share = {"Economy": 0.7, "Premium": 0.2, "Business": 0.1}

countries = list(country_share)
loyalty_tiers = list(loyalty_share)
booking_classes = list(class_share)

# Fixed region mapping (rule-based), assembled one region group at a time
region_map = {
    **dict.fromkeys(["France", "Spain", "Italy"], "Domestic"),
    **dict.fromkeys(["UK", "Germany", "Netherlands"], "EU"),
    "USA": "Intercontinental",
}

# Distance ranges (km), stored as (low, high) per region
distance_ranges = dict(
    Domestic=(200, 1200),
    EU=(600, 3500),
    Intercontinental=(5000, 9000),
)

# 2. Generate base features

# Columns are drawn one after another in exactly this order. Reordering them
# would consume the seeded random stream differently and change the dataset.
base_features = {}
base_features["customer_id"] = range(1, n + 1)
base_features["country_of_origin"] = np.random.choice(
    countries, n, p=list(country_share.values())
)
base_features["loyalty_tier"] = np.random.choice(
    loyalty_tiers, n, p=list(loyalty_share.values())
)
base_features["past_saf_purchase"] = np.random.choice([0, 1], n, p=[0.75, 0.25])
base_features["adds_paid_options"] = np.random.choice([0, 1], n, p=[0.55, 0.45])
base_features["booking_class"] = np.random.choice(
    booking_classes, n, p=list(class_share.values())
)
# randint excludes the upper bound: 1-4 passengers, 2-14 days of stay
base_features["num_passengers"] = np.random.randint(1, 5, n)
base_features["length_of_stay_days"] = np.random.randint(2, 15, n)

data = pd.DataFrame(base_features)

# 3. Destination region & distance

# Look up each customer's region from their country via region_map.
data["destination_region"] = data["country_of_origin"].map(region_map)


def generate_distance(region):
    """Draw one random integer flight distance (km) for ``region``.

    The (low, high) bounds come from ``distance_ranges``; the draw includes
    ``low`` and excludes ``high``. Raises ``KeyError`` for an unknown region.
    """
    return np.random.randint(*distance_ranges[region])


# One independent draw per row, taken in row order from the seeded stream.
data["flight_distance_km"] = data["destination_region"].apply(generate_distance)

# 4. Ticket price & CO₂

def calc_ticket_price(row):
    """Return a ticket price in EUR, rounded to 2 decimals, for one booking.

    ``row`` must provide ``"flight_distance_km"`` and ``"booking_class"``.
    The price is the distance times a per-km rate for the class, plus a
    random surcharge drawn uniformly between 50 and 150.
    Raises ``KeyError`` if the booking class has no rate.
    """
    rate_per_km = {"Economy": 0.15, "Premium": 0.25, "Business": 0.4}
    distance_km = row["flight_distance_km"]
    distance_component = distance_km * rate_per_km[row["booking_class"]]
    # The surcharge is drawn after the base is computed, one draw per call.
    surcharge = np.random.uniform(50, 150)
    return round(distance_component + surcharge, 2)

# Priced row by row because every ticket draws its own random surcharge.
data["ticket_price_eur"] = data.apply(calc_ticket_price, axis=1)
# CO₂ column: 0.09 per km of flight distance, rounded to 2 decimals.
data["co2_emission_kg"] = (data["flight_distance_km"] * 0.09).round(2)

# 5. Deterministic SAF Contribution (no randomness)

# Additive weight of each categorical level in the SAF score.
loyalty_points = {"Silver": 0.10, "Gold": 0.20, "Platinum": 0.25}
class_points = {"Economy": 0.00, "Premium": 0.15, "Business": 0.25}
region_points = {"Domestic": 0.00, "EU": 0.05, "Intercontinental": 0.10}

# Every score term, listed in the order in which it is accumulated below.
score_terms = [
    data["loyalty_tier"].map(loyalty_points),        # loyalty impact
    data["booking_class"].map(class_points),         # class impact
    data["destination_region"].map(region_points),   # region impact
    0.10 * data["adds_paid_options"],                # add-ons impact
    0.20 * data["past_saf_purchase"],                # past SAF record
    -0.05 * (data["num_passengers"] > 3),            # family size penalty
    # Distance factor: linear ramp from 0.00 at 200 km up to 0.10 at 9000 km
    np.interp(data["flight_distance_km"], [200, 9000], [0.00, 0.10]),
]

# Accumulate into a plain float array, one term at a time.
score = np.zeros(n)
for term in score_terms:
    score += term

# Clamp between 0 and 1
score = score.clip(0, 1)

# Final deterministic SAF contribution: 1 only when the score is strictly
# greater than 0.5, otherwise 0.
is_contributor = score > 0.5
data["saf_contribution"] = is_contributor.astype(int)


# 6. Final output

# Store the distance column with an integer dtype.
data["flight_distance_km"] = data["flight_distance_km"].astype(int)
print(" Dterministic Air France dataset created successfully!")
# Preview the first 20 generated rows.
data.head(20)


# QUICK EXPLANATION

# Factor                | Type / Meaning                  | Effect on SAF Score
# Loyalty tier           main eco-awareness indicator        up to +0.25
# Booking class          income / corporate sensitivity      up to +0.25
# Destination region     longer flights = more awareness     up to +0.10
# Add-ons                shows spending tendency             +0.10 if yes
# Past SAF record        strongest loyalty signal            +0.20 if yes
# Family size (>3)       price-sensitive customers           −0.05
# Distance               longer trips slightly help          +0.00 → +0.10

# Final Decision Rule:
#   If total weighted score > 0.5 → saf_contribution = 1
#   Else (score ≤ 0.5) → saf_contribution = 0



 Dterministic Air France dataset created successfully!


Unnamed: 0,customer_id,country_of_origin,loyalty_tier,past_saf_purchase,adds_paid_options,booking_class,num_passengers,length_of_stay_days,destination_region,flight_distance_km,ticket_price_eur,co2_emission_kg,saf_contribution
0,1,Italy,Silver,0,1,Economy,1,14,Domestic,603,174.03,54.27,0
1,2,USA,Silver,0,0,Economy,4,9,Intercontinental,8740,1426.02,786.6,0
2,3,Netherlands,Silver,0,1,Economy,2,8,EU,2154,437.46,193.86,0
3,4,Germany,Gold,0,0,Economy,4,4,EU,1236,306.53,111.24,0
4,5,France,Silver,0,1,Economy,2,12,Domestic,677,191.47,60.93,0
5,6,France,Platinum,0,1,Economy,2,12,Domestic,589,148.83,53.01,0
6,7,France,Silver,1,1,Economy,1,3,Domestic,1023,247.37,92.07,0
7,8,USA,Gold,0,0,Economy,3,12,Intercontinental,8280,1347.22,745.2,0
8,9,Germany,Gold,0,1,Economy,1,3,EU,1238,238.19,111.42,0
9,10,Netherlands,Gold,0,0,Economy,4,13,EU,2089,418.09,188.01,0
