In [1]:
import pandas as pd
import numpy as np

# ---------------------------------------------------------------------------
# Synthetic carbon-footprint dataset
#
# Builds a table of randomly generated lifestyle features, derives a yearly
# CO2 footprint from them with fixed emission factors, writes the table to
# CSV, and previews the first rows.
# ---------------------------------------------------------------------------

# Sample count and RNG seed. The seed is applied to NumPy's *global* legacy
# generator, so the draws below (and their order) fully determine the data.
n_samples = 2000
np.random.seed(42)


def draw_uniform_ints(low, high):
    """Return ``n_samples`` integers drawn uniformly from ``[low, high)``.

    Thin wrapper around ``np.random.randint`` using the global RNG; note that
    ``high`` is exclusive.
    """
    return np.random.randint(low, high, size=n_samples)


# Lifestyle features. Every column is drawn independently of the others, and
# the draw order (top to bottom) must stay fixed to reproduce the same values.
# Because the upper bound is exclusive, the two percentage columns span 0-99
# and the household size spans 1-5.
# NOTE(review): diet_type and meat_kg_per_year are uncorrelated here, so vegan
# and vegetarian rows can carry a non-zero meat amount; the food-emission step
# below only scales it down by diet.
data = {
    "car_km_per_year": draw_uniform_ints(0, 20000),
    "public_transport_km_per_year": draw_uniform_ints(0, 10000),
    "flights_per_year": draw_uniform_ints(0, 10),
    "electricity_kwh_per_year": draw_uniform_ints(1000, 6000),
    "natural_gas_m3_per_year": draw_uniform_ints(0, 3000),
    "renewable_energy_percentage": draw_uniform_ints(0, 100),
    "diet_type": np.random.choice(
        ["vegetarian", "vegan", "non_vegetarian"],
        n_samples,
        p=[0.3, 0.2, 0.5],
    ),
    "meat_kg_per_year": draw_uniform_ints(0, 100),
    "waste_kg_per_year": draw_uniform_ints(100, 1000),
    "recycling_rate": draw_uniform_ints(0, 100),
    "house_size_m2": draw_uniform_ints(20, 200),
    "num_people_household": draw_uniform_ints(1, 6),
}

df = pd.DataFrame(data)

# Emission factors (kg CO2 per unit of activity).
EF_CAR = 0.2             # per km driven
EF_PUBLIC = 0.05         # per km on public transport
EF_FLIGHT = 250          # per flight
EF_ELECTRICITY = 0.5     # per kWh
EF_NATURAL_GAS = 2       # per m3
EF_MEAT = 27             # per kg of meat
EF_WASTE = 1.8           # per kg of waste

# Multiplier applied to the meat-based food emission for each diet.
DIET_MEAT_FACTOR = {
    "non_vegetarian": 1.0,
    "vegetarian": 0.5,
    "vegan": 0.2,
}

# Transport: distance (or flight count) times the matching factor.
car_emission = df["car_km_per_year"] * EF_CAR
public_emission = df["public_transport_km_per_year"] * EF_PUBLIC
flight_emission = df["flights_per_year"] * EF_FLIGHT

# Household energy: only the non-renewable share of electricity is counted;
# natural gas is counted in full.
non_renewable_share = 1 - df["renewable_energy_percentage"] / 100
electricity_emission = df["electricity_kwh_per_year"] * EF_ELECTRICITY * non_renewable_share
gas_emission = df["natural_gas_m3_per_year"] * EF_NATURAL_GAS

# Food: meat emission scaled by the per-row diet factor.
meat_factor = df["diet_type"].map(DIET_MEAT_FACTOR)
food_emission = df["meat_kg_per_year"] * EF_MEAT * meat_factor

# Waste: only the share that is not recycled is counted.
unrecycled_share = 1 - df["recycling_rate"] / 100
waste_emission = df["waste_kg_per_year"] * EF_WASTE * unrecycled_share

# Total footprint = sum of all components, accumulated in the order listed.
# house_size_m2 and num_people_household do not enter the total.
emission_components = [
    car_emission,
    public_emission,
    flight_emission,
    electricity_emission,
    gas_emission,
    food_emission,
    waste_emission,
]
df["carbon_footprint_kgCO2_per_year"] = sum(emission_components)

# Persist the dataset (features + target) without the row index.
df.to_csv("synthetic_carbon_footprint.csv", index=False)

print("Dataset generated and saved as synthetic_carbon_footprint.csv")
df.head()


Dataset generated and saved as synthetic_carbon_footprint.csv


Unnamed: 0,car_km_per_year,public_transport_km_per_year,flights_per_year,electricity_kwh_per_year,natural_gas_m3_per_year,renewable_energy_percentage,diet_type,meat_kg_per_year,waste_kg_per_year,recycling_rate,house_size_m2,num_people_household,carbon_footprint_kgCO2_per_year
0,15795,9917,6,1067,1526,18,vegetarian,49,475,75,181,4,9519.57
1,860,7574,8,4836,1877,76,non_vegetarian,39,154,46,162,2,8087.708
2,5390,1689,5,4993,1699,28,non_vegetarian,94,677,7,116,5,11279.228
3,11964,3267,9,3506,1029,60,non_vegetarian,2,838,53,72,3,8328.298
4,11284,4406,0,2537,499,69,vegan,16,125,8,164,1,4161.735
