In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

# Read the raw training and test tables from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Independent copy of the test identifiers, taken before `test` is modified.
test_ids = test['ID'].copy()

In [5]:
# Columns that are coerced to numbers and median-imputed.
numerical_cols = [
    'electricity_kwh_per_month',
    'natural_gas_therms_per_month',
    'vehicle_miles_per_month',
    'house_area_sqft',
    'water_usage_liters_per_day',
    'public_transport_usage_per_week',
    'household_size',
    'home_insulation_quality',
    'meat_consumption_kg_per_week',
    'laundry_loads_per_week',
]

# Flag columns that are mode-imputed.
binary_cols = [
    'recycles_regularly',
    'composts_organic_waste',
    'uses_solar_panels',
    'energy_efficient_appliances',
    'smart_thermostat_installed',
    'owns_pet',
]

In [6]:
# Numeric columns: anything unparsable becomes NaN, then NaNs in BOTH frames
# are replaced by the median of the training column, so no statistic is
# taken from the test set.
for feature in numerical_cols:
    train_values, test_values = (
        pd.to_numeric(frame[feature], errors='coerce') for frame in (train, test)
    )
    train_median = train_values.median()
    train[feature] = train_values.fillna(train_median)
    test[feature] = test_values.fillna(train_median)

# Flag columns: fill gaps in both frames with the most frequent training value.
# The mode is computed once; filling the training column with its own mode
# cannot change which value is the mode.
for flag in binary_cols:
    most_frequent = train[flag].mode()[0]
    train[flag] = train[flag].fillna(most_frequent)
    test[flag] = test[flag].fillna(most_frequent)

In [7]:
# Quantities for which negative readings are floored at zero.
non_negative_cols = [
    'electricity_kwh_per_month',
    'natural_gas_therms_per_month',
    'water_usage_liters_per_day',
    'meat_consumption_kg_per_week',
    'laundry_loads_per_week',
    'vehicle_miles_per_month',
    'public_transport_usage_per_week',
]

# Apply identical bounds to the training and test frames.
for frame in (train, test):
    for feature in non_negative_cols:
        frame[feature] = frame[feature].clip(lower=0)
    # Insulation quality is confined to [0, 7]; the upper bound is presumably
    # the top of the rating scale — confirm against the data dictionary.
    frame['home_insulation_quality'] = frame['home_insulation_quality'].clip(lower=0, upper=7)

# Labels accepted for each categorical column; anything else (including
# missing values) is replaced by a fallback label learned from the training set.
valid_heating_types = ['electric', 'gas', 'none']
valid_diet_types = ['omnivore', 'vegetarian', 'vegan']

for cat_col, allowed in (('heating_type', valid_heating_types),
                         ('diet_type', valid_diet_types)):
    train_is_valid = train[cat_col].isin(allowed)

    # The fallback is the most frequent *valid* training label. Taking the mode
    # of the raw column could pick an invalid label when garbage is the most
    # common value, and computing it once avoids re-scanning the column for
    # every invalid row.
    valid_modes = train.loc[train_is_valid, cat_col].mode()
    fallback = valid_modes.iloc[0] if len(valid_modes) else allowed[0]

    # Keep valid entries, substitute the fallback everywhere else.
    train[cat_col] = train[cat_col].where(train_is_valid, fallback)
    test[cat_col] = test[cat_col].where(test[cat_col].isin(allowed), fallback)

# Derived features, added in place to both the training and the test frame.
for df in [train, test]:
    # household_size is used as a divisor. Clamp it to at least one occupant so
    # that a zero (or negative) entry cannot produce inf or sign-flipped ratios;
    # infinite values would be rejected by StandardScaler further down.
    occupants = df['household_size'].clip(lower=1)
    df['electricity_per_person'] = df['electricity_kwh_per_month'] / occupants
    df['water_per_person'] = df['water_usage_liters_per_day'] / occupants
    # The +1 keeps the denominator non-zero when insulation quality is 0.
    df['energy_inefficiency'] = df['house_area_sqft'] / (df['home_insulation_quality'] + 1)

In [8]:
# One-hot encode the categorical columns with a level set shared by both frames.
#
# Encoding each frame independently with drop_first=True drops whichever level
# sorts first *in that frame*. If the test set lacks a level that the training
# set has, a different level is dropped and the later zero-filled reindex
# silently mis-encodes rows. Giving both frames the same categorical dtype
# (levels taken from the training column) makes get_dummies emit the same
# columns and drop the same reference level in both.
categorical_cols = ['heating_type', 'diet_type']
for cat_col in categorical_cols:
    train[cat_col] = train[cat_col].astype('category')
    # Test labels not seen in training become NaN and encode as all zeros.
    test[cat_col] = test[cat_col].astype(train[cat_col].dtype)

train = pd.get_dummies(train, columns=categorical_cols, drop_first=True)
test = pd.get_dummies(test, columns=categorical_cols, drop_first=True)

# Model inputs are every training column except the identifier and the target.
feature_cols = train.columns.drop(['ID', 'carbon_footprint']).tolist()
# Put the test columns in exactly the training order; a column absent from
# test is created and filled with 0.
test = test.reindex(columns=feature_cols, fill_value=0)

# Extend the numeric column list with the engineered ratio features so they
# are standardised together with the raw measurements.
engineered_cols = ['electricity_per_person', 'water_per_person', 'energy_inefficiency']
numerical_cols = [*numerical_cols, *engineered_cols]

# Learn mean/variance from the training frame only, then apply the same
# transform to both frames.
# NOTE(review): the scaler sees every training row; if a hold-out split is
# taken afterwards, its statistics include the validation rows — confirm that
# this is acceptable.
scaler = StandardScaler()
scaler.fit(train[numerical_cols])
train[numerical_cols] = scaler.transform(train[numerical_cols])
test[numerical_cols] = scaler.transform(test[numerical_cols])

# Target vector and feature matrix (identifier and target removed).
y = train['carbon_footprint']
X = train.drop(columns=['ID', 'carbon_footprint'])

# Hold out 20% of the rows for validation; the seed is fixed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Gradient-boosted regressor; hyperparameters are gathered in one place.
xgb_params = {
    'n_estimators': 200,
    'learning_rate': 0.05,
    'max_depth': 6,
    'random_state': 42,
}
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Score the held-out rows; the scaled score is 100 * R² floored at zero.
y_pred = model.predict(X_val)
r2 = r2_score(y_val, y_pred)
print(f'Validation R² Score: {r2:.4f}, Scaled Score: {max(0, 100 * r2):.2f}')

# Predict on the test frame as it stands at this point in the notebook.
X_test = test
test_predictions = model.predict(X_test)

# Pair each saved test ID with its prediction and write the result without
# the index column.
submission = test_ids.to_frame(name='ID').assign(carbon_footprint=test_predictions)
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")

Validation R² Score: 0.8965, Scaled Score: 89.65
Submission file created: submission.csv
