In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df=pd.read_csv('data/train.csv')

In [4]:
df.head()

Unnamed: 0,ID,electricity_kwh_per_month,natural_gas_therms_per_month,vehicle_miles_per_month,house_area_sqft,water_usage_liters_per_day,public_transport_usage_per_week,household_size,home_insulation_quality,meat_consumption_kg_per_week,laundry_loads_per_week,recycles_regularly,composts_organic_waste,uses_solar_panels,energy_efficient_appliances,heating_type,diet_type,owns_pet,smart_thermostat_installed,carbon_footprint
0,0xd6c,759.7,55.95,944.55,2422.07,541.27,1,3,2,4.23,9,1.0,0.0,0,1.0,gas,vegetarian,1,,830.1
1,0x3fdf,387.06,70.59,1280.85,1995.3,280.39,1,2,1,3.27,8,0.0,0.0,0,0.0,electric,vegetarian,0,0.0,963.08
2,0x3b08,594.25,29.14,1005.72,2673.55,416.14,0,2,3,2.87,3,0.0,1.0,0,1.0,electric,omnivore,1,1.0,840.11
3,0x31e5,503.76,74.68,1049.46,2994.28,530.13,0,5,1,3.22,9,1.0,0.0,0,0.0,electric,omnivore,1,0.0,1252.42
4,0x397e,549.54,-77.0,756.49,2549.57,604.1,5,4,4,2.92,2,1.0,0.0,0,1.0,electric,vegetarian,0,1.0,580.74


In [5]:
df.shape

(14000, 20)

### Dataset Columns and Descriptions

| **Column Name**                     | **Description**                                                                                      |
|-------------------------------------|------------------------------------------------------------------------------------------------------|
| `ID`                                | Represents a unique timestamp of an entry.                                                          |
| `electricity_kwh_per_month`         | Monthly electricity consumption in kilowatt-hours (kWh).                                            |
| `natural_gas_therms_per_month`      | Monthly natural gas usage measured in therms.                                                       |
| `vehicle_miles_per_month`           | Total miles driven by household vehicles per month.                                                 |
| `house_area_sqft`                   | Total living area of the household in square feet.                                                  |
| `water_usage_liters_per_day`        | Average daily water consumption in liters.                                                          |
| `public_transport_usage_per_week`   | Number of times public transportation is used per week.                                             |
| `household_size`                    | Number of people living in the household.                                                           |
| `home_insulation_quality`           | Rating of home insulation quality (0 = poor to 7 = excellent).                                      |
| `meat_consumption_kg_per_week`      | Weekly household meat consumption in kilograms.                                                     |
| `laundry_loads_per_week`            | Number of laundry machine loads per week.                                                           |
| `recycles_regularly`                | Whether the household recycles regularly (1 = Yes, 0 = No).                                         |
| `composts_organic_waste`            | Whether the household composts organic waste (1 = Yes, 0 = No).                                     |
| `uses_solar_panels`                 | Whether the household uses solar panels for electricity (1 = Yes, 0 = No).                          |
| `energy_efficient_appliances`       | Whether the household uses energy-efficient appliances (1 = Yes, 0 = No).                           |
| `heating_type`                      | Type of home heating system: `electric`, `gas`, or `none`.                                          |
| `diet_type`                         | Dietary profile of the household: `omnivore`, `vegetarian`, or `vegan`.                             |
| `owns_pet`                          | Whether the household owns pets (1 = Yes, 0 = No).                                                  |
| `smart_thermostat_installed`        | Whether the household uses a smart thermostat (1 = Yes, 0 = No).                                    |
| `carbon_footprint`                  | Estimated monthly carbon footprint in kg CO₂ equivalent.                                            |


In [6]:
df.isnull().sum()

ID                                   0
electricity_kwh_per_month            0
natural_gas_therms_per_month         0
vehicle_miles_per_month              0
house_area_sqft                      0
water_usage_liters_per_day           0
public_transport_usage_per_week      0
household_size                       0
home_insulation_quality              0
meat_consumption_kg_per_week         0
laundry_loads_per_week               0
recycles_regularly                 410
composts_organic_waste             390
uses_solar_panels                    0
energy_efficient_appliances        431
heating_type                         0
diet_type                            0
owns_pet                             0
smart_thermostat_installed         416
carbon_footprint                     0
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14000 entries, 0 to 13999
Data columns (total 20 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ID                               14000 non-null  object 
 1   electricity_kwh_per_month        14000 non-null  float64
 2   natural_gas_therms_per_month     14000 non-null  float64
 3   vehicle_miles_per_month          14000 non-null  float64
 4   house_area_sqft                  14000 non-null  object 
 5   water_usage_liters_per_day       14000 non-null  float64
 6   public_transport_usage_per_week  14000 non-null  int64  
 7   household_size                   14000 non-null  object 
 8   home_insulation_quality          14000 non-null  int64  
 9   meat_consumption_kg_per_week     14000 non-null  float64
 10  laundry_loads_per_week           14000 non-null  int64  
 11  recycles_regularly               13590 non-null  float64
 12  composts_organic_w

In [9]:
df.nunique()

ID                                 14000
electricity_kwh_per_month          11401
natural_gas_therms_per_month        6385
vehicle_miles_per_month            13082
house_area_sqft                    13448
water_usage_liters_per_day         11336
public_transport_usage_per_week       28
household_size                       440
home_insulation_quality               14
meat_consumption_kg_per_week         723
laundry_loads_per_week                 9
recycles_regularly                     2
composts_organic_waste                 2
uses_solar_panels                      2
energy_efficient_appliances            2
heating_type                         431
diet_type                              3
owns_pet                               2
smart_thermostat_installed             2
carbon_footprint                   12478
dtype: int64

In [10]:
df.describe()

Unnamed: 0,electricity_kwh_per_month,natural_gas_therms_per_month,vehicle_miles_per_month,water_usage_liters_per_day,public_transport_usage_per_week,home_insulation_quality,meat_consumption_kg_per_week,laundry_loads_per_week,recycles_regularly,composts_organic_waste,uses_solar_panels,energy_efficient_appliances,owns_pet,smart_thermostat_installed,carbon_footprint
count,14000.0,14000.0,14000.0,14000.0,14000.0,14000.0,14000.0,14000.0,13590.0,13610.0,14000.0,13569.0,14000.0,13584.0,14000.0
mean,389.866107,47.588121,796.046458,385.497657,3.271357,3.003857,3.008954,5.005571,0.706034,0.393608,0.199571,0.498784,0.500143,0.301899,776.04601
std,119.070653,25.098685,340.217925,127.034232,2.942215,1.147641,1.464928,2.568548,0.455593,0.488568,0.399692,0.500017,0.500018,0.459099,190.432709
min,-99.0,-99.0,-1567.182896,-99.0,-14.0,-4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,310.0
25%,325.0675,35.41,587.525,322.5475,1.0,2.0,1.98,3.0,0.0,0.0,0.0,0.0,0.0,0.0,631.065
50%,394.995,49.205,792.885,395.325,3.0,3.0,2.99,5.0,1.0,0.0,0.0,0.0,1.0,0.0,763.725
75%,464.44,62.8725,1003.4625,464.6525,5.0,4.0,4.0,7.0,1.0,1.0,0.0,1.0,1.0,1.0,905.91
max,786.89,116.95,2697.807683,885.94,17.0,9.0,8.44,9.0,1.0,1.0,1.0,1.0,1.0,1.0,1610.0


In [12]:
numeric_features=[feature for feature in df.columns if df[feature].dtype !='O']
categorical_features=[feature for feature in df.columns if df[feature].dtype =='O']

print(f"Numeric columns are {numeric_features}")
print(f"Categorical columns are {categorical_features}")

Numeric columns are ['electricity_kwh_per_month', 'natural_gas_therms_per_month', 'vehicle_miles_per_month', 'water_usage_liters_per_day', 'public_transport_usage_per_week', 'home_insulation_quality', 'meat_consumption_kg_per_week', 'laundry_loads_per_week', 'recycles_regularly', 'composts_organic_waste', 'uses_solar_panels', 'energy_efficient_appliances', 'owns_pet', 'smart_thermostat_installed', 'carbon_footprint']
Categorical columns are ['ID', 'house_area_sqft', 'household_size', 'heating_type', 'diet_type']


##### errors="coerce" will turn invalid values (e.g., empty strings) into NaN, which we can handle later.

In [13]:
# Convert house_area_sqft to numeric
df["house_area_sqft"] = pd.to_numeric(df["house_area_sqft"], errors="coerce")

# Convert household_size to numeric
df["household_size"] = pd.to_numeric(df["household_size"], errors="coerce")


In [14]:
binary_cols = [
    "recycles_regularly",
    "composts_organic_waste",
    "energy_efficient_appliances",
    "smart_thermostat_installed"
]

# Impute with mode for each column
for col in binary_cols:
    mode_value = df[col].mode()[0]
    df[col] = df[col].fillna(mode_value).astype(int)


In [16]:
df.isnull().sum()

ID                                   0
electricity_kwh_per_month            0
natural_gas_therms_per_month         0
vehicle_miles_per_month              0
house_area_sqft                    419
water_usage_liters_per_day           0
public_transport_usage_per_week      0
household_size                     430
home_insulation_quality              0
meat_consumption_kg_per_week         0
laundry_loads_per_week               0
recycles_regularly                   0
composts_organic_waste               0
uses_solar_panels                    0
energy_efficient_appliances          0
heating_type                         0
diet_type                            0
owns_pet                             0
smart_thermostat_installed           0
carbon_footprint                     0
dtype: int64

In [17]:
# Median imputation
df["house_area_sqft"] = df["house_area_sqft"].fillna(df["house_area_sqft"].median())
df["household_size"] = df["household_size"].fillna(df["household_size"].median())


In [18]:
# Define target column
target_column = "carbon_footprint"

# Split into X and y
X = df.drop(columns=[target_column, "ID"])  # drop ID (not predictive)
y = df[target_column]


In [20]:
X.head()

Unnamed: 0,electricity_kwh_per_month,natural_gas_therms_per_month,vehicle_miles_per_month,house_area_sqft,water_usage_liters_per_day,public_transport_usage_per_week,household_size,home_insulation_quality,meat_consumption_kg_per_week,laundry_loads_per_week,recycles_regularly,composts_organic_waste,uses_solar_panels,energy_efficient_appliances,heating_type,diet_type,owns_pet,smart_thermostat_installed
0,759.7,55.95,944.55,2422.07,541.27,1,3.0,2,4.23,9,1,0,0,1,gas,vegetarian,1,0
1,387.06,70.59,1280.85,1995.3,280.39,1,2.0,1,3.27,8,0,0,0,0,electric,vegetarian,0,0
2,594.25,29.14,1005.72,2673.55,416.14,0,2.0,3,2.87,3,0,1,0,1,electric,omnivore,1,1
3,503.76,74.68,1049.46,2994.28,530.13,0,5.0,1,3.22,9,1,0,0,0,electric,omnivore,1,0
4,549.54,-77.0,756.49,2549.57,604.1,5,4.0,4,2.92,2,1,0,0,1,electric,vegetarian,0,1


In [21]:
y.head()

0     830.10
1     963.08
2     840.11
3    1252.42
4     580.74
Name: carbon_footprint, dtype: float64

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)


In [23]:
# List all columns
all_columns = X.columns.tolist()

# Categorical columns
categorical_cols = ["heating_type", "diet_type"]

# Numeric columns
numeric_cols = [col for col in all_columns if col not in categorical_cols]


In [24]:
print("Numeric Columns:", numeric_cols)
print("Categorical Columns:", categorical_cols)


Numeric Columns: ['electricity_kwh_per_month', 'natural_gas_therms_per_month', 'vehicle_miles_per_month', 'house_area_sqft', 'water_usage_liters_per_day', 'public_transport_usage_per_week', 'household_size', 'home_insulation_quality', 'meat_consumption_kg_per_week', 'laundry_loads_per_week', 'recycles_regularly', 'composts_organic_waste', 'uses_solar_panels', 'energy_efficient_appliances', 'owns_pet', 'smart_thermostat_installed']
Categorical Columns: ['heating_type', 'diet_type']


In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Create ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_cols)
    ]
)


In [26]:
print(preprocessor)


ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['electricity_kwh_per_month',
                                  'natural_gas_therms_per_month',
                                  'vehicle_miles_per_month', 'house_area_sqft',
                                  'water_usage_liters_per_day',
                                  'public_transport_usage_per_week',
                                  'household_size', 'home_insulation_quality',
                                  'meat_consumption_kg_per_week',
                                  'laundry_loads_per_week',
                                  'recycles_regularly',
                                  'composts_organic_waste', 'uses_solar_panels',
                                  'energy_efficient_appliances', 'owns_pet',
                                  'smart_thermostat_installed']),
                                ('cat',
                                 OneHotEncoder(drop='first',
              

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Create the full pipeline
model_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("regressor", RandomForestRegressor(
        n_estimators=100,
        random_state=42
    ))
])


In [28]:
print(model_pipeline)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['electricity_kwh_per_month',
                                                   'natural_gas_therms_per_month',
                                                   'vehicle_miles_per_month',
                                                   'house_area_sqft',
                                                   'water_usage_liters_per_day',
                                                   'public_transport_usage_per_week',
                                                   'household_size',
                                                   'home_insulation_quality',
                                                   'meat_consumption_kg_per_week',
                                                   'laundry_loads_per_week',
                                                   'recycles_regularly',
                                 

In [29]:
model_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessing', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [31]:
from sklearn.metrics import r2_score, mean_squared_error

# Predict on train set
y_train_pred = model_pipeline.predict(X_train)

# Predict on test set
y_test_pred = model_pipeline.predict(X_test)

# Compute metrics
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
test_rmse = mean_squared_error(y_test, y_test_pred)

print(f"Train R2: {train_r2:.4f}")
print(f"Test R2: {test_r2:.4f}")
print(f"Test RMSE: {test_rmse:.2f}")


Train R2: 0.9784
Test R2: 0.8583
Test RMSE: 5207.48




In [33]:
# Load test data
test_df = pd.read_csv("data/test.csv")

# Convert to numeric
test_df["house_area_sqft"] = pd.to_numeric(test_df["house_area_sqft"], errors="coerce")
test_df["household_size"] = pd.to_numeric(test_df["household_size"], errors="coerce")

# Impute numeric columns with median (same logic as train)
test_df["house_area_sqft"] = test_df["house_area_sqft"].fillna(df["house_area_sqft"].median())
test_df["household_size"] = test_df["household_size"].fillna(df["household_size"].median())

# Impute binary columns with mode
binary_cols = [
    "recycles_regularly",
    "composts_organic_waste",
    "energy_efficient_appliances",
    "smart_thermostat_installed"
]

for col in binary_cols:
    mode_value = df[col].mode()[0]
    test_df[col] = test_df[col].fillna(mode_value).astype(int)


In [34]:
test_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 19 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ID                               6000 non-null   object 
 1   electricity_kwh_per_month        6000 non-null   float64
 2   natural_gas_therms_per_month     6000 non-null   float64
 3   vehicle_miles_per_month          6000 non-null   float64
 4   house_area_sqft                  6000 non-null   float64
 5   water_usage_liters_per_day       6000 non-null   float64
 6   public_transport_usage_per_week  6000 non-null   int64  
 7   household_size                   6000 non-null   float64
 8   home_insulation_quality          6000 non-null   int64  
 9   meat_consumption_kg_per_week     6000 non-null   float64
 10  laundry_loads_per_week           6000 non-null   int64  
 11  recycles_regularly               6000 non-null   int64  
 12  composts_organic_was

In [35]:
X_submission = test_df.drop(columns=["ID"])


In [36]:
submission_preds = model_pipeline.predict(X_submission)




In [37]:
submission_df = pd.DataFrame({
    "ID": test_df["ID"],
    "carbon_footprint": submission_preds
})


In [38]:
submission_df.to_csv("submission.csv", index=False)
