In [1]:
# Create the approach.txt file
approach_text = """
Project: Carbon Footprint Estimation

Tools Used:
- Python
- Pandas
- Numpy
- Scikit-learn (RandomForestRegressor)

Approach:
1. Loaded the provided train and test datasets.
2. Performed preprocessing:
   - One-hot encoded categorical variables ('heating_type', 'diet_type').
   - Selected only numerical features.
3. Trained a Random Forest Regressor model.
4. Predicted carbon footprint for test data.

Feature Engineering:
- Encoded categorical columns.
- Selected numeric features.

Evaluation:
- R2 Score was used to evaluate performance during validation split.

Final Step:
- Submission CSV was created in correct format: [ID, carbon_footprint].

Notes:
- RandomForestRegressor with random_state=42.
- No missing values found in datasets.
"""

with open('approach.txt', 'w') as f:
    f.write(approach_text)

print("✅ approach.txt created!")


✅ approach.txt created!


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score


In [3]:
# 1. Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# # 2. Preprocessing
# categorical_cols = ['heating_type', 'diet_type']
# for col in categorical_cols:
#     le = LabelEncoder()
#     train[col] = le.fit_transform(train[col])
#     test[col] = le.transform(test[col])
# 🛠 Safe One-Hot Encoding (no crash)
full_data = pd.concat([train.drop(['carbon_footprint'], axis=1), test], axis=0)

# One-hot encode categorical features
full_data = pd.get_dummies(full_data, columns=['heating_type', 'diet_type'])

# Split back
train_encoded = full_data.iloc[:train.shape[0], :]
test_encoded = full_data.iloc[train.shape[0]:, :]

# Final features
X = train_encoded.drop(['ID'], axis=1)
y = train['carbon_footprint']
test_features = test_encoded.drop(['ID'], axis=1)

# 3. Separate features and target
X = train.drop(['ID', 'carbon_footprint'], axis=1)
y = train['carbon_footprint']
test_features = test.drop(['ID'], axis=1)

# 4. (Optional) Train-test split for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Check for non-numeric columns
non_numeric_cols = X_train.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_cols)

# Look at weird values inside those columns
for col in non_numeric_cols:
    print(f"Unique values in {col}: {X_train[col].unique()}")
X_train = X_train.select_dtypes(include=[np.number])
X_val = X_val.select_dtypes(include=[np.number])
test_features = test_features.select_dtypes(include=[np.number])


# Only numeric features
X_train = X_train.select_dtypes(include=[np.number])
X_val = X_val.select_dtypes(include=[np.number])
test_features = test_features.select_dtypes(include=[np.number])

# Now model training
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# 5. Model training
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# 6. Validation
val_preds = model.predict(X_val)
score = r2_score(y_val, val_preds)
print("Validation R2 Score:", score)

# 7. Predict for test data
test_preds = model.predict(test_features)

# 8. Create submission file
submission = pd.DataFrame({'ID': test['ID'], 'carbon_footprint': test_preds})
submission.to_csv('submission.csv', index=False)


Non-numeric columns: Index(['house_area_sqft', 'household_size', 'heating_type', 'diet_type'], dtype='object')
Unique values in house_area_sqft: ['2071.62' '1545.31' '2568.69' ... '2549.02' '2576.97' '1763.05']
Unique values in household_size: ['8' '3' '2' '6' '1' '4' '5' '7' '_3&BJ' '8A:v|' 'U(+rW' '.Lve(' 't94wg'
 'g%Ygm' '9' 'V5MG0' '`|f"(' '?S\\<<' '75@Aa' 'nsF=>' 'H~c)|' ',-Y=p'
 'C+#QI' 'w}1=^' ']Fok^' 'Exrni' '*Mrwz' 'Tj5`5' 'i/@i%' 'NS&F!' '&}%ZS'
 '$pO&I' 't`{&4' '+#"JP' 'vwWrA' 'abGL%' '10' 'P`(`D' '[>8b1' 'oS$mL'
 "/XXN'" 'u]_o~' '29sS0' '}L@5{' 'U`Eq0' 'iemn%' 'Hk&,$' 'Npgwo' '@|&A}'
 'q=T=c' '(H9kM' 'w?n9s' './"Qq' '8fL[5' 'f(Sqd' ')c,zL' '`+=]_' '&V|2G'
 'aYc@0' 'QgFsg' '6X9eI' '0z\\>s' 'I<@*(' 'y|"5}' 'M56jV' 'G,/Aj' 'xe6oY'
 ']H`mb' 'CG`57' ')DpW(' '5~|L$' 'kO\\tG' ']1)q(' 'J)\\e7' 'X+i;c' 'EFj@['
 'q/IrZ' 'e;[lj' 'eKjw`' 'AUJ"o' "9@'R>" '1Ab"^' "L'm6J" 'keWOm' ',X)dx'
 'Q{gp@' 'hv75)' '?%W:p' 'h*_]1' '?cX:}' 'Rnt^Z' 'Rl]ca' 'pW=AX' 'YGxQF'
 'rrrCy' 'c/^R6' '/rEk*' '[L;

In [4]:
# 7. Predict on the test set
test_preds = model.predict(test_features)


In [None]:
# 8. Create Submission File

submission = pd.DataFrame({
    'ID': test['ID'],  # IDs from test.csv
    'carbon_footprint': test_preds  # Your model predictions
})

submission.to_csv('submission.csv', index=False)  # Save CSV (no index column)
