# In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 1. Load Processed Data
# Both paths are relative, so they resolve against the current working
# directory. The train file is read first, then the test file.
print("Loading processed datasets...")
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_processed.csv', '../data/test_processed.csv')
)

# 2. Preprocessing (Encoding Categoricals)
cat_cols = ['region_code', 'country_code']

# Stack the categorical columns of both frames so that each encoder is fitted
# on every value occurring in either the train or the test data; this keeps
# the integer codes consistent between the two frames.
all_data = pd.concat([train_df[cat_cols], test_df[cat_cols]])
for col in cat_cols:
    # Values are cast to str so the encoder always sees one uniform dtype.
    le = LabelEncoder().fit(all_data[col].astype(str))
    # Overwrite the column in place in both frames with its integer codes.
    for frame in (train_df, test_df):
        frame[col] = le.transform(frame[col].astype(str))

# 3. Define Features (X) and Targets (y)
# The identifier/name columns and both target columns are excluded from the
# training feature matrix.
drop_cols = ['entity_id', 'region_name', 'country_name', 'target_scope_1', 'target_scope_2']
X = train_df.drop(columns=drop_cols)

# Align the test features with the training features in one step. reindex
# keeps only the columns present in X, in X's order, and creates any column
# the test set lacks (e.g. from pivot differences) filled with 0.
X_test = (
    test_df
    .drop(columns=['entity_id', 'region_name', 'country_name'])
    .reindex(columns=X.columns, fill_value=0)
)

# Log Transform Targets (Crucial for Emissions Data)
# The models are trained on log1p(target); predictions are mapped back with
# expm1 before the submission is written.
# NOTE(review): assumes both targets are >= 0 (log1p is NaN below -1) — confirm.
y_scope1 = np.log1p(train_df['target_scope_1'])
y_scope2 = np.log1p(train_df['target_scope_2'])

# 4. Model Training (Random Forest)
# Both scopes use the same hyperparameters, so they are declared once. The
# fixed random_state keeps the two forests seeded identically across runs.
rf_params = {'n_estimators': 200, 'max_depth': 10, 'random_state': 42}

# One independent regressor per target, each fitted on the log-transformed
# target; fit() returns the estimator itself, so it can be chained.
print("Training Scope 1 Model...")
model_s1 = RandomForestRegressor(**rf_params).fit(X, y_scope1)

print("Training Scope 2 Model...")
model_s2 = RandomForestRegressor(**rf_params).fit(X, y_scope2)

# 5. Inference & Submission
# The models were fitted on log1p(target), so their raw outputs are in log
# space.
print("Generating predictions...")
pred_s1_log = model_s1.predict(X_test)
pred_s2_log = model_s2.predict(X_test)

# Inverse Transform (Log -> Normal): expm1 undoes the log1p applied to the
# training targets.
final_s1, final_s2 = np.expm1(pred_s1_log), np.expm1(pred_s2_log)

# Create Submission File
# Start from the test identifiers and attach one prediction column per
# target; the prediction arrays are matched to the rows by position.
submission = test_df[['entity_id']].assign(
    target_scope_1=final_s1,
    target_scope_2=final_s2,
)

# The relative path means the file lands in the current working directory.
submission.to_csv('submission.csv', index=False)
print("Done! 'submission.csv' created in notebooks folder.")