In [5]:
# 📦 Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# ✅ Step 1: Load the CSV
df = pd.read_csv('SupplyChainEmissionFactorsforUSIndustriesCommodities2015_Summary (1) (2).csv')

# ✅ Step 2: Clean the data
df.drop(columns=[col for col in df.columns if 'Unnamed' in col], inplace=True)
df.dropna(subset=['Supply Chain Emission Factors with Margins'], inplace=True)

# ✅ Step 3: Encode 'Substance' column
substance_map = {'carbon dioxide': 0, 'methane': 1, 'nitrous oxide': 2, 'other GHGs': 3}
df['Substance'] = df['Substance'].map(substance_map)

# ✅ Step 4: Select features and target
df['Year'] = 2015  # Since it's only for 2015
features = ['Substance', 'Year']
target = 'Supply Chain Emission Factors with Margins'

X = df[features]
y = df[target]

# ✅ Step 5: Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ✅ Step 6: Train the model
model = RandomForestRegressor()
model.fit(X_scaled, y)

# ✅ Step 7: Evaluate the model
y_pred = model.predict(X_scaled)
print("🎯 R2 Score:", r2_score(y, y_pred))
print("📉 MSE:", mean_squared_error(y, y_pred))

# ✅ Step 8: Save preprocessed data and model
preprocessed_df = pd.DataFrame(X_scaled, columns=features)
preprocessed_df['Target'] = y.values
preprocessed_df.to_csv('preprocessed_data.csv', index=False)
joblib.dump(model, 'model.pkl')

print("✅ Files saved: preprocessed_data.csv and model.pkl")


🎯 R2 Score: 0.2595532698383106
📉 MSE: 0.05250554049996335
✅ Files saved: preprocessed_data.csv and model.pkl
