LINEAR REGRESSION 

In [None]:
import numpy as np 
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
import matplotlib.pyplot as plt

In [None]:
# Load the raw dataset from the CSV file in the working directory and
# preview its first five rows.
df = pd.read_csv('CO2_Emissions_Canada.csv')
df.head(5)

In [None]:
# Number of distinct values in each column (used to judge cardinality).
df.nunique()

In [None]:
# Count of missing values per column, returned as a NumPy array in column order.
df.isnull().values.sum(axis=0)

In [112]:
class TargetEncoder:
    """Smoothed mean-target encoder for a single categorical column.

    Each category is replaced by a blend of its own target mean and the
    global target mean::

        (count * category_mean + smooth_k * global_mean) / (count + smooth_k)

    so that rare categories are pulled towards the global mean.

    Parameters
    ----------
    target : str
        Name of the target column. Stored for reference only; it is not used
        as a working column name during fitting.
    smooth_k : int or float, default 5
        Smoothing strength: the number of "virtual" observations at the
        global mean that are added to every category.
    """

    def __init__(self, target, smooth_k=5):
        self.target = target
        self.smooth_k = smooth_k
        self.original_feature = None
        # A plain dict keeps an unfitted encoder picklable; before fit() every
        # value is unknown and transform() falls back to global_mean (0).
        self.mapping = {}
        self.global_mean = 0

    def fit(self, X_train, y_train):
        """Learn the smoothed per-category target means.

        Parameters
        ----------
        X_train : pandas.Series
            Categorical feature values. The series may be unnamed.
        y_train : pandas.Series or array-like
            Target values. A Series is aligned to ``X_train`` by index label.

        Returns
        -------
        TargetEncoder
            ``self``, to allow call chaining.
        """
        self.original_feature = X_train.name
        self.global_mean = y_train.mean()

        # Fixed internal column names: using the feature/target names here
        # breaks when the series is unnamed or when both names are equal.
        stats = pd.DataFrame({'_category': X_train})
        stats['_target'] = y_train

        agg = stats.groupby('_category')['_target'].agg(['count', 'mean'])
        weighted_sum = agg['count'] * agg['mean'] + self.smooth_k * self.global_mean
        agg['smooth_mean'] = weighted_sum / (agg['count'] + self.smooth_k)
        self.mapping = agg['smooth_mean'].to_dict()
        return self

    def transform(self, X):
        """Map categories to their learned encodings.

        Categories not seen during ``fit`` (and missing values) are encoded
        as the global target mean.
        """
        return X.map(self.mapping).fillna(self.global_mean)

    def fit_transform(self, X_train, y_train):
        """Fit on ``X_train``/``y_train`` and return the encoded ``X_train``."""
        self.fit(X_train, y_train)
        return self.transform(X_train)
    

In [113]:


def preprocessed_data(df, target_encoder = None, fit_mode = False):
    """Impute and encode the CO2-emissions feature table.

    Steps, in order: fill missing values, target-encode the high-cardinality
    ``'Model'`` column, and integer-encode the low-cardinality categorical
    columns. Original categorical columns are replaced by ``<name>_encoded``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. The frame itself is not modified; a copy is processed.
    target_encoder : dict, optional
        Despite its name, this is the whole ``encoders`` dict returned by an
        earlier ``fit_mode=True`` call (keys ``'target_encoder'``,
        ``'imputers'`` and ``'label_encoders'``). Required when ``fit_mode``
        is False.
    fit_mode : bool, default False
        True  -> learn imputation values and encoders from ``df``.
        False -> reuse the fitted state passed in ``target_encoder``.

    Returns
    -------
    tuple
        ``(X_features, encoders)``. In fit mode ``X_features`` also carries
        the target column; in transform mode the target column is dropped.

    Raises
    ------
    ValueError
        If ``fit_mode`` is False and no fitted state is supplied, or if
        ``fit_mode`` is True and ``df`` has no target column.
    """
    TARGET = 'CO2 Emissions(g/km)'
    HIGH_CARDINALITY_FEATURE = 'Model'
    LOW_CARD_FEATURES = ['Make', 'Vehicle Class', 'Transmission', 'Fuel Type']
    NUMERICAL_FEATURES = ['Engine Size(L)', 'Cylinders', 'Fuel Consumption City (L/100 km)',
                          'Fuel Consumption Hwy (L/100 km)', 'Fuel Consumption Comb (L/100 km)',
                          'Fuel Consumption Comb (mpg)']
    if fit_mode:
        encoders = {
            'target_encoder': TargetEncoder(TARGET, smooth_k=5),
            'imputers': {},
            'label_encoders': {}
        }
    else:
        if target_encoder is None:
            raise ValueError("TargetEncoder must be provided when not in fit_mode.")
        encoders = target_encoder

    df_processed = df.copy()

    has_target = TARGET in df_processed.columns
    if fit_mode and not has_target:
        # Fail early with a clear message instead of crashing inside the encoder.
        raise ValueError(f"Column '{TARGET}' is required when fit_mode is True.")

    y_target = df_processed[TARGET] if has_target else None
    X_features = df_processed.drop(columns=[TARGET]) if has_target else df_processed

    # --- Imputation -------------------------------------------------------
    # Fill values are learned for every available feature in fit mode (mean
    # for numeric columns, mode for categorical ones) so that transform mode
    # can fill gaps even in columns that had none during fitting.
    imputers = encoders['imputers'] if fit_mode else encoders.get('imputers', {})
    all_features = NUMERICAL_FEATURES + LOW_CARD_FEATURES + [HIGH_CARDINALITY_FEATURE]
    for col in all_features:
        if col not in X_features.columns:
            continue
        if fit_mode:
            observed = X_features[col].dropna()
            if observed.empty:
                # Nothing to learn a fill value from.
                continue
            if col in NUMERICAL_FEATURES:
                imputers[col] = observed.mean()
            else:
                imputers[col] = observed.mode()[0]
        if col in imputers:
            X_features[col] = X_features[col].fillna(imputers[col])

    # --- Target encoding of the high-cardinality column -------------------
    encoded_name = HIGH_CARDINALITY_FEATURE + '_encoded'
    if fit_mode:
        X_features[encoded_name] = encoders['target_encoder'].fit_transform(
            X_features[HIGH_CARDINALITY_FEATURE], y_target
        )
    else:
        X_features[encoded_name] = encoders['target_encoder'].transform(
            X_features[HIGH_CARDINALITY_FEATURE]
        )
    X_features = X_features.drop(columns=[HIGH_CARDINALITY_FEATURE])

    # --- Integer encoding of the low-cardinality columns ------------------
    for col in LOW_CARD_FEATURES:
        if col not in X_features.columns:
            continue
        if fit_mode:
            le = LabelEncoder()
            X_features[col + '_encoded'] = le.fit_transform(X_features[col])
            encoders['label_encoders'][col] = le
        else:
            le = encoders['label_encoders'][col]
            # LabelEncoder.transform raises on unseen labels, so map manually
            # and send categories not seen during fitting to -1.
            mapping = {cls: i for i, cls in enumerate(le.classes_)}
            X_features[col + '_encoded'] = X_features[col].map(mapping).fillna(-1)

        X_features = X_features.drop(columns=[col])

    if fit_mode:
        # Re-attach the target so the caller can split X / y from one frame.
        X_features[TARGET] = y_target
    return X_features, encoders
    

In [114]:
# Shuffle all rows reproducibly, then cut the shuffled frame into a leading
# train part and a trailing test part with fresh 0..n-1 indices.
train_ratio = 0.75
shuffle_df = df.sample(frac=1, random_state=42)
split_index = int(len(shuffle_df) * train_ratio)
train_df = shuffle_df.iloc[:split_index].reset_index(drop=True)
test_df = shuffle_df.iloc[split_index:].reset_index(drop=True)

In [None]:
# (rows, columns) of the training split.
train_df.shape

In [None]:
# (rows, columns) of the test split.
test_df.shape

In [117]:
# Learn imputation values and encoders on the training split only.
train_processed, encoders = preprocessed_data(train_df, fit_mode=True)
# Reuse the train-fitted state for the test split. Refitting on test data
# would leak the test target into the 'Model' encoding and could assign
# different integer codes to the same category in train and test.
test_processed, _ = preprocessed_data(test_df, target_encoder=encoders, fit_mode=False)

In [None]:
# Preview the first five processed training rows.
train_processed.head(5)

In [119]:
TARGET = 'CO2 Emissions(g/km)'

# Training split: target column out, everything else is a feature.
y_train = train_processed[TARGET]
X_train = train_processed.drop(columns=[TARGET])

# Test split: the processed frame may or may not carry the target, so drop
# it leniently and read the ground truth from the raw test frame instead.
y_test = test_df[TARGET]
X_test = test_processed.drop(columns=[TARGET], errors='ignore')

In [120]:
# Ordinary least squares baseline: fit on the training features, then
# predict the test set. (fit() returns the estimator itself.)
linReg = LinearRegression().fit(X_train, y_train)
predict = linReg.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

# Test-set error (MSE) and goodness of fit (R^2) of the linear baseline.
mse, r2 = mean_squared_error(y_test, predict), r2_score(y_test, predict)
print(mse)
print(r2)

In [None]:
alpha_ridge = 1  # A common starting value. You can tune this later!
print(f"\n--- 3B. Training Ridge Regression Model (Alpha={alpha_ridge}) ---")

# Fit an L2-regularised model on the same features as the linear baseline.
model_ridge = Ridge(alpha=alpha_ridge)
model_ridge.fit(X_train, y_train)

# Predict and score on the test set.
y_pred_ridge = model_ridge.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)
print(f"Ridge R2 Score: {r2_ridge:.4f} | MSE: {mse_ridge:.2f}")

# Keep whichever model has the higher test-set R^2 (ties go to the plain
# linear model). NOTE: choosing on the test set makes the reported score
# optimistic; a separate validation split would give an unbiased estimate.
best_X_train = X_train  # both candidates were trained on the same features
if r2_ridge > r2:
    print("🎉 Ridge Regression performed better on the test set!")
    best_model = model_ridge
    r2_best = r2_ridge
    mse_best = mse_ridge
else:
    print("Standard Linear Regression performed better or equally well.")
    best_model = linReg
    r2_best = r2
    mse_best = mse

print(f"BEST MODEL R2 Score: {r2_best:.4f}, MSE: {mse_best:.2f}")

In [123]:
def plot_actual_vs_predicted(y_test, y_pred, model_name):
    """Scatter predicted against actual values with a y = x reference line.

    Parameters
    ----------
    y_test : array-like with ``.min()`` / ``.max()``
        Actual target values (x axis).
    y_pred : array-like with ``.min()`` / ``.max()``
        Predicted target values (y axis).
    model_name : str
        Shown in the plot title.
    """
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.7, color='teal', edgecolors='w')

    # The diagonal spans the full range covered by either series.
    lower = min(y_test.min(), y_pred.min())
    upper = max(y_test.max(), y_pred.max())
    diagonal = [lower, upper]
    plt.plot(
        diagonal,
        diagonal,
        color='orangered',
        linestyle='--',
        linewidth=2,
        label='Perfect Prediction',
    )

    plt.title(f'Actual vs. Predicted CO2 Emissions ({model_name})', fontsize=16)
    plt.xlabel('Actual CO2 Emissions (g/km)', fontsize=14)
    plt.ylabel('Predicted CO2 Emissions (g/km)', fontsize=14)
    plt.grid(True, linestyle=':', alpha=0.6)
    plt.legend()
    plt.show()
    
def plot_coefficients(coefficients, model_name):
    """Draw the model coefficients as a bar chart, largest first.

    Parameters
    ----------
    coefficients : pandas.Series
        Coefficient values indexed by feature name.
    model_name : str
        Shown in the plot title.
    """
    ordered = coefficients.sort_values(ascending=False)

    # Strictly positive coefficients are green, everything else is red.
    bar_colors = ordered.gt(0).map({True: 'green', False: 'red'}).tolist()

    plt.figure(figsize=(12, 6))
    ordered.plot.bar(color=bar_colors)

    plt.title(f'Model Coefficients for {model_name}', fontsize=16)
    plt.ylabel('Coefficient Value', fontsize=14)
    plt.xlabel('Feature', fontsize=14)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

In [None]:
# Pair every fitted coefficient with its feature name, then list them from
# most negative to most positive.
feature_names = best_X_train.columns
coefficients = pd.Series(best_model.coef_, index=feature_names)
print(coefficients.sort_values())

In [None]:
# Human-readable label for whichever model was selected.
if isinstance(best_model, Ridge):
    best_model_name = "Ridge Regression"
else:
    best_model_name = "LinearRegression"

# Diagnostic plots for the selected model on the test set.
y_pred_best = best_model.predict(X_test)
plot_actual_vs_predicted(y_test, y_pred_best, best_model_name)
plot_coefficients(coefficients, best_model_name)