In [1]:
!pip install scikit-learn
!pip install xgboost



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBRegressor

In [3]:
# Load the data from the raw data directory
df = pd.read_csv('../data/processed/featured_data.csv')

In [4]:
features = ['Incumbent', 'N_Cand', 'Year', 'Party','Alliance','State_Name','Constituency_Type']
target = 'Vote_Share'

In [5]:
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

display(X_train.head())
display(y_train.head())
display(X_test.head())
display(y_test.head())

Unnamed: 0,Incumbent,N_Cand,Year,Party,Alliance,State_Name,Constituency_Type
55292,False,24,1991,ind,Independent,bihar,GEN
50194,False,10,1996,ind,Independent,tamil_nadu,GEN
58265,False,21,1991,ddp,Other,madhya_pradesh,GEN
75994,False,10,1980,jnp,Other,haryana,GEN
24522,False,18,2009,ind,Independent,uttar_pradesh,GEN


55292     0.196768
50194     0.258802
58265     0.380763
75994    34.366784
24522     0.410724
Name: Vote_Share, dtype: float64

Unnamed: 0,Incumbent,N_Cand,Year,Party,Alliance,State_Name,Constituency_Type
65156,False,11,1989,ind,Independent,karnataka,GEN
39747,False,14,1998,pmsp,Other,uttar_pradesh,GEN
4402,False,17,2019,bsp,Other,maharashtra,GEN
33422,False,7,1999,jmm,UPA,odisha,GEN
71754,False,12,1984,bjp,NDA,madhya_pradesh,GEN


65156     0.760724
39747     0.206457
4402      1.083098
33422     0.536666
71754    28.589783
Name: Vote_Share, dtype: float64

In [6]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Drop rows with NaN values in y_train and corresponding rows in X_train
nan_mask = y_train.isna()
X_train = X_train[~nan_mask]
y_train = y_train[~nan_mask]


# Create the preprocessing steps for categorical and numerical features
categorical_features = ['Party', 'State_Name', 'Constituency_Type']
numerical_features = ['Incumbent', 'N_Cand', 'Year']

# Create the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])


# Create the Linear Regression pipeline
linear_regression_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                             ('regressor', LinearRegression())])


In [7]:
# Train the Linear Regression model
linear_regression_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [8]:
# Make predictions on the test data
y_pred_lr = linear_regression_pipeline.predict(X_test)

# Evaluate the Linear Regression model
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print(f"Linear Regression MSE: {mse_lr}")
print(f"Linear Regression R2 Score: {r2_lr}")

Linear Regression MSE: 71.38674384583474
Linear Regression R2 Score: 0.6972998368332088


## Ridge model

In [9]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Create the Ridge Regression pipeline
ridge_regression_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                            ('regressor', Ridge())])

# Train the Ridge Regression model
ridge_regression_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [10]:
# Make predictions on the test data
y_pred_ridge = ridge_regression_pipeline.predict(X_test)

# Evaluate the Ridge Regression model
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print(f"Ridge Regression MSE: {mse_ridge}")
print(f"Ridge Regression R2 Score: {r2_ridge}")

Ridge Regression MSE: 71.52587530213525
Ridge Regression R2 Score: 0.6967098797591791


# Lasso model 

In [11]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Create the Lasso Regression pipeline
lasso_regression_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                            ('regressor', Lasso())])

# Train the Lasso Regression model
lasso_regression_pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,alpha,1.0
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [12]:
# Make predictions on the test data
y_pred_lasso = lasso_regression_pipeline.predict(X_test)

# Evaluate the Lasso Regression model
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

print(f"Lasso Regression MSE: {mse_lasso}")
print(f"Lasso Regression R2 Score: {r2_lasso}")

Lasso Regression MSE: 153.24794014057827
Lasso Regression R2 Score: 0.3501850065369757


In [13]:
print("Model Performance Comparison:")
print(f"Linear Regression - MSE: {mse_lr:.4f}, R2 Score: {r2_lr:.4f}")
print(f"Ridge Regression - MSE: {mse_ridge:.4f}, R2 Score: {r2_ridge:.4f}")
print(f"Lasso Regression - MSE: {mse_lasso:.4f}, R2 Score: {r2_lasso:.4f}")

print("\nDiscussion:")
print("Based on the Mean Squared Error (MSE) and R-squared (R2) scores:")
print("- Linear Regression has the lowest MSE and the highest R2 score, indicating it explains the most variance in the target variable and has the lowest prediction error on the test set.")
print("- Ridge Regression has slightly higher MSE and slightly lower R2 compared to Linear Regression, suggesting that the L2 regularization had a minor impact on performance in this case.")
print("- Lasso Regression has a significantly higher MSE and a much lower R2 score compared to both Linear Regression and Ridge Regression. This suggests that the L1 regularization, which can lead to feature selection by shrinking some coefficients to zero, did not improve the model's performance on this dataset and might have removed some important features or the default alpha value is not optimal.")
print("\nConclusion:")
print("In this comparison, the standard Linear Regression model performed the best among the three in terms of both MSE and R2 score on the test data.")

Model Performance Comparison:
Linear Regression - MSE: 71.3867, R2 Score: 0.6973
Ridge Regression - MSE: 71.5259, R2 Score: 0.6967
Lasso Regression - MSE: 153.2479, R2 Score: 0.3502

Discussion:
Based on the Mean Squared Error (MSE) and R-squared (R2) scores:
- Linear Regression has the lowest MSE and the highest R2 score, indicating it explains the most variance in the target variable and has the lowest prediction error on the test set.
- Ridge Regression has slightly higher MSE and slightly lower R2 compared to Linear Regression, suggesting that the L2 regularization had a minor impact on performance in this case.
- Lasso Regression has a significantly higher MSE and a much lower R2 score compared to both Linear Regression and Ridge Regression. This suggests that the L1 regularization, which can lead to feature selection by shrinking some coefficients to zero, did not improve the model's performance on this dataset and might have removed some important features or the default alpha v

## Perform Grid search CV on ridge and lasso

In [None]:
param_grid_ridge = {'regressor__alpha': [0.1,0.5, 1.0, 10.0, 100.0]}
param_grid_lasso = {'regressor__alpha': [0.001, 0.01, 0.1, 1.0]}

In [15]:
from sklearn.model_selection import GridSearchCV

# Perform Grid Search Cross-Validation for Ridge Regression
grid_search_ridge = GridSearchCV(ridge_regression_pipeline, param_grid_ridge, cv=5, scoring='r2')
grid_search_ridge.fit(X_train, y_train)

0,1,2
,estimator,"Pipeline(step...r', Ridge())])"
,param_grid,"{'regressor__alpha': [0.1, 0.5, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,alpha,0.5
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [16]:
print("Best parameters for Ridge Regression:", grid_search_ridge.best_params_)
print("Best R2 score for Ridge Regression:", grid_search_ridge.best_score_)

Best parameters for Ridge Regression: {'regressor__alpha': 0.5}
Best R2 score for Ridge Regression: 0.6869674996286083


In [17]:
from sklearn.metrics import mean_squared_error, r2_score

# Get the best Ridge model from Grid Search
best_ridge_model = grid_search_ridge.best_estimator_

# Make predictions on the test data
y_pred_best_ridge = best_ridge_model.predict(X_test)

# Evaluate the best Ridge Regression model
mse_best_ridge = mean_squared_error(y_test, y_pred_best_ridge)
r2_best_ridge = r2_score(y_test, y_pred_best_ridge)

print(f"Best Ridge Regression MSE on Test Data: {mse_best_ridge:.4f}")
print(f"Best Ridge Regression R2 Score on Test Data: {r2_best_ridge:.4f}")

Best Ridge Regression MSE on Test Data: 71.4550
Best Ridge Regression R2 Score on Test Data: 0.6970


In [18]:
from sklearn.model_selection import GridSearchCV

# Perform Grid Search Cross-Validation for Lasso Regression
grid_search_lasso = GridSearchCV(lasso_regression_pipeline, param_grid_lasso, cv=5, scoring='r2')
grid_search_lasso.fit(X_train, y_train)

0,1,2
,estimator,"Pipeline(step...r', Lasso())])"
,param_grid,"{'regressor__alpha': [0.001, 0.01, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,alpha,0.001
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [19]:
print("Best parameters for Lasso Regression:", grid_search_lasso.best_params_)
print("Best R2 score for Lasso Regression:", grid_search_lasso.best_score_)

Best parameters for Lasso Regression: {'regressor__alpha': 0.001}
Best R2 score for Lasso Regression: 0.6837953858804714


In [20]:
# Get the best Lasso model from Grid Search
best_lasso_model = grid_search_lasso.best_estimator_

# Make predictions on the test data
y_pred_best_lasso = best_lasso_model.predict(X_test)

# Evaluate the best Lasso Regression model
mse_best_lasso = mean_squared_error(y_test, y_pred_best_lasso)
r2_best_lasso = r2_score(y_test, y_pred_best_lasso)

print(f"Best Lasso Regression MSE on Test Data: {mse_best_lasso:.4f}")
print(f"Best Lasso Regression R2 Score on Test Data: {r2_best_lasso:.4f}")

Best Lasso Regression MSE on Test Data: 72.4179
Best Lasso Regression R2 Score on Test Data: 0.6929


In [21]:
print("Best parameters for Ridge Regression (from Grid Search):", grid_search_ridge.best_params_)
print("Best R2 score for Ridge Regression (from Grid Search):", grid_search_ridge.best_score_)
print("\nBest parameters for Lasso Regression (from Grid Search):", grid_search_lasso.best_params_)
print("Best R2 score for Lasso Regression (from Grid Search):", grid_search_lasso.best_score_)

print("\nModel Performance Comparison on Test Data (after Grid Search):")
print(f"Best Ridge Regression - MSE: {mse_best_ridge:.4f}, R2 Score: {r2_best_ridge:.4f}")
print(f"Best Lasso Regression - MSE: {mse_best_lasso:.4f}, R2 Score: {r2_best_lasso:.4f}")

print("\nSummary:")
print("After performing Grid Search Cross-Validation to tune the hyperparameters:")
print(f"- The best alpha for Ridge Regression was found to be {grid_search_ridge.best_params_['regressor__alpha']}.")
print(f"- The best alpha for Lasso Regression was found to be {grid_search_lasso.best_params_['regressor__alpha']}.")
print("\nComparing the performance on the test set:")
if r2_best_ridge > r2_best_lasso:
    print("- The best Ridge Regression model performed slightly better than the best Lasso Regression model, with a higher R2 score and lower MSE.")
    print("This suggests that for this dataset and feature set, L2 regularization (Ridge) was more effective than L1 regularization (Lasso) after tuning, or that the chosen alpha values for Lasso were not optimal.")
elif r2_best_lasso > r2_best_ridge:
    print("- The best Lasso Regression model performed slightly better than the best Ridge Regression model, with a higher R2 score and lower MSE.")
    print("This suggests that L1 regularization (Lasso), which can lead to sparsity by shrinking some coefficients to zero, might be slightly more suitable for this dataset after tuning.")
else:
    print("- The performance of the best Ridge and Lasso Regression models is very similar on the test set.")
print("\nImpact of Hyperparameter Tuning:")
print("Hyperparameter tuning with Grid Search helped identify potentially better 'alpha' values for both Ridge and Lasso models compared to using default values. The performance metrics on the test set provide a more reliable estimate of how the models would perform on unseen data with their optimized hyperparameters.")

Best parameters for Ridge Regression (from Grid Search): {'regressor__alpha': 0.5}
Best R2 score for Ridge Regression (from Grid Search): 0.6869674996286083

Best parameters for Lasso Regression (from Grid Search): {'regressor__alpha': 0.001}
Best R2 score for Lasso Regression (from Grid Search): 0.6837953858804714

Model Performance Comparison on Test Data (after Grid Search):
Best Ridge Regression - MSE: 71.4550, R2 Score: 0.6970
Best Lasso Regression - MSE: 72.4179, R2 Score: 0.6929

Summary:
After performing Grid Search Cross-Validation to tune the hyperparameters:
- The best alpha for Ridge Regression was found to be 0.5.
- The best alpha for Lasso Regression was found to be 0.001.

Comparing the performance on the test set:
- The best Ridge Regression model performed slightly better than the best Lasso Regression model, with a higher R2 score and lower MSE.
This suggests that for this dataset and feature set, L2 regularization (Ridge) was more effective than L1 regularization (La

## XGBoost boosting techniques

In [22]:
# Create the XGBoost Regression pipeline
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', XGBRegressor(random_state=42))])

In [23]:
# Define a smaller parameter grid for XGBoost to start
param_grid_xgb = {
    'regressor__n_estimators': [100,200, 300,400, 500],
    'regressor__learning_rate': [0.01, 0.1],
    'regressor__max_depth': [3,4, 5,6],
    'regressor__subsample': [0.7, 1.0]
}

# Perform Grid Search Cross-Validation for XGBoost
# We use 'n_jobs=-1' to use all available CPU cores, which speeds things up
grid_search_xgb = GridSearchCV(xgb_pipeline, 
                               param_grid_xgb, 
                               cv=5, 
                               scoring='r2', 
                               n_jobs=-1, 
                               verbose=1)

print("Starting XGBoost Grid Search... (This may take a few minutes)")
grid_search_xgb.fit(X_train, y_train)
print("XGBoost Grid Search complete.")

Starting XGBoost Grid Search... (This may take a few minutes)
Fitting 5 folds for each of 80 candidates, totalling 400 fits
XGBoost Grid Search complete.


In [24]:
# Print the best parameters found
print("Best parameters for XGBoost Regression:", grid_search_xgb.best_params_)
print("Best R2 score for XGBoost Regression (from CV):", grid_search_xgb.best_score_)

# Get the best XGBoost model from Grid Search
best_xgb_model = grid_search_xgb.best_estimator_

# Make predictions on the test data
y_pred_best_xgb = best_xgb_model.predict(X_test)

# Evaluate the best XGBoost Regression model
mse_best_xgb = mean_squared_error(y_test, y_pred_best_xgb)
r2_best_xgb = r2_score(y_test, y_pred_best_xgb)

print(f"\nBest XGBoost Regression MSE on Test Data: {mse_best_xgb:.4f}")
print(f"Best XGBoost Regression R2 Score on Test Data: {r2_best_xgb:.4f}")

Best parameters for XGBoost Regression: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 6, 'regressor__n_estimators': 500, 'regressor__subsample': 0.7}
Best R2 score for XGBoost Regression (from CV): 0.8601422202936773

Best XGBoost Regression MSE on Test Data: 31.1722
Best XGBoost Regression R2 Score on Test Data: 0.8678


In [25]:
import joblib
import os
from pathlib import Path

output_dir = Path('../models')
# Per your request, we are saving the XGBoost model to this specific path
output_path = os.path.join(output_dir, 'best_ridge_model.joblib')

os.makedirs(output_dir, exist_ok=True)

# Save the best XGBoost model
joblib.dump(best_xgb_model, output_path)

print(f"Best XGBoost model saved to: {output_path}")

Best XGBoost model saved to: ..\models\best_ridge_model.joblib


In [26]:
# import joblib
# import os
# from pathlib import Path

# output_dir = Path('../models')
# output_path = os.path.join(output_dir, 'best_ridge_model.joblib')

# os.makedirs(output_dir, exist_ok=True)

# joblib.dump(best_ridge_model, output_path)

# # print(f"Best Ridge model saved to: {output_path}")