In [41]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV,train_test_split

from sklearn.preprocessing import LabelEncoder
import pickle


In [7]:
# Path of the input CSV, held in one named constant so it is easy to repoint.
DATA_PATH = '/content/seep_mela_train_clean.csv'
df = pd.read_csv(DATA_PATH)

In [33]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
# NOTE(review): the recorded output below lists Course_encoded, which is only
# created in a later cell, so this cell was last run out of order; a fresh
# top-to-bottom run would presumably show only the four CSV columns here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Year                360 non-null    int64 
 1   Course              360 non-null    object
 2   Participants        360 non-null    int64 
 3   Sessions_Conducted  360 non-null    int64 
 4   Course_encoded      360 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 14.2+ KB


In [8]:
# Peek at three random rows. The fixed seed (same value as the train/test split)
# keeps this preview identical on every Restart & Run All.
df.sample(3, random_state=42)

Unnamed: 0,Year,Course,Participants,Sessions_Conducted
52,2081,Beautician,340,4
75,2081,Plumbing,182,2
181,2082,Tailoring,185,2


In [26]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

Unnamed: 0,0
Year,0
Course,0
Participants,0
Sessions_Conducted,0
Course_encoded,0


In [9]:
# Turn each course name into an integer code. The fitted encoder stays in `le`
# so the name -> code mapping can be looked up again later.
# NOTE(review): these codes are later used as a numeric feature for linear
# models, which treats the categories as ordered — confirm this is intended
# (one-hot encoding is the usual alternative).
le = LabelEncoder()
course_codes = le.fit_transform(df['Course'])
df['Course_encoded'] = course_codes

In [11]:
# List the distinct course codes in order of first appearance.
pd.unique(df['Course_encoded'])

array([4, 0, 3, 1, 2, 5])

In [15]:
# Predictors: year, encoded course and number of sessions conducted.
feature_columns = ['Year', 'Course_encoded', 'Sessions_Conducted']
X = df[feature_columns]

# The target is selected with a list, so y stays a one-column DataFrame (2-D).
target_columns = ['Participants']
y = df[target_columns]

In [16]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Baseline: ordinary least squares fitted on the training split.
base_model = LinearRegression().fit(X_train, y_train)

# Tuned candidate: Ridge regression whose regularisation strength (alpha) is
# picked by 5-fold cross-validation, scored on negative mean squared error.
params = {'alpha': [0.1, 1.0, 10.0, 100.0]}
tuned_model = GridSearchCV(
    estimator=Ridge(),
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
)
tuned_model.fit(X_train, y_train)

In [25]:
# Report the hyperparameters selected by the grid search.
best_params = tuned_model.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'alpha': 1.0}


In [54]:
# GridSearchCV refits the best configuration on the whole training split by
# default (refit=True), so the winning Ridge model already exists as
# `best_estimator_`. Reuse it instead of constructing and fitting a second,
# identical model by hand.
best_alpha = tuned_model.best_params_['alpha']
print(best_alpha)
final_tuned_model = tuned_model.best_estimator_

# Bare last expression: show the fitted estimator as the cell output.
final_tuned_model

1.0


In [55]:
# Evaluating the base model on the training and test splits.

# Predictions for both splits.
y_pred_train_base, y_pred_test_base = (
    base_model.predict(features) for features in (X_train, X_test)
)

# Mean absolute error and R^2 for each split.
mae_train_base, r2_train_base = (
    metric(y_train, y_pred_train_base) for metric in (mean_absolute_error, r2_score)
)
mae_test_base, r2_test_base = (
    metric(y_test, y_pred_test_base) for metric in (mean_absolute_error, r2_score)
)

# Report in the same order as before: MAE first, then R^2.
for label, value in (
    ("MAE for base train:", mae_train_base),
    ("MAE for base test:", mae_test_base),
    ("R2 for base train:", r2_train_base),
    ("R2 for base test:", r2_test_base),
):
    print(label, value)

MAE for base train: 9.834406301403448
MAE for base test: 10.456505190830931
R2 for base train: 0.977899736750578
R2 for base test: 0.9783486769511994


In [56]:
# Evaluating the tuned (Ridge) model on the training and test splits.

# Predictions for both splits.
y_pred_train_tuned, y_pred_test_tuned = (
    final_tuned_model.predict(features) for features in (X_train, X_test)
)

# Mean absolute error and R^2 for each split.
mae_train_tuned, r2_train_tuned = (
    metric(y_train, y_pred_train_tuned) for metric in (mean_absolute_error, r2_score)
)
mae_test_tuned, r2_test_tuned = (
    metric(y_test, y_pred_test_tuned) for metric in (mean_absolute_error, r2_score)
)

# Report in the same order as before: MAE first, then R^2.
for label, value in (
    ("MAE for tuned train:", mae_train_tuned),
    ("MAE for tuned test:", mae_test_tuned),
    ("R2 for tuned train:", r2_train_tuned),
    ("R2 for tuned test:", r2_test_tuned),
):
    print(label, value)

MAE for tuned train: 9.843248385191318
MAE for tuned test: 10.4592607537462
R2 for tuned train: 0.9778915967632139
R2 for tuned test: 0.9783320266700087


In [44]:
# Persist the fitted baseline model to disk as a pickle file.
with open('base_model.pkl', mode='wb') as model_file:
    pickle.dump(base_model, model_file)

In [57]:
# Persist the fitted Ridge model to disk as a pickle file.
with open('final_tuned_model.pkl', mode='wb') as model_file:
    pickle.dump(final_tuned_model, model_file)

In [46]:
# Reload the pickled baseline model from disk; the prediction cell uses this copy.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
with open('base_model.pkl', mode='rb') as model_file:
    loaded_model = pickle.load(model_file)

In [47]:
# Display the frame; the recorded output is abbreviated to its leading and trailing rows.
df

Unnamed: 0,Year,Course,Participants,Sessions_Conducted,Course_encoded
0,2081,Tailoring,135,1,4
1,2081,Tailoring,347,4,4
2,2081,Tailoring,276,3,4
3,2081,Tailoring,344,4,4
4,2081,Tailoring,266,3,4
...,...,...,...,...,...
355,2082,Welding,269,3,5
356,2082,Welding,253,3,5
357,2082,Welding,259,3,5
358,2082,Welding,124,1,5


In [50]:
# Predict participants for one hypothetical course offering.
#
# The course code is looked up from the fitted encoder rather than typed by
# hand, so the code fed to the model and the course name printed below cannot
# drift apart.
course_name = 'Welding'
course_code = int(le.transform([course_name])[0])
new_data = [[2082, course_code, 3]]

# Wrap the row in a DataFrame carrying the training column names and order;
# passing a bare list makes scikit-learn warn that feature names are missing.
new_features = pd.DataFrame(
    new_data,
    columns=['Year', 'Course_encoded', 'Sessions_Conducted'],
)

# ravel() copes with both (n,) and (n, 1) prediction shapes. The estimate is
# rounded to the nearest whole participant instead of being truncated.
predicted_participants = round(float(np.ravel(loaded_model.predict(new_features))[0]))

print(f"For year {new_data[0][0]}, {course_name}, {new_data[0][2]} sessions provided, participants predicted:", predicted_participants)

For year 2082, Beautician, 3 sessions provided, participants predicted: 259


