In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
import pickle


Import the CSV Data as Pandas DataFrame

In [2]:
df = pd.read_csv("data/cleaned.csv")

In [3]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,total score,average
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333


droping total score columns, As was showing multicollinearity with average

In [4]:
df.drop(columns=["total score"], inplace=True)

In [5]:
X = df.drop(columns=["average"], axis=1)
y = df["average"]

In [6]:
numeric_features = [feature for feature in X.columns if X[feature].dtype != "O"]
numeric_features

['math score', 'reading score', 'writing score']

In [7]:
# ohe = one hot encoding
# using categorical columns
ohe_columns = [feature for feature in X.columns if X[feature].dtype == "O"]
ohe_columns

['gender',
 'race/ethnicity',
 'parental level of education',
 'lunch',
 'test preparation course']

In [8]:
scale = ColumnTransformer(transformers=[ ("scale", StandardScaler(), slice(0,11))], remainder="passthrough")

tnf = ColumnTransformer(transformers=[("ohe", OneHotEncoder(sparse_output=False, drop="first"), [0,1,2,3,4])], remainder="passthrough")


Using Linear Regression

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=198)
pipe = Pipeline([
            ("tnf", tnf),
            ("scale", scale),
            ("model", LinearRegression(fit_intercept=True))    
])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print("R2 SCORE", r2_score(y_test, y_pred))
print("MAE", mean_absolute_error(y_test, y_pred))

R2 SCORE 1.0
MAE 1.1404210908949608e-14


Using Ridge Regression

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=198)
pipe = Pipeline( [ ("tnf", tnf), ("scale", scale), ("model", Ridge(fit_intercept=True)) ])

pipe.fit(X_train, y_train)

# Make Predictions
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# Training set performance
model_train_r2score = r2_score(y_train, y_train_pred)
model_train_mae = mean_absolute_error(y_train, y_train_pred)

# Testing set Performance
model_test_r2score = r2_score(y_test, y_test_pred)
model_test_mae = mean_absolute_error(y_test, y_test_pred)

print("Model Performance for Training Set")
print(f"R2: {model_train_r2score:.4f}")
print(f"MAE : {model_test_mae:.4f}")

print("Model Performance for Testing Set")
print(f"R2: {model_test_r2score:.4f}")
print(f"MAE: {model_test_mae:.4f}")


Model Performance for Training Set
R2: 1.0000
MAE : 0.0001
Model Performance for Testing Set
R2: 1.0000
MAE: 0.0001


Using Lasso Regression

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.3, random_state=728)
pipe = Pipeline( [ ("tnf", tnf),  ("scale", scale), ("model", Lasso(fit_intercept=True))])

pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# Training Set Performance
model_train_r2score = r2_score(y_train, y_train_pred)
model_train_mae = mean_absolute_error(y_train, y_train_pred)

# Testing Set Performance
model_test_r2score = r2_score(y_test, y_test_pred)
model_test_mae = mean_absolute_error(y_test, y_test_pred)

print("Model Performance of Training Dataset")
print(f"R2: {model_train_r2score:.4f}")
print(f"MAE: {model_train_mae:.4f}")

print("Model Performance of Testing Dataset")
print(f"R2: {model_test_r2score:.4f}")
print(f"MAE: {model_test_mae:.4f}")

Model Performance of Training Dataset
R2: 1.0000
MAE: 0.0600
Model Performance of Testing Dataset
R2: 1.0000
MAE: 0.0601


Using Support Vector Regression

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.3, random_state=480)

pipe = Pipeline( [("tnf", tnf), ("scale", scale),
                  ("model", SVR(C=1, kernel="linear", gamma="auto", max_iter=5000))] )
pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

model_train_r2score = r2_score(y_train, y_train_pred)
model_train_mae = mean_absolute_error(y_train, y_train_pred)

model_test_r2score = r2_score(y_test, y_test_pred)
model_test_mae = mean_absolute_error(y_test, y_test_pred)

print("Model Performance for Training Set")
print(f"R2: {model_train_r2score:.4f}")
print(f"MAE: {model_train_mae:.4f}")

print("Model Performance for Testing Set")
print(f"R2: {model_test_r2score:.4f}")
print(f"MAE: {model_test_mae:.4f}")


Model Performance for Training Set
R2: 1.0000
MAE: 0.0363
Model Performance for Testing Set
R2: 1.0000
MAE: 0.0345


Using KNeighborsRegressor

In [19]:
X_train,X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=458)

pipe = Pipeline( [  ("tnf", tnf), ("scale", scale), ("model", KNeighborsRegressor())]) 

pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# Training Set Performance
model_train_r2score = r2_score(y_train, y_train_pred)
model_train_mae = mean_absolute_error(y_train, y_train_pred)

# Testing Set Performance
model_test_r2score = r2_score(y_test, y_test_pred)
model_test_mae = mean_absolute_error(y_test, y_test_pred)

print("Model Performance for Training Set")
print(f"R2: {model_train_r2score:.4f}")
print(f"MAE: {model_train_mae:.4f}")


print("Model Performance for Testing Set")
print(f"R2: {model_test_r2score:.4f}")
print(f"MAE: {model_test_mae:.4f}")

Model Performance for Training Set
R2: 0.9972
MAE: 0.5027
Model Performance for Testing Set
R2: 0.9861
MAE: 0.7432


Using DecisionTreeRegressor

In [22]:
X_train,X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=657)

pipe = Pipeline( [  ("tnf", tnf), ("scale", scale), ("model", DecisionTreeRegressor())]) 

pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# Training Set Performance
model_train_r2score = r2_score(y_train, y_train_pred)
model_train_mae = mean_absolute_error(y_train, y_train_pred)

# Testing Set Performance
model_test_r2score = r2_score(y_test, y_test_pred)
model_test_mae = mean_absolute_error(y_test, y_test_pred)

print("Model Performance for Training Set")
print(f"R2: {model_train_r2score:.4f}")
print(f"MAE: {model_train_mae:.4f}")


print("Model Performance for Testing Set")
print(f"R2: {model_test_r2score:.4f}")
print(f"MAE: {model_test_mae:.4f}")

Model Performance for Training Set
R2: 1.0000
MAE: 0.0000
Model Performance for Testing Set
R2: 0.9774
MAE: 1.5686


Using RandomForestRegressor

In [21]:
X_train,X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=256)

pipe = Pipeline( [  ("tnf", tnf), ("scale", scale), ("model", RandomForestRegressor())]) 

pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# Training Set Performance
model_train_r2score = r2_score(y_train, y_train_pred)
model_train_mae = mean_absolute_error(y_train, y_train_pred)

# Testing Set Performance
model_test_r2score = r2_score(y_test, y_test_pred)
model_test_mae = mean_absolute_error(y_test, y_test_pred)

print("Model Performance for Training Set")
print(f"R2: {model_train_r2score:.4f}")
print(f"MAE: {model_train_mae:.4f}")


print("Model Performance for Testing Set")
print(f"R2: {model_test_r2score:.4f}")
print(f"MAE: {model_test_mae:.4f}")

Model Performance for Training Set
R2: 0.9987
MAE: 0.3364
Model Performance for Testing Set
R2: 0.9835
MAE: 1.0271


In [26]:
import pickle
pickle.dump(pipe, open("models/model.pkl", "wb"))