In [1]:
# Basic Import
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, RandomForestClassifier
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [2]:
df = pd.read_csv('data/stud.csv')

In [3]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
df['total_score'] = df.math_score + df.writing_score + df.reading_score
df['average_score'] = df.total_score / 3

In [5]:
X = df.drop(columns=['total_score'], axis=1)
y = df['total_score']

In [6]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude=['object']).columns
cat_features = X.select_dtypes(include=['object']).columns

In [7]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer([
    ("OneHotEncoder", oh_transformer, cat_features),
    ("StandardScaler", numeric_transformer, num_features),
])

In [8]:
X = preprocessor.fit_transform(X)

In [9]:
# Splitting data into train test split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [10]:
# Create an Evaluate function to give all metrics after the model Training

def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(true, predicted)
    return mae, mse, rmse, r2
    

In [11]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "XGBRegresor": XGBRegressor(),
    "CatBoostRegressor": CatBoostRegressor(),
    "AdaBoostRegressor": AdaBoostRegressor()
}



for model_name, model in models.items():
    print("Model: ", model_name)
    model.fit(X_train, y_train)
    
    # Make Predictions
    y_train_pred = model.predict(X_train)   
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    mae_train, mse_train, rmse_train, r2_train = evaluate_model(y_train,y_train_pred)
    mae_test, mse_test, rmse_test, r2_test = evaluate_model(y_test,y_test_pred)
    print('Metrics on Train Data\n')
    print(f'Model {model_name} - MAE: {mae_train} - MSE: {mse_train} - RMSE: {rmse_train} - R2: {r2_train}')
    print('\n')
    print('Metrics on Test Data\n')
    print(f'Model {model_name} - MAE: {mae_test} - MSE: {mse_test} - RMSE: {rmse_test} - R2: {r2_test}')
    print('\n'*4)
    print('='*80)
    

Model:  Linear Regression
Metrics on Train Data

Model Linear Regression - MAE: 2.5135449277513545e-14 - MSE: 1.2073989470950718e-27 - RMSE: 3.4747646641104656e-14 - R2: 1.0


Metrics on Test Data

Model Linear Regression - MAE: 2.4655832930875478e-14 - MSE: 1.2551092546428386e-27 - RMSE: 3.542752114730635e-14 - R2: 1.0





Model:  Lasso
Metrics on Train Data

Model Lasso - MAE: 0.8075024891061188 - MSE: 1.0181431180113025 - RMSE: 1.0090307814984152 - R2: 0.9994328035509327


Metrics on Test Data

Model Lasso - MAE: 0.8311744383271958 - MSE: 1.1145578149772182 - RMSE: 1.0557262026573075 - R2: 0.999422299105941





Model:  Ridge
Metrics on Train Data

Model Ridge - MAE: 0.014251319027234057 - MSE: 0.0003095530621786959 - RMSE: 0.01759412010242899 - R2: 0.9999998275513584


Metrics on Test Data

Model Ridge - MAE: 0.01468389341662423 - MSE: 0.0003733648904949883 - RMSE: 0.019322652263470155 - R2: 0.9999998064764087





Model:  K-Nearest Neighbors
Metrics on Train Data

Model K-Nearest

In [15]:
model = LinearRegression()
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
mae_test, mse_test, rmse_test, r2_test = evaluate_model(y_test,y_test_pred)
print(f'Model {model} - MAE: {mae_test} - MSE: {mse_test} - RMSE: {rmse_test} - R2: {r2_test}')
print(f'True Values: {y_test} Predicted Values: {y_test_pred}')

Model LinearRegression() - MAE: 2.4655832930875478e-14 - MSE: 1.2551092546428386e-27 - RMSE: 3.542752114730635e-14 - R2: 1.0
True Values: 521    261
737    192
740    225
660    224
411    245
      ... 
408    165
332    171
208    231
613    216
78     207
Name: total_score, Length: 200, dtype: int64 Predicted Values: [261. 192. 225. 224. 245. 234. 202. 177. 232. 146. 141.  78. 228. 175.
 251. 225. 147. 145. 160. 184. 220. 148. 188. 132. 238. 232. 216. 150.
 123. 167. 180. 201. 181. 219. 232. 153. 228. 223. 225.  27. 234. 193.
 196. 184. 252. 197. 213.  88. 259. 236. 216. 214. 236. 162. 215. 217.
 249. 164. 245. 267. 130. 242. 237. 185. 263. 221. 195. 164. 194. 266.
 193. 259. 169. 150. 180. 230. 269. 170. 179. 147. 184. 119. 234. 206.
 203. 193. 223. 204. 252.  70. 231. 289. 184. 213.  89. 207. 182. 195.
 220. 194. 223. 175. 177. 205. 209. 195. 138. 227. 229. 157. 117. 240.
 162. 168. 247. 172. 146. 144. 210. 235. 190. 191. 246. 220. 198. 201.
 228. 218. 144. 208. 228. 224. 142. 282