In [5]:
# basic import 
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

# model imports 
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor


In [6]:
# Load the raw dataset from a CSV file at a path relative to the notebook.
df = pd.read_csv("data/raw.csv")

In [8]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group D,master's degree,standard,none,62,70,75
1,female,group C,bachelor's degree,free/reduced,completed,66,83,83
2,female,group D,some college,free/reduced,none,79,89,86
3,male,group C,master's degree,free/reduced,none,61,67,66
4,male,group E,high school,standard,none,73,64,57


In [9]:
# Feature frame: every column except the target, math_score.
X = df.drop(columns=["math_score"])

In [10]:
# Display the feature frame (math_score has been dropped).
X

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group D,master's degree,standard,none,70,75
1,female,group C,bachelor's degree,free/reduced,completed,83,83
2,female,group D,some college,free/reduced,none,89,86
3,male,group C,master's degree,free/reduced,none,67,66
4,male,group E,high school,standard,none,64,57
...,...,...,...,...,...,...,...
995,female,group D,high school,free/reduced,completed,57,56
996,male,group E,associate's degree,standard,completed,56,53
997,female,group B,some college,free/reduced,none,81,76
998,female,group C,associate's degree,standard,none,77,74


In [11]:
# Target series: the math_score column of the loaded frame.
y = df['math_score']

In [13]:
# Peek at the first two target values.
y.head(2)

0    62
1    66
Name: math_score, dtype: int64

In [15]:
# Split the feature columns by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
numerical_features = [column for column in X.columns if X[column].dtype != 'O']
categorical_features = X.select_dtypes(include='object').columns
print(numerical_features)
print(categorical_features)

['reading_score', 'writing_score']
Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course'],
      dtype='object')


In [22]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer


# One transformer per column group: standard scaling for the numerical
# columns, one-hot encoding for the categorical ones.
numerical_transform = StandardScaler()
categorical_transform = OneHotEncoder()

# Apply each transformer to its own column group; the encoded categorical
# columns are listed first, then the scaled numerical columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("OHE", categorical_transform, categorical_features),
        ("standardscaler", numerical_transform, numerical_features),
    ]
)

In [23]:
# Transform a fresh feature frame taken from `df` rather than the current `X`:
# once this cell has run, `X` holds the transformed array, so feeding `X` back
# into the column-name-based transformer on a re-run would no longer work.
# NOTE(review): the transformer is fitted on every row before the train/test
# split performed later in the notebook, so test rows influence the fitted
# scaling statistics. Consider splitting first and fitting on the training
# rows only.
X = preprocessor.fit_transform(df.drop(columns=['math_score']))

In [24]:
# Check the dimensions of the transformed feature matrix.
X.shape

(1000, 19)

In [26]:
# Inspect the transformed feature matrix returned by the preprocessor.
X

array([[1.        , 0.        , 0.        , ..., 1.        , 0.05694554,
        0.45733301],
       [1.        , 0.        , 0.        , ..., 0.        , 0.94779033,
        0.98406266],
       [1.        , 0.        , 0.        , ..., 1.        , 1.35894946,
        1.18158627],
       ...,
       [1.        , 0.        , 0.        , ..., 1.        , 0.81073728,
        0.52317422],
       [1.        , 0.        , 0.        , ..., 1.        , 0.53663119,
        0.39149181],
       [1.        , 0.        , 0.        , ..., 0.        , 0.33105163,
        0.2598094 ]])

In [29]:
# Separate the data into train and test splits (30% of rows held out for test).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,  # fixed seed so the split is reproducible
)
print(X_train.shape, X_test.shape)

(700, 19) (300, 19)


In [46]:
# Create an evaluation function
def evaluate_models(true, predicted):
    """Compute regression error metrics for one set of predictions.

    Args:
        true: Ground-truth target values.
        predicted: Predicted values, aligned element-wise with ``true``.

    Returns:
        tuple: ``(mae, rmse, r2_square)`` - the mean absolute error, the root
        mean squared error, and the R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # MSE is only needed as the input to RMSE, so compute it once, inline.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [41]:
# Candidate regressors, keyed by the name printed in the evaluation report.
# The estimators that use randomness during fitting get a fixed seed (the same
# value used for the train/test split) so repeated runs give the same scores.
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "RandomForestRegressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(random_state=42),
    "SVR": SVR(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=42),
    "KNeighborsRegressor": KNeighborsRegressor(),
}

# Parallel result lists filled by the evaluation loop: model name and the
# model's R2 score on the test set.
model_list = []
r2_list = []

In [47]:
# Empty the result lists first so that re-running this cell does not append
# a second copy of every model to lists that were created in another cell.
model_list.clear()
r2_list.clear()

# Fit every candidate model, then report train and test metrics for it.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_models(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_models(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    # Only the test-set R2 is kept for comparing models afterwards.
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

LinearRegression
Model performance for Training set
- Root Mean Squared Error: 5.3389
- Mean Absolute Error: 4.2520
- R2 Score: 0.8788
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.4128
- Mean Absolute Error: 4.3004
- R2 Score: 0.8647


Lasso
Model performance for Training set
- Root Mean Squared Error: 6.5944
- Mean Absolute Error: 5.2185
- R2 Score: 0.8151
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6.0630
- Mean Absolute Error: 4.7691
- R2 Score: 0.8302


Ridge
Model performance for Training set
- Root Mean Squared Error: 5.3288
- Mean Absolute Error: 4.2438
- R2 Score: 0.8792
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.3735
- Mean Absolute Error: 4.2695
- R2 Score: 0.8666


RandomForestRegressor
Model performance for Training set
- Root Mean Squared Error: 2.3355
- Mean Absolute Error: 1.8458
- R2 Score: 0.9768
------------------------