# Student Performance Dataset

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport

from sklearn.metrics import *
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

### Data

In [None]:
# Load the raw CSV, then swap the spaces in the column names for underscores
# so the columns can be referenced with identifier-like names.
data = pd.read_csv("StudentsPerformance.csv")
data = data.rename(columns=lambda name: name.replace(' ', '_'))
data.head()

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Numeric Columns

# Names of every int64/float64 column in the raw frame. This is computed on
# the full frame, so it can include the target column as well.
numeric_cols = list(data.select_dtypes(include=["int64", "float64"]).columns)
numeric_cols

In [None]:
# Categorical Columns

# Names of every object-dtype column in the raw frame.
cat_cols = list(data.select_dtypes(include="object").columns)
cat_cols

In [None]:
# Categorical Values

# One list of distinct values per categorical column, in the same order as
# cat_cols.
cat_vals = [values.unique().tolist() for _, values in data[cat_cols].items()]
cat_vals

### Train - Test Split

In [None]:
# Separate the regression target (math_score) from the feature columns.
y = data["math_score"]
X = data.drop(columns="math_score")

# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

### EDA

In [None]:
# Re-attach the target to the training features (both share the same row
# index) so the profiling report covers features and target together.
training_data = X_train.join(y_train)

profile = ProfileReport(training_data, explorative=True)
profile

### Pre-Processing

In [None]:
# Handling Numeric Data

# Single-step pipeline: standard-scale whatever columns it is given.
numerical_pipe = Pipeline(steps=[('scaler', StandardScaler())])

In [None]:
# Handling Categorical Data

# The encoder's category lists are fixed up front from cat_vals rather than
# being learned from whatever data the pipeline is fitted on.
ohe = OneHotEncoder(categories=cat_vals)
categorical_pipe = Pipeline(steps=[('one_hot', ohe)])

In [None]:
# Defining the Pre-Processing Transformer

# numeric_cols was derived from the full frame, so it still lists the target
# column that was dropped from X. Keep only the columns that exist in the
# feature matrix; otherwise the ColumnTransformer fails at fit time because
# it is asked to select a column that is not there.
numeric_feature_cols = [col for col in numeric_cols if col in X_train.columns]

# Scale the numeric features, one-hot encode the categorical ones, and pass
# any remaining column through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numerical_pipe, numeric_feature_cols),
        ('categorical', categorical_pipe, cat_cols),
    ],
    remainder='passthrough',
)

### Model Building

In [None]:
# Initializing Models

# Only the plain linear-regression baseline is active; the other candidate
# estimators below are left commented out.
model1 = LinearRegression()
#model2 = Ridge()
#model3 = Lasso()
#model4 = RandomForestRegressor()
#model5 = KNeighborsRegressor()

In [None]:
# Initializing Hyperparameters

# Each grid is a dict whose 'model' entry lists the estimator(s) to place in
# the pipeline's 'model' step; 'model__<name>' entries would tune that
# estimator. The linear baseline has nothing to tune.
param1 = {'model': [model1]}

# Disabled grids for the disabled estimators:

# param2 = {}
# param2['model__alpha'] = [0,0.5,1,5,10]
# param2['model'] = [model2]

# param3 = {}
# param3['model__alpha'] = [0,0.5,1,5,10]
# param3['model'] = [model3]

# param4 = {}
# param4['model__n_estimators'] = [10,50]
# param4['model__max_depth'] = [5,10]
# param4['model'] = [model4]

# param5 = {}
# param5['model__n_neighbors'] = [2,5,10]
# param5['model'] = [model5]

In [None]:
# Combined Pipeline

# Pre-processing followed by an estimator. The 'model' step acts as a
# placeholder: each parameter grid replaces it through its 'model' key.
pipe = Pipeline([
    ('column_transformer', preprocessor),
    ('model', model1)
])

# Only grids that are actually defined may be listed here. param2-param5 are
# commented out in this notebook, so referencing them raised NameError.
# Append them again when the corresponding models and grids are re-enabled.
params = [param1]

In [None]:
# Defining a function to fit the pipeline and print evaluation metrics

def fit_and_print(input_pipeline,
                 X_train = X_train,
                 y_train = y_train,
                 X_test = X_test,
                 y_test = y_test):
    """Fit ``input_pipeline`` on the training split and print test-set metrics.

    The default arguments are bound to the notebook's train/test split at the
    moment this cell is executed; re-run the cell after re-splitting the data.

    Parameters:
        input_pipeline: estimator or pipeline exposing ``fit`` and ``predict``.
            It is fitted in place.
        X_train, y_train: data the pipeline is fitted on.
        X_test, y_test: held-out data the predictions are scored against.

    Returns:
        None. The metrics are printed only.
    """
    input_pipeline.fit(X_train, y_train)
    test_preds = input_pipeline.predict(X_test)

    # The target is continuous, so regression metrics are reported.
    # accuracy_score is a classification metric and rejects continuous
    # predictions.
    print("Testing MAE : " + str(mean_absolute_error(y_test, test_preds)))
    print("Testing R2 : " + str(r2_score(y_test, test_preds)))

In [None]:
# Show the assembled pipeline (pre-processing + model) as the cell output.
pipe

In [None]:
# List the parameter names the pipeline exposes; these are the names that can
# be used as keys in the grid-search dictionaries.
pipe.get_params().keys()

### Grid Search CV

In [None]:
# Mean absolute error is a loss: lower is better. greater_is_better=False
# makes the scorer negate it, so GridSearchCV (which always maximises the
# score) selects the candidate with the lowest error instead of the highest.
my_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# 3-fold cross-validated search over every grid in params, using all cores.
grid_search = GridSearchCV(pipe, params, cv=3, scoring=my_scorer, n_jobs=-1)
grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# Show the grid-search object as the cell output.
grid_search

In [None]:
# grid_search was already fitted in the cell that created it, so it is not
# fitted a second time here.
# Copy the winning configuration onto the pipeline. The original code read it
# from an undefined name `search`, which raised NameError. Then refit on the
# training split and report the hold-out metrics.
pipe.set_params(**grid_search.best_params_)
fit_and_print(pipe)

In [None]:
# Show the list of parameter grids that was passed to the grid search.
params

In [None]:
# Show the pipeline again, after set_params applied the best configuration.
pipe