### Student Grade Predictions

#### Source

P. Cortez and A. Silva. Using Data Mining to Predict Secondary School Student Performance. In A. Brito and J. Teixeira Eds., Proceedings of 5th FUture BUsiness TEChnology Conference (FUBUTEC 2008) pp. 5-12, Porto, Portugal, April, 2008, EUROSIS, ISBN 978-9077381-39-7.

#### Dataset

The data set provides information about student achievement in two portuguese secondary school. 

The target variable are the final scores in mathematics ranging from 0 to 20.

The objective of this project is to build a model to predict the final scores of the students

##### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
pd.set_option('display.max_columns',500)

##### Importing Dataset

In [2]:
df_prep = pd.read_csv("D:\Desktop\DF_Prepr.csv",sep="\t") # Math Dataset

In [3]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_prep,test_size = 0.2, random_state = 42)
train = train.reset_index()
test = test.reset_index()

In [4]:
Xtrain = train.drop("G3", axis= 1)

ytrain = train["G3"]

Xtest = test.drop("G3", axis= 1)

ytest = test["G3"]

#### BaseLine - Linear Regression

In [5]:
from sklearn.linear_model import LinearRegression

In [6]:
lin_reg = LinearRegression()

In [7]:
lin_reg.fit(Xtrain,ytrain) #Training happens

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [8]:
from sklearn.model_selection import cross_val_score

In [9]:
scores = cross_val_score(lin_reg,Xtrain,ytrain, scoring = "neg_mean_squared_error", cv = 10)

In [10]:
np.sqrt(-scores)

array([4.77678908, 4.62457808, 3.13654741, 4.52838392, 4.53449856,
       4.66094426, 4.79453806, 4.64633312, 4.05524399, 3.83441766])

In [11]:
from sklearn.metrics import mean_squared_error

In [12]:
predictions = lin_reg.predict(Xtest).round()

In [13]:
lin_mse = mean_squared_error(ytest, predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

4.056562116192928

In [14]:
pd.DataFrame({"Actual": ytest, "Predicted": predictions}).head()

Unnamed: 0,Actual,Predicted
0,10,4.0
1,12,8.0
2,5,9.0
3,10,10.0
4,9,9.0


##### Lasso

In [15]:
lasso_params = {'alpha':[0.00001,0.0001,0.001,0.01,0.1,10,100,1000]}

In [16]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [17]:
lasso_reg = Lasso(normalize = True)

In [18]:
grid_search = GridSearchCV(lasso_reg,lasso_params,cv = 10,scoring = "neg_mean_squared_error")

In [19]:
grid_search.fit(Xtrain,ytrain)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0.1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [20]:
grid_search.best_params_

{'alpha': 0.01}

In [21]:
lasso_final = grid_search.best_estimator_

In [22]:
lass_pred = lasso_final.predict(Xtest)

In [23]:
np.sqrt(mean_squared_error(ytest,lass_pred))

4.149234622543421

##### Ridge

In [24]:
from sklearn.linear_model import Ridge

In [25]:
ridge_params = {'alpha':[0.01, 0.1, 10, 100, 1000]}

In [26]:
ridge_reg = Ridge()

In [27]:
grid_search2 = GridSearchCV(ridge_reg,ridge_params,cv = 5,scoring = "neg_mean_squared_error")

In [28]:
grid_search2.fit(Xtrain,ytrain)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': [0.01, 0.1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [29]:
grid_search2.best_params_

{'alpha': 10}

In [30]:
ridge_final = grid_search2.best_estimator_

In [31]:
ridge_pred = ridge_final.predict(Xtest)

In [32]:
np.sqrt(mean_squared_error(ytest,ridge_pred))

4.044529317909106

#### Elastic Net CV

In [33]:
from sklearn.linear_model import ElasticNetCV

In [34]:
E_reg = ElasticNetCV(cv=5,alphas=[
                       0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.], random_state=0)

In [35]:
E_reg.fit(Xtrain,ytrain)

ElasticNetCV(alphas=[0.0125, 0.025, 0.05, 0.125, 0.25, 0.5, 1.0, 2.0, 4.0],
       copy_X=True, cv=5, eps=0.001, fit_intercept=True, l1_ratio=0.5,
       max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
       positive=False, precompute='auto', random_state=0,
       selection='cyclic', tol=0.0001, verbose=0)

In [36]:
E_reg_pred = E_reg.predict(Xtest)

In [37]:
np.sqrt(mean_squared_error(ytest,E_reg_pred))

4.084531717094732

#### KNN Classifier

In [38]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

grid_params = {
    "n_neighbors" : [1,2,3,4,5,6,7,8,9,10],
}

In [39]:
gs = GridSearchCV(
KNeighborsRegressor(),
grid_params,
verbose = 1,
cv = 10,scoring='neg_mean_squared_error',
n_jobs = -1
)

In [40]:
gs_results = gs.fit(Xtrain,ytrain)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.5s finished


In [41]:
knn_best = gs_results.best_estimator_

In [42]:
knn_prep = knn_best.predict(Xtest)

In [43]:
np.sqrt(mean_squared_error(ytest,knn_prep))

4.267792243410098

In [44]:
gs_results.best_params_

{'n_neighbors': 10}

#### SVM

In [45]:
from sklearn import svm

In [46]:
Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas}

In [47]:
grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=10,scoring='neg_mean_squared_error')

In [48]:
grid_search.fit(Xtrain, ytrain)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [49]:
grid_search.best_params_

{'C': 0.001, 'gamma': 0.001}

In [50]:
svm_best = grid_search.best_estimator_

In [51]:
svm_prep = svm_best.predict(Xtest)

In [52]:
np.sqrt(mean_squared_error(ytest,svm_prep))

4.593611413994964

#### Random Forrest

In [53]:
from sklearn.ensemble import RandomForestRegressor

rnd_reg = RandomForestRegressor(n_estimators=10, max_leaf_nodes = 16)
rnd_reg.fit(Xtrain,ytrain)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=16,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [54]:
rnd_pred = rnd_reg.predict(Xtest)

In [55]:
np.sqrt(mean_squared_error(ytest,rnd_pred))

4.1091380643445286

In [56]:
param_grid = {
    'bootstrap': [True,False],
    'max_depth': [5,10,15,20,30,40,50,60,70,80,None],
    'max_features': [2,5,10],
    'n_estimators': [10,30,50,70,80,90]
}

In [57]:
grid_search = GridSearchCV(estimator = rnd_reg, param_grid = param_grid, 
                          cv = 10,scoring='neg_mean_squared_error', n_jobs = -1, verbose = 2)

In [58]:
grid_search.fit(Xtrain,ytrain)

Fitting 10 folds for each of 396 candidates, totalling 3960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 992 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 2616 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done 3960 out of 3960 | elapsed:   35.7s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=16,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'bootstrap': [True, False], 'max_depth': [5, 10, 15, 20, 30, 40, 50, 60, 70, 80, None], 'max_features': [2, 5, 10], 'n_estimators': [10, 30, 50, 70, 80, 90]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=2)

In [59]:
rf = grid_search.best_estimator_

In [60]:
prf = rf.predict(Xtest)

In [61]:
np.sqrt(mean_squared_error(ytest,prf))

4.095114295820432

In [62]:
rf

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=5,
           max_features=5, max_leaf_nodes=16, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=None, oob_score=False,
           random_state=None, verbose=0, warm_start=False)

#### AdaBoost

In [63]:
from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier

In [64]:
model = AdaBoostClassifier(n_estimators=3, random_state=101)

In [65]:
model.fit(Xtrain,ytrain)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=3, random_state=101)

In [66]:
adar = model.predict(Xtest)

In [67]:
np.sqrt(mean_squared_error(ytest,adar))

5.255497423132939