Support Vector Machine Model Optimization

In [None]:
import pandas as pd
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Load the training data from the CSV sitting next to the notebook.
df = pd.read_csv("math_train.csv")

# Target variable: the G3 column.
y = df["G3"]

# Feature matrix: drop the target together with G1 and G2.
# The loaded frame also carries an "Unnamed: 0" column whose previewed values
# are 0, 1, 2, ... -- it looks like a row index that was written into the CSV
# (NOTE(review): confirm against the file). Leaving it in X would feed a row
# counter to the model as a feature, so it is dropped here; errors="ignore"
# keeps the cell working if the CSV is later re-exported without that column.
X = (
    df.drop(["G1", "G2", "G3"], axis=1)
      .drop(columns=["Unnamed: 0"], errors="ignore")
)

# Preview goes last: only the final expression of a cell is rendered.
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,M,16,U,GT3,T,3,3,services,other,...,4,2,3,1,2,3,2,12,13,12
1,GP,M,16,U,GT3,T,2,3,other,other,...,5,3,3,1,1,3,0,13,14,14
2,GP,F,16,U,GT3,T,1,3,at_home,services,...,4,3,5,1,1,3,0,8,7,0
3,GP,F,16,U,GT3,T,4,3,teacher,health,...,3,4,4,2,4,4,2,10,9,9
4,GP,M,16,R,GT3,T,2,1,other,other,...,3,3,2,1,3,3,0,8,9,8


In [None]:
# Columns of X to expand into indicator (dummy) columns.
# Note that 'age' is in this list, so each distinct age value gets its own
# indicator column instead of being kept as a single numeric feature.
categorical_cols = [
    'school', 'sex', 'age', 'address',
    'famsize', 'Pstatus', 'Mjob', 'Fjob',
    'reason', 'guardian', 'schoolsup',
    'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'romantic',
]

# drop_first=True removes the first level of every encoded column, so a
# k-level column yields k-1 indicators. Columns of X not listed above are
# passed through unchanged.
X_encode = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
X_encode.head()

Unnamed: 0,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,...,guardian_mother,guardian_other,schoolsup_yes,famsup_yes,paid_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_yes
0,3,3,1,2,0,4,2,3,1,2,...,True,False,False,False,True,True,True,True,True,True
1,2,3,2,1,0,5,3,3,1,1,...,False,False,False,False,False,False,True,True,True,False
2,1,3,1,2,3,4,3,5,1,1,...,True,False,False,False,False,True,False,True,True,True
3,4,3,1,3,0,3,4,4,2,4,...,True,False,True,True,True,True,True,True,True,False
4,2,1,2,1,0,3,3,2,1,3,...,True,False,False,False,False,True,False,True,False,False


In [None]:
# Pipeline: standardise every feature, then fit a support vector regressor.
# Keeping the scaler inside the pipeline means it is re-fit on the training
# part of each cross-validation split rather than on the full data set.
svr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())])

# Exhaustive search over kernel type and regularisation strength C using
# 3-fold cross-validation. No `scoring` is passed, so candidates are ranked
# by the estimator's default score, which is R^2 for regressors.
svr_param_grid = {'svr__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
                  'svr__C': [0.01, 0.1, 1, 10, 100]}
svr_grid_search = GridSearchCV(svr_pipeline, svr_param_grid, cv=3)
svr_grid_search.fit(X_encode, y)

# Winning hyper-parameter combination.
best_svr = svr_grid_search.best_params_

# Mean cross-validated R^2 of the winning combination. This is the number
# that estimates performance on unseen data.
cv_r2 = svr_grid_search.best_score_

# best_estimator_ is the pipeline refit on ALL of X_encode with the winning
# parameters. Predicting on X_encode therefore scores the model on the very
# rows it was trained on: r2 and mse below are in-sample (training) metrics
# and will be optimistic compared with cv_r2.
best_model = svr_grid_search.best_estimator_
y_pred = best_model.predict(X_encode)
r2 = r2_score(y, y_pred)
mse = mean_squared_error(y, y_pred)

# Report the selected parameters, the in-sample fit, and the CV estimate.
print(f"Best fit SVR model is {best_svr}")
print(f"R^2 score: {r2}")
print(f"MSE: {mse}")
print(f"Cross-validated R^2 score (mean over CV folds): {cv_r2}")

Best fit SVR model is {'svr__C': 10, 'svr__kernel': 'rbf'}
R^2 score: 0.8459660846154745
MSE: 3.2354079178338724
