In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import pickle
from sklearn.metrics import classification_report
%matplotlib inline

In [2]:
#In the Dyslexia notebook, it was found that SVM with GridSearch is the best fit for the given dataset.
#This model gives the most accurate predictions.
#In this notebook, we will create only a SVM with GridSearch model, which will then be used to make final predictions.

In [3]:
#Reading the dataset
data=pd.read_csv('Project_dataset.csv')
#Value to be predicted by the model.
y=data.Label
#Input taken by the model.
X=data.drop(['Label'],axis=1)
data.head()

Unnamed: 0,LV,M,Speed,Visual,Audio,Survey_Score,Label
0,1.0,1.0,1,1.0,1.0,1.0,3
1,1.0,1.0,1,1.0,1.0,0.95,3
2,1.0,1.0,1,1.0,1.0,0.9,3
3,1.0,1.0,1,1.0,1.0,0.85,3
4,0.85,1.0,1,1.0,1.0,1.0,3


In [4]:
#In the given data, the label is the indication for whether the person has dyslexia or not.
#Label = 1 means that there is a high chance that the person has dyslexia.
#Label = 2 means that there is a moderate chance that the person has dyslexia.
#Label = 3 means that there is a low chance that the person has dyslexia.

In [5]:
#Creating the test and train data sets for the given data.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.8,random_state=10)

In [6]:
#StandardScalar is used for preprocessing of data.
#'copy' is False, which means copies are avoid and inplace scaling is done instead.
sc=StandardScaler(copy=False)
sc.fit_transform(X_train)
sc.transform(X_test)

array([[ 0.41116739,  1.08991867,  1.03703491,  0.5971629 ,  0.83675148,
         0.81888678],
       [ 1.03453021, -0.40611619,  1.03703491,  0.5971629 ,  0.83675148,
        -0.3220837 ],
       [-1.50344699, -1.90215105, -0.96428769, -1.24124575,  0.83675148,
         0.49289521],
       ...,
       [-1.50344699, -0.40611619, -0.96428769, -0.32204142,  0.83675148,
        -0.3220837 ],
       [-1.50344699, -0.40611619, -0.96428769, -2.16045008,  0.83675148,
        -0.3220837 ],
       [-0.21219543, -0.40611619,  1.03703491,  0.5971629 ,  0.83675148,
         1.30787412]])

In [7]:
#options_parameters is a list of dictionaries to find the most suitable values of 'kernel', 'gamma' and 'C' for the given model.
options_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
#Creating SVM model with the most suitable parameters obtained by using GridSearch.
model = GridSearchCV(SVC(), options_parameters,scoring='f1_macro')
#Training the model.
model.fit(X_train, y_train)
#Making predictions using the model.
predictions = model.predict(X_test)
#Printing the values of 'C', 'gamma' and 'kernel' used in our model.
#These values provide the most accurate predictions for the given dataset.
print('Best parameters of SVM model are:')
print(model.best_params_)

Best parameters of SVM model are:
{'C': 1000, 'kernel': 'linear'}
