In [1]:
# %install_ext https://raw.github.com/cpcloud/ipython-autotime/master/autotime.py
# %load_ext autotime

In [2]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Loading Data

In [3]:
arr2= np.load('data_array_non_formatted.npy', allow_pickle= True)
X= arr2[:,0:-1]
Y= arr2[:,-1]

In [4]:
### Split arrays or matrices into random train and test subsets - 20% for Test, 80% to train
X_train, X_test, y_train, y_test= train_test_split(X, Y, test_size=0.20, random_state=101)

print(X_train,y_train)
print(len(X_train),len(y_train),len(X_test),len(y_test))

[[4000000 6.292114 100.0 ... 0 0 1]
 [2500000 0.853938 64.0 ... 1 0 0]
 [1200000 2.017413 113.0 ... 1 0 0]
 ...
 [25000000 3.65507 95.0 ... 0 0 0]
 [5000000 5.839893 107.0 ... 0 0 0]
 [176199 0.7532270000000001 104.0 ... 0 0 0]] ['Average' 'Average' 'Good' ... 'Bad' 'Average' 'Good']
5253 5253 1314 1314


# Default SVM Classifier

In [5]:
model = SVC()

In [7]:
model.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [8]:
predictions = model.predict(X_test)

In [9]:
from sklearn.metrics import classification_report, confusion_matrix

In [10]:
print(confusion_matrix(y_test,predictions))

[[1045    0    0]
 [ 183    0    0]
 [  86    0    0]]


In [12]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

     Average       0.80      1.00      0.89      1045
         Bad       0.00      0.00      0.00       183
        Good       0.00      0.00      0.00        86

    accuracy                           0.80      1314
   macro avg       0.27      0.33      0.30      1314
weighted avg       0.63      0.80      0.70      1314



In [14]:
model.score (X_test, y_test)

0.7952815829528158

In [16]:
model.score (X_train, y_train)

0.791738054445079

In [18]:
f1_score(y_test,predictions,average='weighted')

0.7045945351298791

In [11]:
#everything is being classified into one class (average).
#this means the model needs to have its parameters adjusted

# Finding best C and best Kernel
## to modify hyperparams specific to that kernel

In [12]:
# C param is kernel agnostic
# C - controls cost of misclassification on the training data.
# large C -- low bias (penalize cost of wrong classify), high variance
# small C -- higher bias, lower variance

In [13]:
C_params = [0.1,1,10,100,1000]
kernel_list = ['rbf', 'linear', 'polynomial','sigmoid']

In [14]:
test_score = []
train_score = []
f1_scores = []

# Best C with RBF Kernel

In [15]:
%%time

for currentC in C_params: 
        model = SVC(C=currentC, kernel='rbf')
        model.fit(X_train, y_train) 
        test_score.append(model.score (X_test, y_test))
        train_score.append(model.score (X_train, y_train))
        y_pred =model.predict(X_test)
        f1_scores.append(f1_score(y_test,y_pred,average='weighted'))

CPU times: user 42.4 s, sys: 120 ms, total: 42.5 s
Wall time: 42.5 s


In [16]:
df = pd.DataFrame(list(zip(C_params,test_score,train_score,f1_scores)),columns = ['C','test_score','train_score','f1_scores'])

In [17]:
df.head()

Unnamed: 0,C,test_score,train_score,f1_scores
0,0.1,0.795282,0.791738,0.704595
1,1.0,0.795282,0.791738,0.704595
2,10.0,0.795282,0.791738,0.704595
3,100.0,0.792237,0.791928,0.70309
4,1000.0,0.792237,0.791928,0.70309


In [18]:
np.save("rbf",np.array(df))

# Best C with Linear Kernel

In [14]:
test_score = []
train_score = []
f1_scores = []

In [None]:
%%time

model = SVC(C=1, kernel='linear')
model.fit(X_train, y_train) 


In [None]:
# Not feasible to apply linear SVM to our data. Just one run takes over 40 minutes
# Default SVM with kernel="linear" scales badly in terms of complexity
# especially with multiclass data, scikit-learn will automatically use OneVsRest or OneVsAll approaches to do this

In [None]:
#SVC and NuSVC implement the “one-against-one” approach (Knerr et al., 1990) for multi- class classification. 
# If n_class is the number of classes, then n_class * (n_class - 1) / 2 classifiers are constructed and 
# each one trains data from two classes. To provide a consistent interface with other classifiers, 
# the decision_function_shape option allows to monotically transform the results of the “one-against-one” 
# classifiers to a decision function of shape

In [None]:
#On the other hand, LinearSVC implements “one-vs-the-rest” multi-class strategy, 
#thus training n_class models. If there are only two classes, only one model is trained:

In [5]:
from sklearn.svm import LinearSVC

In [6]:
linear_svm = LinearSVC()

In [19]:
linear_svm.fit(X_train, y_train) 




LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [20]:
#Lets increase the maximum number of iterations to the size of the dataset

In [55]:
len(X_train)

CPU times: user 10 µs, sys: 0 ns, total: 10 µs
Wall time: 16.7 µs


5253

In [24]:
linear_svm = LinearSVC(max_iter=5500)

In [25]:
linear_svm.fit(X_train, y_train) 



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=5500,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [26]:
# Dual: Select the algorithm to either solve the dual or primal optimization problem
# Prefer dual=False when n_samples > n_features.

In [29]:
linear_svm = LinearSVC(max_iter=5500, dual=False)

In [30]:
linear_svm.fit(X_train, y_train) #it works

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=5500,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [9]:
C_params = [0.1,1,10,100,1000]
test_score = []
train_score = []
f1_scores = []

In [10]:
%%time

for currentC in C_params: 
        model = LinearSVC(max_iter=5500, dual=False)
        model.fit(X_train, y_train) 
        test_score.append(model.score (X_test, y_test))
        train_score.append(model.score (X_train, y_train))
        y_pred =model.predict(X_test)
        f1_scores.append(f1_score(y_test,y_pred,average='weighted'))

CPU times: user 806 ms, sys: 12.5 ms, total: 819 ms
Wall time: 161 ms


In [11]:
df = pd.DataFrame(list(zip(C_params,test_score,train_score,f1_scores)),columns = ['C','test_score','train_score','f1_scores'])

In [12]:
df.head()

Unnamed: 0,C,test_score,train_score,f1_scores
0,0.1,0.795282,0.791738,0.704595
1,1.0,0.795282,0.791738,0.704595
2,10.0,0.795282,0.791738,0.704595
3,100.0,0.795282,0.791738,0.704595
4,1000.0,0.795282,0.791738,0.704595


In [13]:
np.save("linearSVC",np.array(df))

# Best C with Sigmoid Kernel

In [26]:
sigmoid_model = SVC(C=currentC, kernel='sigmoid')

In [27]:
sigmoid_model.fit(X_train, y_train)  #fast

SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='sigmoid',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [28]:
test_score = []
train_score = []
f1_scores = []

In [29]:
%%time

for currentC in C_params: 
    sigmoid_model = SVC(C=currentC, kernel='sigmoid')        
    sigmoid_model.fit(X_train, y_train) 
    test_score.append(sigmoid_model.score (X_test, y_test))
    train_score.append(sigmoid_model.score (X_train, y_train))
    y_pred =sigmoid_model.predict(X_test)
    f1_scores.append(f1_score(y_test,y_pred,average='weighted'))

CPU times: user 6.04 s, sys: 0 ns, total: 6.04 s
Wall time: 6.04 s


In [30]:
df = pd.DataFrame(list(zip(C_params,test_score,train_score,f1_scores)),columns = ['C','test_score','train_score','f1_scores'])

In [33]:
df.head()

Unnamed: 0,C,test_score,train_score,f1_scores
0,0.1,0.745814,0.731772,0.705217
1,1.0,0.723744,0.70988,0.696193
2,10.0,0.670472,0.66305,0.671251
3,100.0,0.667428,0.662288,0.669336
4,1000.0,0.667428,0.662288,0.669336


In [34]:
np.save("sigmoid",np.array(df))

# Best C with Poly Kernel

In [3]:
model = SVC(C=currentC, kernel='poly') 

NameError: name 'SVC' is not defined

In [63]:
%%time
model.fit(X_train, y_train) #one run takes 30+ mins !

CPU times: user 29min 11s, sys: 24 ms, total: 29min 11s
Wall time: 29min 11s


SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [41]:
model.score (X_test, y_test)

0.7952815829528158

In [42]:
model.score (X_train, y_train)

0.791738054445079

In [43]:
model.predict(X_test)

array(['Average', 'Average', 'Average', ..., 'Average', 'Average',
       'Average'], dtype=object)

In [44]:
f1_score(y_test,y_pred,average='weighted')

0.7045945351298791

In [21]:
#reset data from scaling
X_train, X_test, y_train, y_test= train_test_split(X, Y, test_size=0.20, random_state=101)

In [22]:
# running loop with poly and unscaled data -- this will take forever

In [23]:
%%time

C_params = [0.1,1,10,100,1000]

test_score = []
train_score = []
f1_scores = []

for currentC in C_params: 
        model = SVC(C=currentC, kernel='poly')        
        model.fit(X_train, y_train) 
        test_score.append(model.score (X_test, y_test))
        train_score.append(model.score (X_train, y_train))
        y_pred =model.predict(X_test)
        f1_scores.append(f1_score(y_test,y_pred,average='weighted'))
        
df = pd.DataFrame(list(zip(C_params,test_score,train_score,f1_scores)),columns = ['C','test_score','train_score','f1_scores'])

CPU times: user 1h 27min 26s, sys: 176 ms, total: 1h 27min 26s
Wall time: 1h 27min 27s


In [24]:
df.head()

Unnamed: 0,C,test_score,train_score,f1_scores
0,0.1,0.795282,0.791738,0.704595
1,1.0,0.795282,0.791738,0.704595
2,10.0,0.795282,0.791738,0.704595
3,100.0,0.795282,0.791738,0.704595
4,1000.0,0.795282,0.791738,0.704595


In [25]:
np.save("unscaledPolynomial",np.array(df))

# Ignore: Scaling data to improve performance of polynomial SVM

In [5]:
#lets try scaling the data to improve performance of polynomial kernel svm
from sklearn.preprocessing import MinMaxScaler
scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train)
X_train = scaling.transform(X_train)
X_test = scaling.transform(X_test) #X_test must be transformed as per fit of X_train

In [6]:
model = SVC(C=0.1, kernel='poly') 

In [7]:
%%time
model.fit(X_train, y_train) #takes 25 seconds only! much better performance

CPU times: user 441 ms, sys: 28.2 ms, total: 469 ms
Wall time: 468 ms


SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [8]:
#better performance because: 
# reducing the possible boundary space for each option in a way that makes the level of effort much less for your machine

In [10]:
%%time

C_params = [0.1,1,10,100,1000]

test_score = []
train_score = []
f1_scores = []

for currentC in C_params: 
        model = SVC(C=currentC, kernel='poly')        
        model.fit(X_train, y_train) 
        test_score.append(model.score (X_test, y_test))
        train_score.append(model.score (X_train, y_train))
        y_pred =model.predict(X_test)
        f1_scores.append(f1_score(y_test,y_pred,average='weighted'))
        
df = pd.DataFrame(list(zip(C_params,test_score,train_score,f1_scores)),columns = ['C','test_score','train_score','f1_scores'])

CPU times: user 43 s, sys: 89.9 ms, total: 43.1 s
Wall time: 43.1 s


In [11]:
df.head()

Unnamed: 0,C,test_score,train_score,f1_scores
0,0.1,0.795282,0.791738,0.704595
1,1.0,0.796043,0.791738,0.706398
2,10.0,0.802131,0.797259,0.736648
3,100.0,0.802131,0.797449,0.740047
4,1000.0,0.802892,0.800305,0.738726


In [13]:
np.save("minmaxScaledPolynomial",np.array(df))

In [None]:
#reset data from scaling
X_train, X_test, y_train, y_test= train_test_split(X, Y, test_size=0.20, random_state=101)

In [37]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler(quantile_range=(25, 75)).fit(X_train)
X_train = scaler.transform(X_train);
X_test = scaler.transform(X_test) #X_test must be transformed as per fit of X_train

In [None]:
%%time

C_params = [0.1,1,10,100,1000]

test_score = []
train_score = []
f1_scores = []
for currentC in C_params: 
        model = SVC(C=currentC, kernel='poly')        
        model.fit(X_train, y_train) 
        test_score.append(model.score (X_test, y_test))
        train_score.append(model.score (X_train, y_train))
        y_pred =model.predict(X_test)
        f1_scores.append(f1_score(y_test,y_pred,average='weighted'))
        
df = pd.DataFrame(list(zip(C_params,test_score,train_score,f1_scores)),columns = ['C','test_score','train_score','f1_scores'])

In [None]:
np.save("robustScaledPolynomial",np.array(df))

In [14]:
#reset data from scaling
X_train, X_test, y_train, y_test= train_test_split(X, Y, test_size=0.20, random_state=101)

In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train);
X_test = scaler.transform(X_test) #X_test must be transformed as per fit of X_train

In [16]:
%%time

C_params = [0.1,1,10,100,1000]

test_score = []
train_score = []
f1_scores = []

for currentC in C_params: 
        model = SVC(C=currentC, kernel='poly')        
        model.fit(X_train, y_train) 
        test_score.append(model.score (X_test, y_test))
        train_score.append(model.score (X_train, y_train))
        y_pred =model.predict(X_test)
        f1_scores.append(f1_score(y_test,y_pred,average='weighted'))
        
df = pd.DataFrame(list(zip(C_params,test_score,train_score,f1_scores)),columns = ['C','test_score','train_score','f1_scores'])

CPU times: user 15min 44s, sys: 40 ms, total: 15min 44s
Wall time: 15min 44s


In [17]:
df.head()

Unnamed: 0,C,test_score,train_score,f1_scores
0,0.1,0.797565,0.794594,0.709904
1,1.0,0.804414,0.801066,0.740678
2,10.0,0.805175,0.801637,0.742388
3,100.0,0.805936,0.802589,0.743184
4,1000.0,0.806697,0.802399,0.743696


In [18]:
np.save("standardScaledPolynomial",np.array(df))

In [None]:
#StandardScaler removes the mean and scales the data to unit variance. 
#However, the outliers have an influence when computing the 
#empirical mean and standard deviation which shrink the range of the feature values
#StandardScaler therefore cannot guarantee balanced feature scales in the presence of outliers.

#Standardization is a transformation that centers the data by removing the mean value of each feature 
#and then scale it by dividing (non-constant) features by their standard deviation. 
#After standardizing data the mean will be zero and the standard deviation one.

#Sklearn its main scaler, the StandardScaler, 
#uses a strict definition of standardization to standardize data. It purely centers the data

#MinMaxScaler rescales the data set such that all feature values are in the range [0, 1]
#As StandardScaler, MinMaxScaler is very sensitive to the presence of outliers.
#The MinMaxScaler transforms features by scaling each feature to a given range. 
#This range can be set by specifying the feature_range parameter (default at (0,1)). This scaler works better for cases where the distribution is not Gaussian or the standard deviation is very small. However, it is sensitive to outliers, so if there are outliers in the data, you might want to consider another scaler.

#Robust Scaler
#the centering and scaling statistics of this scaler are based on percentiles and are 
#therefore not influenced by a few number of very large marginal outliers. 
# Consequently, the resulting range of the transformed feature values is larger 
#than for the previous scalers and, more importantly, are approximately similar
#If your data contains many outliers, scaling using the mean and standard deviation of 
#the data is likely to not work very well. In these cases, you can use the RobustScaler. It removes the median and scales the data according to the quantile range

#Very often, it's enough to use StandardScaler to whiten the data,
#but sometimes it's better to consider the impact of noisy features on the global 
#trend and use RobustScaler to filter them out without the risk of conditioning the remaining features.

#https://towardsdatascience.com/preprocessing-with-sklearn-a-complete-and-comprehensive-guide-670cb98fcfb9


In [None]:
#Scaling and standardizing can help features arrive in more digestible form for these algorithms.

In [None]:
for currentC in C_params: 
        model = SVC(C=currentC, kernel='linear')
        model.fit(X_train, y_train) 
        test_score.append(model.score (X_test, y_test))
        train_score.append(model.score (X_train, y_train))
        y_pred =model.predict(X_test)
        f1_scores.append(f1_score(y_test,y_pred,average='weighted'))

In [None]:
for currentKernel in kernel_list:
    test_score = []
    train_score = []
    f1_scores = []
    for currentC in C_params: 
        model = SVC(C=currentC, kernel=currentKernel)
        model.fit(X_train, y_train) 
        test_score.append(model.score (X_test, y_test))
        train_score.append(model.score (X_train, y_train))
        y_pred =model.predict(X_test)
        f1_scores.append(f1_score(y_test,y_pred,average='weighted'))
    df = pd.DataFrame(list(zip(C_params,test_score,train_score,f1_scores)),columns = ['C','test_score','train_score','f1_scores'])
    
    

In [None]:
df.head()

In [None]:
#Plot train and test score as C increases
sns.lineplot(x=df['C'],y=df['test_score'], color="g")
ax2 = plt.twinx()
sns.lineplot(x= df['C'],y=df['train_score'], color="b", ax=ax2, legend='brief')
#Label for train_score is right
#label for test_score (BLUE) is left

In [None]:
sns.lineplot(x= df['C'],y=df['f1_scores'], color="g")
plt.show()

In [None]:
#Based on the f1_score, lets find the best value for c
print("Best Row based on f1_score")
maxF1Score = df['f1_scores'].idxmax()
print(df.iloc[maxF1Score])
best_min_samples_split = df['C'].values[maxF1Score]

# Ignore: Playing around with Gridsearch

In [None]:
param_grid = {
    'C': [0.1,1,10,100,1000],
    'gamma': [1,0.1,0.01,0.001,0.001],
    'kernel': ['rbf', 'linear', 'polynomial','sigmoid'],
    'degree': [2,3,4,5,6,7],
    'coef0':[0.0, 1.0,10.0,100.0,1000]
}

In [None]:


# gamma -- free param of rbf function (>0)
# small gamma -- gaussian of large variance
# large gamma -- high bias, low variance--support vector doesnt 
#have widespread influence

# coef0 for polynomial and simoid
# degree for polynomial
# gamma for rbf
# linear -- nothing

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
grid = GridSearchCV (SVC(), param_grid, refit=True, verbose =3 )

In [None]:
grid.fit(X_train, y_train)