# SVM Model

# Library Imports

In [1]:
# import numpy and pandas libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
# set random seed to ensure that results are repeatable
np.random.seed(1)

# Data Load

In [2]:
ownership=pd.read_csv("C:/Users/mssur/Downloads/RidingMowers.csv")

In [3]:
ownership

Unnamed: 0,Income,Lot_Size,Ownership
0,60.0,18.4,Owner
1,85.5,16.8,Owner
2,64.8,21.6,Owner
3,61.5,20.8,Owner
4,87.0,23.6,Owner
5,110.1,19.2,Owner
6,108.0,17.6,Owner
7,82.8,22.4,Owner
8,69.0,20.0,Owner
9,93.0,20.8,Owner


In [4]:
# look at the data
ownership.head() # note that we don't want to dump all the data to the screen

Unnamed: 0,Income,Lot_Size,Ownership
0,60.0,18.4,Owner
1,85.5,16.8,Owner
2,64.8,21.6,Owner
3,61.5,20.8,Owner
4,87.0,23.6,Owner


In [5]:
# generate a statistical summary of the numeric value in the data
ownership.describe()

Unnamed: 0,Income,Lot_Size
count,24.0,24.0
mean,68.4375,18.95
std,19.793144,2.428275
min,33.0,14.0
25%,52.35,17.5
50%,64.8,19.0
75%,83.1,20.8
max,110.1,23.6


In [6]:
# Check the missing values by summing the total na's for each variable
ownership.isna().sum()

Income       0
Lot_Size     0
Ownership    0
dtype: int64

In [7]:
ownership['Ownership'].unique

<bound method Series.unique of 0        Owner
1        Owner
2        Owner
3        Owner
4        Owner
5        Owner
6        Owner
7        Owner
8        Owner
9        Owner
10       Owner
11       Owner
12    Nonowner
13    Nonowner
14    Nonowner
15    Nonowner
16    Nonowner
17    Nonowner
18    Nonowner
19    Nonowner
20    Nonowner
21    Nonowner
22    Nonowner
23    Nonowner
Name: Ownership, dtype: object>

# Categorical variable encoding

In [8]:
data_dummy = pd.get_dummies(ownership['Ownership'], prefix='Ownership', drop_first=True)
ownership = ownership.join(data_dummy)
ownership.drop('Ownership', axis=1, inplace = True)

In [9]:
ownership.head()

Unnamed: 0,Income,Lot_Size,Ownership_Owner
0,60.0,18.4,1
1,85.5,16.8,1
2,64.8,21.6,1
3,61.5,20.8,1
4,87.0,23.6,1


In [10]:
ownership['Ownership_Owner'].unique

<bound method Series.unique of 0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
Name: Ownership_Owner, dtype: uint8>

# Train data Test data separation

In [11]:
X = ownership.loc[:,['Income','Lot_Size']]
y = ownership.loc[:,['Ownership_Owner']]

In [12]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3333)

# Data Frame for storing metrics

In [13]:
df_metrics = pd.DataFrame({"model": [],  "Precision": [], "Accuracy": [], "F1": [],"Recall": []})

# SVM classification 

In [14]:
linear_svm = SVC(kernel="linear",probability=True)
svm_out1 = linear_svm.fit(X_train, np.ravel(y_train))

In [15]:
svclin = linear_svm.predict(X_test)
cm = confusion_matrix(y_test, svclin)
TP = cm[1][1]
TN = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
df_metrics = pd.concat([df_metrics, pd.DataFrame({'model':"linear svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

In [16]:
ownership["predicted"]=svm_out1.predict(X)
ownership

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted
0,60.0,18.4,1,0
1,85.5,16.8,1,1
2,64.8,21.6,1,1
3,61.5,20.8,1,1
4,87.0,23.6,1,1
5,110.1,19.2,1,1
6,108.0,17.6,1,1
7,82.8,22.4,1,1
8,69.0,20.0,1,1
9,93.0,20.8,1,1


In [17]:
ownership['pred_prob'] = svm_out1.predict_proba(X)[:,1]
ownership

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.477774
1,85.5,16.8,1,1,0.643367
2,64.8,21.6,1,1,0.643499
3,61.5,20.8,1,1,0.583284
4,87.0,23.6,1,1,0.842604
5,110.1,19.2,1,1,0.861159
6,108.0,17.6,1,1,0.81796
7,82.8,22.4,1,1,0.792842
8,69.0,20.0,1,1,0.619244
9,93.0,20.8,1,1,0.811532


# SVM model by using RBF

In [18]:
rbf_svm = SVC(kernel="rbf", C=10, gamma='scale',probability=True)
rbf_svc = rbf_svm.fit(X_train, np.ravel(y_train))

In [19]:
rbf_svc_out = rbf_svm.predict(X_test)
cm = confusion_matrix(y_test, rbf_svc_out)
TP = cm[1][1]
TN = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
df_metrics = pd.concat([df_metrics, pd.DataFrame({'model':"rbf svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

In [20]:
df_metrics

Unnamed: 0,model,Precision,Accuracy,F1,Recall
0,linear svm,1.0,1.0,1.0,1.0
0,rbf svm,0.666667,0.75,0.666667,0.666667


In [21]:
ownership["predicted"]=rbf_svc.predict(X)
ownership

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.477774
1,85.5,16.8,1,1,0.643367
2,64.8,21.6,1,1,0.643499
3,61.5,20.8,1,1,0.583284
4,87.0,23.6,1,1,0.842604
5,110.1,19.2,1,1,0.861159
6,108.0,17.6,1,1,0.81796
7,82.8,22.4,1,1,0.792842
8,69.0,20.0,1,1,0.619244
9,93.0,20.8,1,1,0.811532


In [22]:
ownership['pred_prob'] = rbf_svc.predict_proba(X)[:,1]
ownership

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.382182
1,85.5,16.8,1,1,0.738629
2,64.8,21.6,1,1,0.567431
3,61.5,20.8,1,1,0.476066
4,87.0,23.6,1,1,0.83206
5,110.1,19.2,1,1,0.759051
6,108.0,17.6,1,1,0.75904
7,82.8,22.4,1,1,0.804546
8,69.0,20.0,1,1,0.60839
9,93.0,20.8,1,1,0.813595


# SVM model by using polynomial kernal

In [23]:
poly_svm = SVC(kernel="poly", degree=3, coef0=1.0, C=5,probability=True)
poly_svm_svc= poly_svm.fit(X_train, np.ravel(y_train))

In [24]:
svc_poly_out = poly_svm.predict(X_test)
cm = confusion_matrix(y_test, svc_poly_out)
TP = cm[1][1]
TN = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
df_metrics = pd.concat([df_metrics, pd.DataFrame({'model':"poly svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

In [25]:
df_metrics

Unnamed: 0,model,Precision,Accuracy,F1,Recall
0,linear svm,1.0,1.0,1.0,1.0
0,rbf svm,0.666667,0.75,0.666667,0.666667
0,poly svm,1.0,0.875,0.8,0.666667


In [26]:
ownership["predicted"]=poly_svm_svc.predict(X)
ownership

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.382182
1,85.5,16.8,1,1,0.738629
2,64.8,21.6,1,1,0.567431
3,61.5,20.8,1,1,0.476066
4,87.0,23.6,1,1,0.83206
5,110.1,19.2,1,1,0.759051
6,108.0,17.6,1,1,0.75904
7,82.8,22.4,1,1,0.804546
8,69.0,20.0,1,1,0.60839
9,93.0,20.8,1,1,0.813595


In [27]:
ownership['pred_prob'] = poly_svm_svc.predict_proba(X)[:,1]
ownership

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predicted,pred_prob
0,60.0,18.4,1,0,0.535093
1,85.5,16.8,1,1,0.584568
2,64.8,21.6,1,1,0.625607
3,61.5,20.8,1,1,0.590145
4,87.0,23.6,1,1,0.768301
5,110.1,19.2,1,1,0.682744
6,108.0,17.6,1,1,0.625611
7,82.8,22.4,1,1,0.724824
8,69.0,20.0,1,1,0.611721
9,93.0,20.8,1,1,0.715259


In [28]:
df_metrics

Unnamed: 0,model,Precision,Accuracy,F1,Recall
0,linear svm,1.0,1.0,1.0,1.0
0,rbf svm,0.666667,0.75,0.666667,0.666667
0,poly svm,1.0,0.875,0.8,0.666667


In [29]:
df_metrics.sort_values(by=['Accuracy'])

Unnamed: 0,model,Precision,Accuracy,F1,Recall
0,rbf svm,0.666667,0.75,0.666667,0.666667
0,poly svm,1.0,0.875,0.8,0.666667
0,linear svm,1.0,1.0,1.0,1.0


In [30]:
df_metrics.sort_values(by=['Precision'])

Unnamed: 0,model,Precision,Accuracy,F1,Recall
0,rbf svm,0.666667,0.75,0.666667,0.666667
0,linear svm,1.0,1.0,1.0,1.0
0,poly svm,1.0,0.875,0.8,0.666667


In [31]:
df_metrics.sort_values(by=['Recall'])

Unnamed: 0,model,Precision,Accuracy,F1,Recall
0,rbf svm,0.666667,0.75,0.666667,0.666667
0,poly svm,1.0,0.875,0.8,0.666667
0,linear svm,1.0,1.0,1.0,1.0


In [32]:
df_metrics.sort_values(by=['F1'])

Unnamed: 0,model,Precision,Accuracy,F1,Recall
0,rbf svm,0.666667,0.75,0.666667,0.666667
0,poly svm,1.0,0.875,0.8,0.666667
0,linear svm,1.0,1.0,1.0,1.0


# Inference:

1)From the above results it can be clearly visible that linear svm model is overfitting as we have all the metrics values which is 1.0
2)Over training data set is having 24 samples in which 12 are owner and 12 are non owner 
3)Since data is perefectly balanced as per my knowledge accuracy will be the best metric
4)By considering the accuracy for each model polysvm will be the best model 
5)The input variables income and lot size are having same range of values so I didn't consider the standarization

In [39]:
import pickle

# save model
pickle.dump(poly_svm_svc, open("E:/Spring-23/DSP/week3/finalout.pkl", "wb"))

# If you wish to load this model later, simply use pickle.load method
#loaded_model = pickle.load(open('logistic_model_example01.pkl', "rb"))