# support vector machines using sklearn (for penguins' dataset from the book - 01_Machine-Learning-for-Absolute-Beginners.pdf by Oliver Theobald

In [1]:
# support vector machines (SVM) was initially designed for predicting numeric and
# categorical outcomes as a double-barrel prediction technique. Today,
# though, SVM is mostly used as a classification technique for predicting categorical outcomes.

In [2]:
# Logistic Regression versus SVM

<img class="center" src="./data_files/svm_vs_logistic_regression.png"/>

In [6]:
# When a new data point is added to the scatter plot

<img class="center" src="./data_files/svm_new_data_point_added.png" />

In [8]:
# There is therefore a trade-off in SVM
# between a wide margin/more mistakes and a narrow margin/fewer
# mistakes . The higher goal of your model is to strike a balance between
# "not too strict" and "not too loose", and, by modifying the C
# hyperparameter, you can regulate to what extent the misclassified cases (on
# the wrong side of the margin) are ignored.

In [9]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

In [10]:
# Import dataset
df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv')

In [11]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [14]:
# Drop missing rows
df.dropna(axis = 0, how = 'any',subset = None, inplace = True)

In [15]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE


In [16]:
# Convert non-numeric data using one-hot encoding
df = pd.get_dummies(df, columns=['sex', 'island'])

In [17]:
df.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex_FEMALE,sex_MALE,island_Biscoe,island_Dream,island_Torgersen
0,Adelie,39.1,18.7,181.0,3750.0,False,True,False,False,True
1,Adelie,39.5,17.4,186.0,3800.0,True,False,False,False,True
2,Adelie,40.3,18.0,195.0,3250.0,True,False,False,False,True
4,Adelie,36.7,19.3,193.0,3450.0,True,False,False,False,True
5,Adelie,39.3,20.6,190.0,3650.0,False,True,False,False,True


In [18]:
# Scale independent variables by dropping the dependent variable (sex)
scaler = StandardScaler()
scaler.fit(df.drop('species',axis=1))
scaled_features = scaler.transform(df.drop('species',axis=1))

In [19]:
# Assign X and y variables
X = scaled_features
y = df['species']

In [20]:
# Split data into test and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

In [21]:
# Assign algorithm 
model = SVC()

In [25]:
# Fit algorithm to data
model.fit(X_train, y_train)

# Run algorithm on test data to make predictions
model_test = model.predict(X_test)

In [26]:
# Evaluate predictions
print(confusion_matrix(y_test, model_test)) 
print(classification_report(y_test, model_test))

[[44  1  0]
 [ 0 19  0]
 [ 0  0 36]]
              precision    recall  f1-score   support

      Adelie       1.00      0.98      0.99        45
   Chinstrap       0.95      1.00      0.97        19
      Gentoo       1.00      1.00      1.00        36

    accuracy                           0.99       100
   macro avg       0.98      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100



In [24]:
# Data point to predict
penguin = [
	39, #bill_length_mm
	18.5, #bill_depth_mm
	180, #flipper_length_mm 
	3750, #body_mass_g
	0, #island_Biscoe    
	0, #island_Dream
	1, #island_Torgersen    
	1, #sex_Male
	0, #sex_Female
]

# Make prediction
new_penguin = model.predict([penguin])
new_penguin

array(['Adelie'], dtype=object)