In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # for data visualization
import seaborn as sns # for statistical data visualization
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, hamming_loss
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

%matplotlib inline

import warnings

warnings.filterwarnings('ignore')

# Import Dataset
path = '/Users/vitthal/Documents/GitHub/marine-enhancement/Data/raw/boat_type_features.csv'




In [2]:
df = pd.read_csv(path)
print(df.shape)

(228, 15)


In [3]:
print(df.head())
# df.drop(['Unnamed: 0'], axis=1, inplace=True)
# df.drop(['Unnamed: 0.1'], axis=1, inplace=True)
df.drop(['mmsi'], axis=1, inplace=True)
print(df.head())

           mmsi  mean_speed  sd_speed  mean_course   sd_course  mean_dis_port  \
0  1.252340e+12    2.832586  4.649472   182.270427  118.752408   44072.984259   
1  5.145483e+12    1.249238  2.214813   220.598183  104.977158   78241.626524   
2  1.985955e+13    0.744803  2.029596   234.488085   86.170132   21762.805725   
3  3.491170e+13    1.538317  2.506843   146.193885  124.886371   60672.921862   
4  5.110130e+13    2.103552  2.662536   212.084623  101.589559   44856.501803   

   mean_dis_shore  area_covered  log_dis_shore  log_speed  log_dis_port  \
0    31216.217811    158.569940      -4.240884  -7.969092     -1.127925   
1    68632.563233     12.735326      -3.721242  -7.030998      9.336561   
2     2575.576851      0.141141      -9.169449 -10.490752      7.251667   
3    31330.991469      2.075710      -4.446348  -7.687318     10.099510   
4    11129.255970      5.257880      -2.348613  -6.106761      9.691107   

   mul_log_dis_shore_speed  mul_log_dis_port_speed  mul_dis_sh

In [4]:
# check for missing values in variables

df.isnull().sum()

mean_speed                 0
sd_speed                   0
mean_course                0
sd_course                  0
mean_dis_port              0
mean_dis_shore             0
area_covered               0
log_dis_shore              0
log_speed                  0
log_dis_port               0
mul_log_dis_shore_speed    0
mul_log_dis_port_speed     0
mul_dis_shore_speed        0
unique_id                  0
dtype: int64

In [5]:
#  Multi Output Classification

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df[['mean_speed', 'sd_speed', 'mean_course', 'sd_course' , 'mean_dis_port' , 'mean_dis_shore', 'area_covered' , 'log_dis_shore', 'log_speed', 'log_dis_port', 'mul_log_dis_shore_speed', 'mul_log_dis_port_speed', 'mul_dis_shore_speed' ]], df['unique_id'], test_size=0.2, random_state=42)

# Reshape y_train and y_test
# print(y_train)
# print(X_train)
# Reshape y_train and y_test
y_train = np.array(y_train).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)


# Train the model
base_classifier = LogisticRegression()
model = ClassifierChain(base_classifier, order='random', random_state=42)
model.fit(X_train, y_train)


In [6]:
# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate accuracy and hamming loss
accuracy = accuracy_score(y_test, y_pred)
hamming_loss = hamming_loss(y_test, y_pred)

print("Accuracy:", accuracy)
print("Hamming Loss:", hamming_loss)


Accuracy: 0.5434782608695652
Hamming Loss: 0.45652173913043476


In [16]:
# Training the model using PCA csv
from sklearn.metrics import hamming_loss
# Import Dataset
path = '/Users/vitthal/Documents/GitHub/marine-enhancement/Data/csv/boat_type_pca.csv'

# Read the csv file
df_pca = pd.read_csv(path)

# Training the model
X_train, X_test, y_train, y_test = train_test_split(
    df_pca[['PC1', 'PC2', 'PC3' ]], df_pca['unique_id'], test_size=0.2, random_state=42)

# Reshape y_train and y_test
y_train = np.array(y_train).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

# Train the model
base_classifier = LogisticRegression()
model = ClassifierChain(base_classifier, order='random', random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate accuracy and hamming loss
accuracy = accuracy_score(y_test, y_pred)
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Accuracy:", accuracy)
print("Hamming Loss:", hamming_loss_value)


Accuracy: 0.6304347826086957
Hamming Loss: 0.3695652173913043


In [5]:
# # Convert boolean values to numeric (0 for 'false', 1 for 'True')
# le = LabelEncoder()
# df[''] = le.fit_transform(df['defects'])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('geartype', axis=1), df['geartype'], test_size=0.2, random_state=42)

# Train an SVM classifier with OneVsRestClassifier for multi-label classification
clf = OneVsRestClassifier(SVC(kernel='linear'))
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)


In [None]:
# Print the classification report and accuracy score
print(classification_report(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))


In [7]:
X = df.drop(['speed'], axis=1)

y = df['geartype']

In [8]:
# split X and y into training and testing sets


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [9]:
X_train.shape, X_test.shape

((189, 5), (48, 5))

Feature Scaling

In [10]:
cols = X_train.columns

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)


In [12]:
X_train = pd.DataFrame(X_train, columns=[cols])


In [13]:
X_test = pd.DataFrame(X_test, columns=[cols])

In [14]:
X_train.describe()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,mmsi,geartype,area
count,189.0,189.0,189.0,189.0,189.0
mean,-1.104349e-16,0.0,2.7021300000000003e-17,-6.461615e-18,-2.6433880000000002e-18
std,1.002656,0.0,1.002656,1.002656,1.002656
min,-1.710529,0.0,-1.582126,-2.733576,-0.204132
25%,-0.866278,0.0,-0.8659454,-0.2133523,-0.2040382
50%,-0.00747057,0.0,-0.1288527,-0.2133523,-0.2021169
75%,0.8658929,0.0,0.8675692,0.6267224,-0.1684021
max,1.710144,0.0,1.794583,1.466797,9.350985


run svm with default hyperparameters

In [None]:
# import SVC classifier
from sklearn.svm import SVC
from sklearn import preprocessing

labenc = preprocessing.LabelEncoder()
encoded = labenc.fit_transform(y_train)


# import metrics to compute accuracy
from sklearn.metrics import accuracy_score


# instantiate classifier with default hyperparameters
svc=SVC() 


# fit classifier to training set
svc.fit(X_train,encoded)


# make predictions on test set
y_pred=svc.predict(X_test)

# print(y_test,y_pred)

# compute and print accuracy score
print('Model accuracy score with default hyperparameters: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

Model accuracy score with default hyperparameters: 0.9375


In [None]:
# instantiate classifier with rbf kernel and C=100
svc=SVC(C=100.0) 


# fit classifier to training set
svc.fit(X_train,y_train)


# make predictions on test set
y_pred=svc.predict(X_test)

# print(X_test,y_pred)


# compute and print accuracy score
print('Model accuracy score with rbf kernel and C=100.0 : {0:0.4f}'. format(accuracy_score(y_test, y_pred)))


Model accuracy score with rbf kernel and C=100.0 : 0.9583


In [None]:
# instantiate classifier with linear kernel and C=100.0
linear_svc100=SVC(kernel='linear', C=100.0) 


# fit classifier to training set
linear_svc100.fit(X_train, y_train)


# make predictions on test set
y_pred=linear_svc100.predict(X_test)


# compute and print accuracy score
print('Model accuracy score with linear kernel and C=100.0 : {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

Model accuracy score with linear kernel and C=100.0 : 1.0000


In [None]:
y_pred_train = linear_svc.predict(X_train)

y_pred_train

array([3., 4., 5., 3., 3., 2., 3., 3., 4., 4., 3., 5., 5., 3., 5., 3., 2.,
       3., 3., 0., 3., 3., 3., 3., 3., 2., 3., 5., 5., 2., 1., 0., 3., 2.,
       2., 5., 3., 3., 3., 3., 2., 4., 3., 3., 5., 3., 3., 3., 4., 3., 3.,
       3., 5., 5., 4., 2., 2., 2., 3., 5., 2., 3., 2., 4., 5., 3., 5., 3.,
       3., 0., 3., 3., 5., 2., 5., 1., 3., 3., 5., 5., 3., 4., 3., 3., 3.,
       5., 3., 0., 3., 3., 3., 2., 2., 3., 5., 3., 2., 3., 3., 3., 3., 5.,
       5., 1., 3., 3., 3., 5., 4., 5., 5., 4., 3., 5., 1., 3., 3., 2., 3.,
       3., 0., 3., 5., 2., 3., 3., 5., 2., 2., 3., 3., 4., 4., 3., 2., 2.,
       4., 3., 4., 3., 2., 5., 3., 4., 5., 5., 2., 4., 3., 5., 3., 2., 4.,
       3., 3., 5., 4., 3., 5., 3., 3., 5., 3., 3., 2., 4., 3., 5., 4., 5.,
       2., 5., 3., 3., 3., 5., 3., 3., 2., 2., 5., 1., 3., 5., 3., 5., 3.,
       2., 4.])

In [None]:
print('Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))

Training-set accuracy score: 1.0000


In [None]:
# print the scores on training and test set

print('Training set score: {:.4f}'.format(linear_svc.score(X_train, y_train)))

print('Test set score: {:.4f}'.format(linear_svc.score(X_test, y_test)))

Training set score: 1.0000
Test set score: 1.0000


In [None]:
# instantiate classifier with polynomial kernel and C=1.0
poly_svc=SVC(kernel='poly', C=1.0) 


# fit classifier to training set
poly_svc.fit(X_train,y_train)


# make predictions on test set
y_pred=poly_svc.predict(X_test)


# compute and print accuracy score
print('Model accuracy score with polynomial kernel and C=1.0 : {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

Model accuracy score with polynomial kernel and C=1.0 : 0.8750


In [None]:
# instantiate classifier with sigmoid kernel and C=1.0
sigmoid_svc=SVC(kernel='sigmoid', C=1.0) 


# fit classifier to training set
sigmoid_svc.fit(X_train,y_train)


# make predictions on test set
y_pred=sigmoid_svc.predict(X_test)


# compute and print accuracy score
print('Model accuracy score with sigmoid kernel and C=1.0 : {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

Model accuracy score with sigmoid kernel and C=1.0 : 0.9167


In [None]:
# instantiate classifier with sigmoid kernel and C=100.0
sigmoid_svc100=SVC(kernel='sigmoid', C=100.0) 


# fit classifier to training set
sigmoid_svc100.fit(X_train,y_train)


# make predictions on test set
y_pred=sigmoid_svc100.predict(X_test)


# compute and print accuracy score
print('Model accuracy score with sigmoid kernel and C=100.0 : {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

Model accuracy score with sigmoid kernel and C=100.0 : 0.8125


In [None]:
# Print the Confusion Matrix and slice it into four pieces

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred_test)

print('Confusion matrix\n\n', cm)

print('\nTrue Positives(TP) = ', cm[0,0])

print('\nTrue Negatives(TN) = ', cm[1,1])

print('\nFalse Positives(FP) = ', cm[0,1])

print('\nFalse Negatives(FN) = ', cm[1,0])


Confusion matrix

 [[ 4  0  0  0  0]
 [ 0  6  0  0  0]
 [ 0  0 23  0  0]
 [ 0  0  0  7  0]
 [ 0  0  0  0  8]]

True Positives(TP) =  4

True Negatives(TN) =  6

False Positives(FP) =  0

False Negatives(FN) =  0


In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00         4
         2.0       1.00      1.00      1.00         6
         3.0       1.00      1.00      1.00        23
         4.0       1.00      1.00      1.00         7
         5.0       1.00      1.00      1.00         8

    accuracy                           1.00        48
   macro avg       1.00      1.00      1.00        48
weighted avg       1.00      1.00      1.00        48



In [None]:
TP = cm[0,0]
TN = cm[1,1]
FP = cm[0,1]
FN = cm[1,0]

In [None]:
# print classification accuracy

classification_accuracy = (TP + TN) / float(TP + TN + FP + FN)

print('Classification accuracy : {0:0.4f}'.format(classification_accuracy))

Classification accuracy : 1.0000


Classification error

In [None]:
# print classification error

classification_error = (FP + FN) / float(TP + TN + FP + FN)

print('Classification error : {0:0.4f}'.format(classification_error))

Classification error : 0.0000


Precision

In [None]:
# print precision score

precision = TP / float(TP + FP)


print('Precision : {0:0.4f}'.format(precision))

Precision : 1.0000


Sensitivity

In [None]:
recall = TP / float(TP + FN)

print('Recall or Sensitivity : {0:0.4f}'.format(recall))

Recall or Sensitivity : 1.0000


In [None]:
true_positive_rate = TP / float(TP + FN)


print('True Positive Rate : {0:0.4f}'.format(true_positive_rate))

True Positive Rate : 1.0000


In [None]:
false_positive_rate = FP / float(FP + TN)


print('False Positive Rate : {0:0.4f}'.format(false_positive_rate))

False Positive Rate : 0.0000


In [None]:
specificity = TN / (TN + FP)

print('Specificity : {0:0.4f}'.format(specificity))

Specificity : 1.0000


ROC-AUC Curves

In [None]:
# plot ROC Curve

from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_test, y_pred_test)

plt.figure(figsize=(6,4))

plt.plot(fpr, tpr, linewidth=2)

plt.plot([0,1], [0,1], 'k--' )

plt.rcParams['font.size'] = 12

plt.title('ROC curve for Predicting a Pulsar Star classifier')

plt.xlabel('False Positive Rate (1 - Specificity)')

plt.ylabel('True Positive Rate (Sensitivity)')

plt.show()

ValueError: ignored

In [None]:
# compute ROC AUC

from sklearn.metrics import roc_auc_score

ROC_AUC = roc_auc_score(y_test, y_pred_test)

print('ROC AUC : {:.4f}'.format(ROC_AUC))

ValueError: ignored

In [None]:
from sklearn.model_selection import cross_val_score

Cross_validated_ROC_AUC = cross_val_score(linear_svc, X_train, y_train, cv=10, scoring='roc_auc').mean()

print('Cross validated ROC AUC : {:.4f}'.format(Cross_validated_ROC_AUC))

Cross validated ROC AUC : nan


Stratfied K-Fold Cross Validation With shuffle split

In [None]:
from sklearn.model_selection import KFold


kfold=KFold(n_splits=5, shuffle=True, random_state=0)


linear_svc=SVC(kernel='linear')


linear_scores = cross_val_score(linear_svc, X, y, cv=kfold)

In [None]:
print('Stratified cross-validation scores with linear kernel:\n\n{}'.format(linear_scores))

In [None]:
# print average cross-validation score with linear kernel

print('Average stratified cross-validation score with linear kernel:{:.4f}'.format(linear_scores.mean()))

In [None]:
rbf_svc=SVC(kernel='rbf')


rbf_scores = cross_val_score(rbf_svc, X, y, cv=kfold)

In [None]:
# print cross-validation scores with rbf kernel

print('Stratified Cross-validation scores with rbf kernel:\n\n{}'.format(rbf_scores))


In [None]:
# print average cross-validation score with rbf kernel

print('Average stratified cross-validation score with rbf kernel:{:.4f}'.format(rbf_scores.mean()))

HyperParameter Optimization using GridSearch CV


In [None]:
# import GridSearchCV
from sklearn.model_selection import GridSearchCV


# import SVC classifier
from sklearn.svm import SVC


# instantiate classifier with default hyperparameters with kernel=rbf, C=1.0 and gamma=auto
svc=SVC() 



# declare parameters for hyperparameter tuning
parameters = [ {'C':[1, 10, 100, 1000], 'kernel':['linear']},
               {'C':[1, 10, 100, 1000], 'kernel':['rbf'], 'gamma':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},
               {'C':[1, 10, 100, 1000], 'kernel':['poly'], 'degree': [2,3,4] ,'gamma':[0.01,0.02,0.03,0.04,0.05]} 
              ]




grid_search = GridSearchCV(estimator = svc,  
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 5,
                           verbose=0)


grid_search.fit(X_train, y_train)


In [None]:
# examine the best model


# best score achieved during the GridSearchCV
print('GridSearch CV best score : {:.4f}\n\n'.format(grid_search.best_score_))


# print parameters that give the best results
print('Parameters that give the best results :','\n\n', (grid_search.best_params_))


# print estimator that was chosen by the GridSearch
print('\n\nEstimator that was chosen by the search :','\n\n', (grid_search.best_estimator_))

In [None]:
# calculate GridSearch CV score on test set

print('GridSearch CV score on test set: {0:0.4f}'.format(grid_search.score(X_test, y_test)))