In [117]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
plt.rcParams.update({
    "figure.figsize" : (16,8),
    "axes.grid" : True
})

plt.style.use('dark_background')

from sklearn.model_selection import train_test_split
import sklearn.preprocessing as ppUtil
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

In [26]:
## prepare dataset

from sklearn.datasets import load_breast_cancer

dataset = load_breast_cancer()
xDF = pd.DataFrame(dataset.data,columns=dataset.feature_names)
yDF = pd.DataFrame(dataset.target,columns=['target'])

xTrain,xTest,yTrain,yTest = train_test_split(xDF.values,yDF.values,random_state = 42)

In [32]:
## Nearest Neighbors Algorithm
from sklearn.neighbors import KNeighborsClassifier

knnClassifier = KNeighborsClassifier()

knnModel = knnClassifier.fit(xTrain,yTrain)

print(f"Training Accuracy : {knnModel.score(xTrain,yTrain)}")
print(f"Testing Accuracy : {knnModel.score(xTest,yTest)}")

Training Accuracy : 0.9342723004694836
Testing Accuracy : 0.965034965034965
  knnModel = knnClassifier.fit(xTrain,yTrain)


In [49]:
## Decision Tree
from sklearn.tree import DecisionTreeClassifier

## Fine Tuning to overcome overfitting --> max_depth = 2
dtClassifier = DecisionTreeClassifier(max_depth = 2)

dtModel = dtClassifier.fit(xTrain,yTrain)

print(f"Training Accuracy : {dtModel.score(xTrain,yTrain)}")
print(f"Testing Accuracy : {dtModel.score(xTest,yTest)}")

Training Accuracy : 0.9460093896713615
Testing Accuracy : 0.916083916083916


### Ensemble Methods

Ensemble methods combine predictions of other learning algorithms, to improve the generalization.

Ensemble methods are two types:

- Averaging Methods: They build several base estimators independently and finally average their predictions.
        E.g.: Bagging Methods, Forests of randomised trees
- Boosting Methods: They build base estimators sequentially and try to reduce the bias of the combined estimator.
        E.g.: Adaboost, Gradient Tree Boosting



In [59]:
from sklearn.ensemble import RandomForestClassifier 

rfClassifier = RandomForestClassifier()

rfModel = rfClassifier.fit(xTrain,yTrain)

print(f"Training Accuracy : {rfModel.score(xTrain,yTrain)}")
print(f"Testing Accuracy : {rfModel.score(xTest,yTest)}")

  rfModel = rfClassifier.fit(xTrain,yTrain)
Training Accuracy : 1.0
Testing Accuracy : 0.972027972027972


### SVM

Support Vector Machines (SVMs) separates data points based on decision planes, which separates objects belonging to different classes in a higher dimensional space.

    SVM algorithm uses the best suitable kernel, which is capable of separating data points into two or more classes.

    Commonly used kernels are:
- linear
- polynomial
- rbf
- sigmoid



In [129]:
from sklearn.svm import SVC

svmClassifier = make_pipeline(ppUtil.StandardScaler(),SVC(gamma="auto",verbose=2))

svmModel = svmClassifier.fit(xTrain,yTrain)


print(f"Training Accuracy : {svmModel.score(xTrain,yTrain)}")
print(f"Testing Accuracy : {svmModel.score(xTest,yTest)}")

[LibSVM]Training Accuracy : 0.9882629107981221
Testing Accuracy : 0.972027972027972
  return f(**kwargs)


In [140]:
svmModel[1].support_vectors_

array([[ 0.32780248,  0.75989417,  0.28979245, ...,  0.15456812,
         0.48294107,  0.8185546 ],
       [ 0.27681948,  0.66760365,  0.22141192, ...,  0.62696257,
        -0.3300075 , -0.26286225],
       [ 2.30764239,  0.1114941 ,  2.50927638, ...,  2.77138471,
         1.91373055,  0.82823553],
       ...,
       [-0.27832876,  0.36470143, -0.24324619, ...,  0.08464131,
        -0.51210798, -0.09145284],
       [-1.48266056, -1.07881697, -1.36328643, ..., -1.00683848,
        -1.0145102 ,  1.42674481],
       [-0.70318711, -0.20560666, -0.68854354, ...,  0.14213669,
        -0.11701498,  0.43416472]])

### Classification Report

Need to provide both prediction and test data

In [119]:

yPred = svmModel.predict(xTest)
print(classification_report(yTest, yPred))


              precision    recall  f1-score   support

           0       0.96      0.96      0.96        54
           1       0.98      0.98      0.98        89

    accuracy                           0.97       143
   macro avg       0.97      0.97      0.97       143
weighted avg       0.97      0.97      0.97       143

