In [1]:
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn import svm
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [3]:
import pickle

In [4]:
data=pd.read_csv("Breast Cancer Data.csv")

In [284]:
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [285]:
data.shape

(569, 33)

In [286]:
# Drop the identifier column. The "Unnamed: 32" column is still present after this
# cell (shape stays at 32 columns) and is removed in a later cell.
df=data.drop("id",axis=1)


In [287]:
df.shape

(569, 32)

In [288]:
# Remove the trailing "Unnamed: 32" column (blank in the preview above).
# Re-assigning instead of inplace=True, together with errors="ignore",
# keeps this cell safe to run more than once.
df=df.drop(columns=["Unnamed: 32"],errors="ignore")

In [289]:
df.shape

(569, 31)

In [290]:
df["diagnosis"].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [291]:
# Store the diagnosis label as a pandas categorical.
df["diagnosis"]=df["diagnosis"].astype("category")

In [292]:
df.diagnosis

0      M
1      M
2      M
3      M
4      M
      ..
564    M
565    M
566    M
567    M
568    B
Name: diagnosis, Length: 569, dtype: category
Categories (2, object): [B, M]

In [293]:
df.shape

(569, 31)

In [294]:
# Separate the diagnosis target from the feature columns.
Y=df["diagnosis"]
X=df.drop(columns="diagnosis")
col=X.columns

In [295]:
col

Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [296]:
X.shape

(569, 30)

## Feature Engineering

In [297]:
X.isnull().sum()

radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

In [298]:
# Mean-normalise every feature: subtract the column mean, divide by the column range (max - min).
# NOTE(review): the statistics are taken over all rows of X, i.e. before FitModel makes its
# train/test split, so held-out rows influence the scaling — consider fitting on training rows only.
df_norm=(X-X.mean())/(X.max()-X.min())
# Re-attach the diagnosis column next to the scaled features.
df_norm=pd.concat([df_norm,Y],axis=1)

In [299]:
df_norm.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
0,0.182815,-0.301307,0.213053,0.146813,0.198968,0.531437,0.495081,0.487976,0.306758,0.335139,...,-0.222474,0.385173,0.279792,0.196998,0.399079,0.351207,0.518192,0.335156,0.229268,M
1,0.304923,-0.051392,0.282848,0.284671,-0.104905,-0.078833,-0.00445,0.105621,0.000193,-0.129057,...,-0.060427,0.25668,0.264308,-0.056585,-0.065649,-0.024432,0.245339,-0.029717,0.033282,M
2,0.263274,0.066295,0.262808,0.232497,0.119524,0.170416,0.254453,0.392549,0.129991,-0.059132,...,-0.003924,0.225304,0.203602,0.079452,0.165163,0.142341,0.441216,0.140399,0.023837,M
3,-0.128132,0.036874,-0.099434,-0.114014,0.416536,0.550761,0.357546,0.279726,0.396657,0.729621,...,0.021929,-0.041791,-0.076898,0.511335,0.593799,0.331239,0.491044,0.736693,0.584115,M
4,0.291671,-0.167388,0.298051,0.272369,0.035567,0.087292,0.255859,0.275253,-0.001323,-0.083564,...,-0.240065,0.22381,0.170669,0.033226,-0.047797,0.102086,0.164583,-0.105806,-0.047001,M


In [300]:
Y.head()

0    M
1    M
2    M
3    M
4    M
Name: diagnosis, dtype: category
Categories (2, object): [B, M]

In [301]:
# Split the normalised frame back into features and target.
Y_norm=df_norm["diagnosis"]
X_norm=df_norm.drop(columns="diagnosis")
col=X_norm.columns

In [302]:
le=LabelEncoder()

In [303]:
le.fit(Y_norm)

LabelEncoder()

In [304]:
# Replace the string labels with the integer codes learned by `le`
# (per the displayed outputs: B -> 0, M -> 1).
Y_norm=le.transform(Y_norm)

In [305]:
# Wrap the encoded labels in a one-column DataFrame.
# NOTE(review): passing this 2-D target to the estimators matches the `column_or_1d`
# warning visible in the SVC/XGBoost outputs; a 1-D Series would likely avoid it — confirm.
Y_norm=pd.DataFrame(Y_norm)
Y_norm.tail()

Unnamed: 0,0
564,1
565,1
566,1
567,1
568,0


In [306]:
X_norm.shape

(569, 30)

## Fitting the model

In [259]:
def FitModel(X,Y,algo_name,algorithm,gridSearchParams,cv):
    """Grid-search an estimator on an 80/20 split, report test metrics and pickle the result.

    Parameters
    ----------
    X : features, passed to ``train_test_split``.
    Y : target labels; a single-column 2-D target is flattened to 1-D first.
    algo_name : str
        Path of the file the fitted ``GridSearchCV`` object is pickled to.
    algorithm : estimator handed to ``GridSearchCV``.
    gridSearchParams : dict
        Parameter grid explored by ``GridSearchCV``.
    cv : cross-validation setting forwarded to ``GridSearchCV``.

    Prints the test-set predictions, the best parameters, the classification
    report, the accuracy and the confusion matrix. Returns ``None``.
    """
    from sklearn.model_selection import train_test_split

    # An (n, 1) target makes scikit-learn emit a column_or_1d conversion warning
    # on every fit; flatten it up front. Other shapes are passed through untouched.
    target=np.asarray(Y)
    if target.ndim==2 and target.shape[1]==1:
        Y=target.ravel()

    # Seeds the global NumPy RNG, which train_test_split draws from when no
    # random_state is given, so the split is the same on every call.
    np.random.seed(10)
    x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2)

    grid=GridSearchCV(
        estimator=algorithm,
        param_grid=gridSearchParams,
        cv=cv,scoring="accuracy",verbose=1,n_jobs=-1)

    grid_result=grid.fit(x_train,y_train)
    best_params=grid_result.best_params_
    pred=grid_result.predict(x_test)
    cm=confusion_matrix(y_test,pred)
    print(pred)

    # Context manager so the file is flushed and closed even if pickling fails.
    with open(algo_name,"wb") as model_file:
        pickle.dump(grid_result,model_file)

    print("Best Params:",best_params)
    print("Classification Report",classification_report(y_test,pred))
    print("Accuracy Score:"+ str(accuracy_score(y_test,pred)))
    print("Confusion Matrix:\n",cm)

## SVC ML Model

In [260]:
# SVC search space: 4 values of C x 7 values of gamma = 28 candidates, 5-fold CV,
# fitted on the normalised features and integer-encoded labels.
param={"C":[0.1,1,100,1000],
       "gamma":[0.0001,0.001,0.005,0.1,1,3,5]}
FitModel(X_norm,Y_norm,"SVC",SVC(),param,cv=5)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[1 0 0 1 0 1 0 0 1 1 0 0 1 0 1 1 0 0 0 0 1 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 0 0 0 1 0 1 1 0 0 0 0 1 1 0 0 0 0
 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 0 0 0 1 1 0 0 0
 0 1 1]
Best Params: {'C': 1, 'gamma': 1}
Classification Report               precision    recall  f1-score   support

           0       1.00      0.96      0.98        75
           1       0.93      1.00      0.96        39

    accuracy                           0.97       114
   macro avg       0.96      0.98      0.97       114
weighted avg       0.98      0.97      0.97       114

Accuracy Score:0.9736842105263158
Confusion Matrix:
 [[72  3]
 [ 0 39]]


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:    0.2s finished
  y = column_or_1d(y, warn=True)


## RandomForest

In [205]:
# Random forest: only the number of trees is searched (10-fold CV).
# Note this call uses the raw X and the string labels Y, not the normalised/encoded versions.
param={"n_estimators":[100,500,1000,2000]}
FitModel(X,Y,"RandomForest",RandomForestClassifier(),param,cv=10)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 4 candidates, totalling 40 fits
['M' 'B' 'B' 'M' 'B' 'B' 'B' 'B' 'M' 'M' 'B' 'B' 'M' 'B' 'M' 'M' 'B' 'B'
 'B' 'M' 'M' 'M' 'B' 'B' 'M' 'M' 'B' 'M' 'B' 'B' 'B' 'B' 'M' 'B' 'B' 'B'
 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'M' 'B' 'M' 'M' 'M' 'M' 'B' 'B' 'B' 'M' 'B'
 'B' 'M' 'M' 'B' 'B' 'B' 'M' 'B' 'M' 'B' 'B' 'B' 'B' 'B' 'M' 'M' 'B' 'B'
 'B' 'B' 'B' 'M' 'B' 'B' 'M' 'M' 'B' 'B' 'M' 'B' 'B' 'B' 'B' 'M' 'B' 'B'
 'B' 'B' 'M' 'B' 'B' 'B' 'M' 'B' 'M' 'B' 'M' 'M' 'B' 'B' 'B' 'B' 'M' 'M'
 'B' 'B' 'B' 'B' 'M' 'M']
Best Params: {'n_estimators': 100}
Classification Report               precision    recall  f1-score   support

           B       0.99      0.97      0.98        75
           M       0.95      0.97      0.96        39

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

Accuracy Score:0.9736842105263158
Confusion Matrix:
 [[73  2]
 [ 1 38]]


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   11.6s finished


## XGBoost

In [207]:
# XGBoost: search the number of boosting rounds with 5-fold CV on the normalised data;
# the fitted search object is pickled under the file name "XGBoost11".
param={"n_estimators":[100,500,1000,2000]}
FitModel(X_norm,Y_norm,"XGBoost11",XGBClassifier(),param,cv=5)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[1 0 0 1 0 0 0 0 1 1 0 0 1 0 1 1 0 0 0 1 1 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0
 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 0 0 0 1 1 0 0 0
 0 1 1]
Best Params: {'n_estimators': 500}
Classification Report               precision    recall  f1-score   support

           0       0.97      0.97      0.97        75
           1       0.95      0.95      0.95        39

    accuracy                           0.96       114
   macro avg       0.96      0.96      0.96       114
weighted avg       0.96      0.96      0.96       114

Accuracy Score:0.9649122807017544
Confusion Matrix:
 [[73  2]
 [ 2 37]]


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.9s finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


## Reloading the saved model

In [261]:
# Load the grid-search object that FitModel pickled under the name "SVC".
# pickle.load can execute arbitrary code, so only open files this notebook wrote itself.
with open("SVC","rb") as model_file:
    loaded_model=pickle.load(model_file)

In [262]:
# x_test only ever existed inside FitModel, so on a fresh kernel it is undefined here.
# Rebuild the same hold-out split: random_state=10 mirrors the np.random.seed(10)
# that FitModel sets immediately before its own train_test_split call.
x_train,x_test,y_train,y_test=train_test_split(X_norm,Y_norm,test_size=0.2,random_state=10)
pred1=loaded_model.predict(x_test)
pred1

array([1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1])

In [263]:
loaded_model.best_params_

{'C': 1, 'gamma': 1}

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from time import time

def train_svm(csv_path="Breast Cancer Data.csv"):
    """Train an SVC with probability estimates on the breast-cancer CSV.

    Parameters
    ----------
    csv_path : str, optional
        Location of the CSV file. It must contain the columns ``id``,
        ``Unnamed: 32`` and ``diagnosis``; every other column is used as a feature.

    Returns
    -------
    The fitted ``SVC`` classifier.

    Side effects
    ------------
    Sets the module-level globals ``X_test`` (scaled hold-out features),
    ``y_test`` (encoded hold-out labels) and ``sc`` (the fitted
    ``StandardScaler``), which ``test_svm`` and ``predict_svm`` read.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder, StandardScaler

    global X_test, y_test, sc

    data=pd.read_csv(csv_path)
    # Drop both non-feature columns in one step.
    df=data.drop(columns=["id","Unnamed: 32"])
    X=df.drop(columns="diagnosis")
    # Encode the string diagnosis labels as integers.
    y=LabelEncoder().fit_transform(df["diagnosis"])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

    # Fit the scaler on the training rows only, then apply it to the hold-out rows.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # random_state pins the internal shuffling used for the probability estimates,
    # so repeated runs give the same predict_proba output.
    clf = SVC(probability=True, random_state=0)
    clf.fit(X_train, y_train)

    return clf

def test_svm(clf):
    output = clf.predict(X_test)
    acc = accuracy_score(y_test, output) 
    print("The accuracy of testing data: ",acc)

def predict_svm(clf, inp):
    """Scale ``inp`` with the global scaler ``sc`` and return ``(predictions, probabilities)`` from ``clf``."""
    scaled = sc.transform(inp)
    return clf.predict(scaled), clf.predict_proba(scaled)