#**Detection of Breast Cancer Using Machine Learning**

##Problem Statement

Given a list of features (i.e., a feature vector) computed from a digitized image of a fine needle aspirate (FNA) of a breast mass from a patient, the problem is to determine whether or not the patient has breast cancer.

##Import Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
from scipy.special import boxcox1p

# Lift pandas' display limits so DataFrames are shown with all columns and rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler,StandardScaler
from sklearn.model_selection import KFold, cross_val_score, ShuffleSplit,cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.svm import SVC

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostClassifier,ExtraTreesClassifier,BaggingClassifier,VotingClassifier
from sklearn.naive_bayes import GaussianNB 

import warnings
def ignore_warn(*args, **kwargs):
    """Accept any positional/keyword arguments and do nothing.

    Used below as a drop-in replacement for ``warnings.warn``.
    """
    return None


# Replace ``warnings.warn`` with the no-op above so calls made through it
# produce no output in the notebook.
warnings.warn = ignore_warn

In [None]:
# Mount Google Drive at /content/drive (Google Colab only).
from google.colab import drive
drive.mount('/content/drive')

##Import Dataset

In [None]:

# Load the dataset from the mounted Drive. Malformed rows are skipped rather
# than aborting the load. `on_bad_lines='skip'` is the replacement for
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in
# pandas 2.0.
df = pd.read_csv('/content/drive/MyDrive/data.csv', on_bad_lines='skip')

# Encode the target as an integer: 1 where the diagnosis is 'M', 0 otherwise.
df['diagnosis'] = (df['diagnosis'] == 'M').astype(int)

# Remove the two columns that are not used as features.
df.drop(['Unnamed: 32', 'id'], axis=1, inplace=True)

df.head()

##Getting information about features

In [None]:
# Print a concise summary of the DataFrame.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Descriptive statistics for the columns.
df.describe()

##Feature selection

In [None]:
# Correlation matrix of every column. Keep the columns whose absolute
# correlation with the target exceeds 0.7 and plot how they correlate with
# each other as an annotated heatmap.
corrmat = df.corr()
top_corr_features = corrmat.index[corrmat["diagnosis"].abs() > 0.7]
plt.figure(figsize=(10, 10))
g = sns.heatmap(
    df[top_corr_features].corr(),
    cmap="RdYlGn",
    annot=True,
)

In [None]:
# Features to inspect visually.
correlated_col_with_dignosis = [
    'radius_mean',
    'perimeter_mean',
    'area_mean',
    'concave points_mean',
    'radius_worst',
    'perimeter_worst',
    'area_worst',
    'concave points_worst',
]

# One figure per feature: overlaid, semi-transparent histograms of the
# feature for diagnosis 0 (labelled 'B') and diagnosis 1 (labelled 'M').
for col in correlated_col_with_dignosis:
    fig = plt.figure()
    plt.hist(df.loc[df['diagnosis'] == 0, col], bins=30, fc=(1, 0, 0, 0.5), label='B')
    plt.hist(df.loc[df['diagnosis'] == 1, col], bins=30, fc=(0, 1, 0, 0.5), label='M')
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.legend()
    plt.show()

In [None]:
# For every pair of distinct features whose mutual correlation is at least
# 0.90, schedule for removal the one that is less correlated with the target.
#
# Each unordered pair is visited exactly once and a feature is never paired
# with itself. The selection therefore does not depend on the diagonal of the
# correlation matrix being exactly 1.0: a diagonal rounded to e.g. 0.9999999
# would otherwise make a feature schedule itself for deletion. A pair
# correlated at exactly 1.0 loses one member instead of being skipped, and a
# tie in target correlation removes one feature rather than both.
feature_cols = [name for name in corrmat.columns if name != 'diagnosis']
target_corr = corrmat['diagnosis']

del_bin = set()
for pos, i in enumerate(feature_cols):
    for j in feature_cols[pos + 1:]:
        if corrmat.loc[i, j] >= 0.90:
            # Keep the feature with the higher target correlation; on a tie
            # keep `i`, the one that appears first.
            del_bin.add(j if target_corr[i] >= target_corr[j] else i)

del_bin = list(del_bin)

In [None]:
# Remove the redundant feature columns collected in `del_bin`, in place.
df.drop(columns=del_bin, inplace=True)

In [None]:
# Recompute the correlations on the reduced frame and plot the columns whose
# absolute correlation with the target exceeds 0.5.
corrmat1 = df.corr()
top_corr_features = corrmat1.index[corrmat1["diagnosis"].abs() > 0.5]
plt.figure(figsize=(10, 10))
g = sns.heatmap(
    df[top_corr_features].corr(),
    cmap="RdYlGn",
    annot=True,
)

In [None]:
# Box plot of each listed feature, grouped by diagnosis, one figure per feature.
correlated_col_with_dignosis_ = ['concave points_worst', 'perimeter_worst']
for col in correlated_col_with_dignosis_:
    f1 = plt.figure()
    # `x`/`y` are passed by keyword: seaborn >= 0.12 accepts only `data`
    # positionally, so the positional form raises a TypeError there.
    sns.boxplot(x=df['diagnosis'], y=df[col])

In [None]:
# Remove per-class outliers in place. Each statement builds one boolean mask
# over the current frame (class condition AND value condition) instead of
# applying a full-length mask to an already filtered frame, which relied on
# pandas silently reindexing the mask and emitted a warning.
# NOTE(review): the thresholds look like they were read off the box plots — confirm.
df.drop(
    df[(df['diagnosis'] == 0)
       & ((df['perimeter_worst'] > 125) | (df['perimeter_worst'] < 53))].index,
    inplace=True,
)
df.drop(
    df[(df['diagnosis'] == 1) & (df['perimeter_worst'] > 215)].index,
    inplace=True,
)
df.drop(
    df[(df['diagnosis'] == 0) & (df['concave points_worst'] > 0.17)].index,
    inplace=True,
)
df.drop(
    df[(df['diagnosis'] == 1) & (df['concave points_worst'] < 0.07)].index,
    inplace=True,
)

In [None]:
# Redraw the per-diagnosis box plots to inspect the features after outlier removal.
for col in correlated_col_with_dignosis_:
    f1 = plt.figure()
    # `x`/`y` are passed by keyword: seaborn >= 0.12 accepts only `data`
    # positionally, so the positional form raises a TypeError there.
    sns.boxplot(x=df['diagnosis'], y=df[col])

##Feature scaling

In [None]:
# Split the frame into the feature matrix and the target column.
x = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# Standardise every feature. The original row labels are preserved so that
# `x` and `y` stay aligned by index as well as by position; without `index=`
# the scaled frame would get a fresh 0..n-1 index while `y` keeps the gaps
# left by the rows dropped earlier.
# NOTE(review): the scaler is fitted on all rows before the split, so the
# test rows contribute to the scaling statistics.
sc = StandardScaler()
x = pd.DataFrame(sc.fit_transform(x), columns=x.columns, index=x.index)

# Seeded hold-out split so the reported results are reproducible, matching
# the fixed seed used for the ShuffleSplit cross-validation.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

##Validation

In [None]:

# Three random 70 % / 30 % splits with a fixed seed, used for cross-validation.
ss = ShuffleSplit(random_state=0, n_splits=3, train_size=.7, test_size=.3)


def acc(model):
    """Cross-validate ``model`` on the full ``x``/``y`` with the ``ss`` splitter.

    Returns a tuple ``(mean train score, mean test score)`` averaged over
    the splits.
    """
    scores = cross_validate(
        model,
        x.values,
        y.values,
        cv=ss,
        return_train_score=True,
    )
    mean_train = scores['train_score'].mean()
    mean_test = scores['test_score'].mean()
    return mean_train, mean_test

In [None]:
# Base learners, each with default hyper-parameters, as (name, estimator)
# pairs in the form expected by VotingClassifier.
clf = [
    ('xgb', XGBClassifier()),
    ('ada', AdaBoostClassifier()),
    ('gbm', GradientBoostingClassifier()),
    ('rf', RandomForestClassifier()),
    ('et', ExtraTreesClassifier()),
    ('bbc', BaggingClassifier()),
]

# The same estimator objects, exposed individually and as a plain list.
cf1, cf2, cf3, cf4, cf5, cf6 = (estimator for _, estimator in clf)
cf = [cf1, cf2, cf3, cf4, cf5, cf6]

In [None]:
# Two voting ensembles over the same base learners: majority vote ('hard')
# and averaged class probabilities ('soft').
ens_hard = VotingClassifier(voting='hard', estimators=clf)
ens_soft = VotingClassifier(voting='soft', estimators=clf)

# Final model: a hard vote between the two ensembles above.
# NOTE(review): with only two voters a disagreement is a tie — confirm how
# VotingClassifier resolves it and whether that is the intended behaviour.
ens = VotingClassifier(
    voting='hard',
    estimators=[('ensh', ens_hard), ('enss', ens_soft)],
)

In [None]:
# Bar chart of the class frequencies. The series is passed as `x=` because
# seaborn >= 0.12 treats the only positional argument as `data`.
ax = sns.countplot(x=y, label="Count")

# Look the counts up by label (0 = benign, 1 = malignant) instead of
# unpacking `value_counts()`, whose order follows frequency and would swap
# the two numbers whenever malignant samples outnumber benign ones.
class_counts = y.value_counts()
B = class_counts.get(0, 0)
M = class_counts.get(1, 0)
print('Number of Benign: ', B)
print('Number of Malignant : ', M)

##Training model

In [None]:
# Add the combined ensemble to the list of candidates, then fit every model
# on the training split and print it next to the (train, test) mean scores
# returned by `acc`.
cf.append(ens)
for model in cf:
    model.fit(x_train.values, y_train.values)
    cv_scores = acc(model)
    print(model, cv_scores)


In [None]:
# Confusion matrix of the ensemble's predictions on the test split.
confusion_matrix(y_test,ens.predict(x_test.values))

In [None]:
# Classification report for the ensemble's predictions on the test split.
print(classification_report(y_test,ens.predict(x_test.values)))

In [None]:
# Mean (train, test) cross-validation scores of the ensemble, as returned by `acc`.
acc(ens)