In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)


Dataset link : https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time

In [None]:
# Load the raw table from the Kaggle input mount.
csv_path = "/kaggle/input/breast-cancer-wisconsin-data/data.csv"
data = pd.read_csv(csv_path)

### EDA

In [None]:
# Preview the first rows of the raw table to see which columns are present.
data.head()

Columns whose value is unique for every row (e.g. 'id' here) have no relation to the output, so it is better to drop them.

Columns that are 'unnamed' and hold 'NaN' values (e.g. 'Unnamed: 32' here) can also be dropped.

In [None]:
# Keep the column index in `col` and print it so every column name is visible.
col = data.columns
print(col)

Separate the features from the output labels.

In [None]:
# Target labels: the 'diagnosis' column.
y = data['diagnosis']
# Feature matrix: every column except the label, the row identifier and the
# unnamed column.
drop_cols = ['Unnamed: 32', 'id', 'diagnosis']
x = data.drop(columns=drop_cols)
x.head()

#### PLOT DIAGNOSIS DISTRIBUTION

In [None]:
# Check for class imbalance by counting the samples in each diagnosis class.
# The labels are passed as `x=`: recent seaborn releases treat the first
# positional argument as `data`, not as the variable to count.
ax = sns.countplot(x=y, label='Count')

# Look the counts up by label instead of tuple-unpacking value_counts():
# value_counts() orders by frequency, so unpacking would silently swap the
# two numbers if the class balance were different.
class_counts = y.value_counts()
B = class_counts.get('B', 0)
M = class_counts.get('M', 0)
print('Number of Benign Tumors : ',B)
print('Number of Malingnant Tumors : ',M)

In [None]:
# Summary statistics for every feature. Comparing them across features shows
# whether standardization/normalization is required.
x.describe()

#### VISUALIZING STANDARDIZED DATA WITH SEABORN

In [None]:
# Violin plots: similar to box plots, but they also show the probability
# density of each variable.

# Standardize every feature (z-score) because the raw features vary widely in
# scale; `data_std` is reused by the following violin-plot cells.
data_std = (x - x.mean()) / x.std()
# Labels plus the first ten standardized features, one column per feature
# (wide format). NOTE: this rebinds `data`, which held the raw table until now.
data = pd.concat([y, data_std.iloc[:, 0:10]], axis=1)
# Plotting all features side by side with x='features' needs long format
# (one row per sample/feature pair), so melt the wide frame.
data = pd.melt(data, id_vars='diagnosis',
               var_name='features',  # name of the melted feature column
               value_name='value')   # standardized value of that feature
plt.figure(figsize=(10, 10))
# `inner='quart'` draws the quartiles inside each violin; the previous
# `inners='quarts'` is not a violinplot parameter.
sns.violinplot(x='features', y='value', hue='diagnosis', data=data, split=True, inner='quart')
plt.xticks(rotation=45)

It can be seen that the last plot ('fractal_dim') has nearly the same shape for both 'B' and 'M', so this feature does not provide sufficient information for the diagnosis.

In [None]:
# Labels plus standardized features 10-19, one column per feature (wide format).
data = pd.concat([y, data_std.iloc[:, 10:20]], axis=1)
# Plotting all features side by side with x='features' needs long format
# (one row per sample/feature pair), so melt the wide frame.
data = pd.melt(data, id_vars='diagnosis',
               var_name='features',  # name of the melted feature column
               value_name='value')   # standardized value of that feature
plt.figure(figsize=(10, 10))
# `inner='quart'` draws the quartiles inside each violin; the previous
# `inners='quarts'` is not a violinplot parameter.
sns.violinplot(x='features', y='value', hue='diagnosis', data=data, split=True, inner='quart')
plt.xticks(rotation=45)

In [None]:
# Labels plus standardized features 20-29, one column per feature (wide format).
data = pd.concat([y, data_std.iloc[:, 20:30]], axis=1)
# Plotting all features side by side with x='features' needs long format
# (one row per sample/feature pair), so melt the wide frame. The melted
# `data` is kept under this name because the box-plot cell reads it.
data = pd.melt(data, id_vars='diagnosis',
               var_name='features',  # name of the melted feature column
               value_name='value')   # standardized value of that feature
plt.figure(figsize=(10, 10))
# `inner='quart'` draws the quartiles inside each violin; the previous
# `inners='quarts'` is not a violinplot parameter.
sns.violinplot(x='features', y='value', hue='diagnosis', data=data, split=True, inner='quart')
plt.xticks(rotation=45)

When two features have very similar distributions, like 'concave points_worst' and 'concavity_worst' in the plot above, they can negatively affect the prediction output. We need to handle this issue!

In [None]:
# Box plot of the melted frame from the previous cell, used to spot outliers
# per feature and diagnosis class.
sns.boxplot(data=data, x='features', y='value', hue='diagnosis')
plt.xticks(rotation=45)

#### JOINT PLOTS FOR FEATURE COMPARISON

When two features have very similar distributions, like 'concave points_worst' and 'concavity_worst' in the plot above, they can negatively affect the prediction output. We address this issue by checking the correlation between them.

In [None]:
# Scatter plot with a fitted regression line and marginal distributions for
# 'concavity_worst' against 'concave points_worst'.
# The two series are passed by keyword (recent seaborn rejects positional x/y)
# and the kind is 'reg'; 'regg' is not a valid jointplot kind.
sns.jointplot(x=x.loc[:, 'concavity_worst'],
              y=x.loc[:, 'concave points_worst'],
              kind='reg')

In [None]:
# Scatter plot with a fitted regression line and marginal distributions for
# 'concavity_worst' against 'area_worst'.
# The two series are passed by keyword (recent seaborn rejects positional x/y)
# and the kind is 'reg'; 'regg' is not a valid jointplot kind.
sns.jointplot(x=x.loc[:, 'concavity_worst'],
              y=x.loc[:, 'area_worst'],
              kind='reg')

#### OBSERVING THE DISTRIBUTION OF VALUES AND THEIR VARIANCE USING SWARM PLOTS

Swarm plots are effective when the number of data points is small. In this case we have only about 600 data points, so swarm plots are more effective for visualizing the data than violin plots.

In [None]:
# Swarm plot of the first ten standardized features, coloured by diagnosis.
sns.set(style='whitegrid', palette='muted')
# z-score every feature so they share one value axis
data_std = (x - x.mean()) / x.std()
# labels + feature slice, reshaped to one row per sample/feature pair
data = (
    pd.concat([y, data_std.iloc[:, :10]], axis=1)
    .melt(id_vars='diagnosis', var_name='features', value_name='value')
)
plt.figure(figsize=(10, 10))
sns.swarmplot(data=data, x='features', y='value', hue='diagnosis')
plt.xticks(rotation=45)

In [None]:
# Swarm plot of standardized features 10-19, coloured by diagnosis.
sns.set(style='whitegrid', palette='muted')
# z-score every feature so they share one value axis
data_std = (x - x.mean()) / x.std()
# labels + feature slice, reshaped to one row per sample/feature pair
data = (
    pd.concat([y, data_std.iloc[:, 10:20]], axis=1)
    .melt(id_vars='diagnosis', var_name='features', value_name='value')
)
plt.figure(figsize=(10, 10))
sns.swarmplot(data=data, x='features', y='value', hue='diagnosis')
plt.xticks(rotation=45)

In [None]:
# Swarm plot of standardized features 20-29, coloured by diagnosis.
sns.set(style='whitegrid', palette='muted')
# z-score every feature so they share one value axis
data_std = (x - x.mean()) / x.std()
# labels + feature slice, reshaped to one row per sample/feature pair
data = (
    pd.concat([y, data_std.iloc[:, 20:30]], axis=1)
    .melt(id_vars='diagnosis', var_name='features', value_name='value')
)
plt.figure(figsize=(10, 10))
sns.swarmplot(data=data, x='features', y='value', hue='diagnosis')
plt.xticks(rotation=45)

Swarm plots show, for each column, whether the prediction classes are well separated from each other. Here 'smoothness_worst' is not well separated (the data points of both classes are mixed), whereas 'area_worst' is well separated.

#### OBSERVING ALL PAIR-WISE CORRELATION

To check the correlation among all columns, we use a heat map.

In [None]:
# Pairwise correlation of all features, annotated with one decimal place.
corr_all = x.corr()
f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr_all, ax=ax, annot=True, fmt='.1f', linewidths=1)

### FEATURE SELECTION

Method 1: To drop correlated columns from the dataset 

In [None]:
# Hand-picked columns reported as having correlation coefficient = 1.0
# (the heat map annotations are rounded to one decimal place).
drop_cols = [
    'radius_mean', 'perimeter_mean', 'area_mean',
    'radius_worst', 'perimeter_worst', 'area_worst',
    'compactness_mean', 'compactness_worst',
    'perimeter_se', 'radius_se', 'area_se',
]

# Reduced feature matrix without the correlated columns.
df = x.drop(columns=drop_cols)
df.head()

In [None]:
# Pairwise correlation of the reduced feature set, annotated with one decimal place.
corr_reduced = df.corr()
f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr_reduced, ax=ax, annot=True, fmt='.1f', linewidths=1)

#### CLASSIFICATION USING XGBOOST (minimal feature selection)

In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score


In [None]:
# Encode the string labels as integer codes (sorted categories, so 'B' -> 0 and
# 'M' -> 1): current XGBoost releases reject non-numeric class labels in
# XGBClassifier.fit. Predictions and the metrics computed from y_test are
# therefore expressed in these integer codes.
y_encoded = y.astype('category').cat.codes

# 70/30 split of the reduced feature set, with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(df, y_encoded, test_size=0.3, random_state=42)
clf_1 = xgb.XGBClassifier(random_state=42)
clf_1 = clf_1.fit(x_train, y_train)

In [None]:
# Evaluate the first model on the held-out split: accuracy and confusion matrix.
y_pred_1 = clf_1.predict(x_test)
print("Accuracy : ", accuracy_score(y_test, y_pred_1))
cm = confusion_matrix(y_test, y_pred_1)
sns.heatmap(cm, annot=True, fmt='d')

Method 2: Univariate feature selection (selects the best k features using the chi-square test)

Note: the chi-square test measures the dependence between each feature and the target class. Features that are independent of the target class (i.e. irrelevant for prediction) get low scores and are therefore left out.

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# Score every training feature against the target with the chi-square test
# and keep the ten highest-scoring ones.
selector = SelectKBest(score_func=chi2, k=10)
select_feature = selector.fit(x_train, y_train)
print("Score List: ", select_feature.scores_)
print("Feature List: ", x_train.columns)

In [None]:
# Reduce both splits to the features chosen by the univariate selector.
x_train_2 = select_feature.transform(x_train)
x_test_2 = select_feature.transform(x_test)

# Fit the *new* classifier. The previous code called clf_1.fit here, which
# refitted the first model on the reduced features and left clf_2 as an alias
# of clf_1.
clf_2 = xgb.XGBClassifier(random_state=42)
clf_2 = clf_2.fit(x_train_2, y_train)

# Predict once and reuse the result for both metrics.
y_pred_2 = clf_2.predict(x_test_2)
print("Accuracy : ", accuracy_score(y_test, y_pred_2))
cm_2 = confusion_matrix(y_test, y_pred_2)
sns.heatmap(cm_2 , annot=True, fmt='d')

Method 3: Recursive feature elimination using cross-validation (this method gives not only the best features but also the optimal number of features)

In [None]:
from sklearn.feature_selection import RFECV

# Seeded like clf_1 and clf_2 so the three models are configured consistently.
clf_3 = xgb.XGBClassifier(random_state=42)
# step: number of features eliminated in each iteration
# cv:   number of cross-validation folds
rfecv = RFECV(estimator=clf_3, step=1, cv=5, scoring='accuracy', n_jobs=-1).fit(x_train, y_train)

print("Optimal no. of features: ", rfecv.n_features_)
# support_ is a boolean mask over the training columns.
print("Best features: ", x_train.columns[rfecv.support_])

In [None]:
# Accuracy of the RFECV-selected model on the held-out split.
print("Accuracy" , accuracy_score(y_test, rfecv.predict(x_test)))

In [None]:
# Mean cross-validation score for each candidate feature count.
# `grid_scores_` was removed from scikit-learn; newer releases expose the same
# values as cv_results_['mean_test_score']. Fall back to the old attribute so
# the cell also runs on older installations.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_
# With step=1 the i-th score corresponds to i+1 selected features.
num_features = list(range(1, len(cv_scores) + 1))
ax = sns.lineplot(x=num_features, y=cv_scores)
ax.set(xlabel = 'No. of selected feature', ylabel ='cv_scores')

Method 4: Using PCA (when a dataset has a large number of features, we can reduce the feature space using PCA)

NOTE: PCA requires the data to be zero-mean, i.e. each feature should be centered around the origin.

In [None]:
# Re-split using the full feature matrix `x` (not the reduced `df`); this
# rebinds x_train / x_test / y_train / y_test.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Mean-normalize both splits with statistics taken from the training split
# only. Scaling the test split with its own mean/range (as before) puts the
# two splits on different scales and leaks test-set information.
train_mean = x_train.mean()
train_range = x_train.max() - x_train.min()
x_train_norm = (x_train - train_mean) / train_range
x_test_norm = (x_test - train_mean) / train_range

In [None]:
from sklearn.decomposition import PCA

# Fit PCA with all components on the normalized training features.
pca = PCA()
pca.fit(x_train_norm)

# Cumulative share of variance explained by the first k components.
cum_explained = np.cumsum(pca.explained_variance_ratio_)
# Start the x-axis at 1: the first entry is the variance explained by ONE
# component, whereas the default 0-based index shifted the curve by one.
n_components = np.arange(1, len(cum_explained) + 1)

plt.figure(figsize=(10,8))
sns.lineplot(x=n_components, y=cum_explained)
plt.xlabel("No. of features")
plt.ylabel("Cumulative explained variance ratio")