# Import libraries

In [23]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

# Load data in dataframe

In [24]:
# Path is relative to the notebook's working directory.
DATA_PATH = "breast-cancer-wisconsin.data"
# No header/names override is passed, so column names come from the file's first row.
df = pd.read_csv(DATA_PATH)

# First 10 rows

In [25]:
# Preview the first ten rows to sanity-check the column names and value ranges.
df.head(10)

Unnamed: 0,sample_code_number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4
6,1018099,1,1,1,1,2,10,3,1,1,2
7,1018561,2,1,2,1,2,1,3,1,1,2
8,1033078,2,1,1,1,2,1,1,1,5,2
9,1033078,4,2,1,1,2,1,2,1,1,2


# Data shape

In [26]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(699, 11)

# 458 benign (class 2) tumors and 241 malignant (class 4)

In [27]:
# Class balance of the raw target column; the labels are still 2 and 4 at this point.
df['class'].value_counts()

2    458
4    241
Name: class, dtype: int64

# Data types: we observe that bare_nuclei is of type object, not int

In [28]:
# Per-column dtypes; an `object` dtype on a numeric-looking column points to non-numeric entries.
df.dtypes

sample_code_number              int64
clump_thickness                 int64
uniformity_of_cell_size         int64
uniformity_of_cell_shape        int64
marginal_adhesion               int64
single_epithelial_cell_size     int64
bare_nuclei                    object
bland_chromatin                 int64
normal_nucleoli                 int64
mitoses                         int64
class                           int64
dtype: object

# We check if there are some null values

In [29]:
# Count NaN entries per column.
# NOTE(review): this only counts real NaN values; the '?' placeholders found later in
# bare_nuclei are ordinary strings and are not counted here.
df.isna().sum()

sample_code_number             0
clump_thickness                0
uniformity_of_cell_size        0
uniformity_of_cell_shape       0
marginal_adhesion              0
single_epithelial_cell_size    0
bare_nuclei                    0
bland_chromatin                0
normal_nucleoli                0
mitoses                        0
class                          0
dtype: int64

In [30]:
# NOTE(review): isnull() is an alias of isna(), so this repeats the previous check and
# yields the same per-column counts; the cell is redundant.
df.isnull().sum()

sample_code_number             0
clump_thickness                0
uniformity_of_cell_size        0
uniformity_of_cell_shape       0
marginal_adhesion              0
single_epithelial_cell_size    0
bare_nuclei                    0
bland_chromatin                0
normal_nucleoli                0
mitoses                        0
class                          0
dtype: int64

# We look at why bare_nuclei is not of type int

In [31]:
# Frequency of each distinct value; any non-numeric entry explains the object dtype.
df['bare_nuclei'].value_counts()

1     402
10    132
5      30
2      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: bare_nuclei, dtype: int64

# We can see that there are 16 samples with a ? value for bare_nuclei. We remove these samples and change the type of bare_nuclei to int

In [32]:
# '?' marks a missing bare_nuclei measurement; such rows cannot be cast to int.
is_placeholder = df['bare_nuclei'] == '?'
indexMissing = df.loc[is_placeholder].index
df.drop(index=indexMissing, inplace=True)

# With the placeholder rows gone, the remaining entries are digit strings.
df['bare_nuclei'] = df['bare_nuclei'].astype('int64')

# We remove the sample code number and recode the output class categories: benign is now 0 instead of 2, and malignant is now 1 instead of 4

In [33]:
# The sample id carries no predictive information, so drop it.
# Rebinding `df` (instead of inplace=True) avoids mutating a frame that earlier cells displayed.
df = df.drop(columns=['sample_code_number'])

# Recode the target: benign 2 -> 0, malignant 4 -> 1.
# The result is assigned back to the column. Calling replace(..., inplace=True) on
# df["class"] mutates an intermediate Series: recent pandas versions warn about this
# chained in-place modification, and under Copy-on-Write it leaves `df` unchanged.
df['class'] = df['class'].replace({2: 0, 4: 1})

# We removed 16 samples

In [34]:
# (rows, columns) after dropping the '?' rows and the sample id column.
df.shape

(683, 10)

# We check the changes

In [35]:
# Confirm that every remaining column, including bare_nuclei, is now integer-typed.
df.dtypes

clump_thickness                int64
uniformity_of_cell_size        int64
uniformity_of_cell_shape       int64
marginal_adhesion              int64
single_epithelial_cell_size    int64
bare_nuclei                    int64
bland_chromatin                int64
normal_nucleoli                int64
mitoses                        int64
class                          int64
dtype: object

# New data after these changes

In [36]:
# Preview the cleaned frame: no sample id column, class recoded to 0/1.
df.head(5)

Unnamed: 0,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0


# Chi2 statistical test for independence

In [37]:
from scipy.stats import chi2_contingency

# We test the independence between each feature and the target variable (class)

In [38]:
# Contingency table: one row per clump_thickness level, one column per class label.
crosstab = pd.crosstab(index=df['clump_thickness'], columns=df['class'])
# Result holds, in order: test statistic, p-value, degrees of freedom, expected frequencies.
chi2_contingency(crosstab)

(378.0815784919471,
 6.471439515189326e-76,
 9,
 array([[90.3601757 , 48.6398243 ],
        [32.50366032, 17.49633968],
        [67.60761347, 36.39238653],
        [51.35578331, 27.64421669],
        [83.20937042, 44.79062958],
        [21.45241581, 11.54758419],
        [14.95168375,  8.04831625],
        [28.60322108, 15.39677892],
        [ 9.10102489,  4.89897511],
        [44.85505124, 24.14494876]]))

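# For clump_thickness, we observe that more than 80% of the expected frequencies (the cells of the array) are > 5 and that none is < 1, so we can trust the result obtained for this feature. The same check has to be made on the expected frequencies of each table below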

In [39]:
# Observed counts of each uniformity_of_cell_size level within each class.
crosstab = pd.crosstab(index=df['uniformity_of_cell_size'], columns=df['class'])
# Chi-square test of independence on that table.
chi2_contingency(crosstab)

(539.7930796626143,
 1.7163897109876368e-110,
 9,
 array([[242.477306  , 130.522694  ],
        [ 29.25329429,  15.74670571],
        [ 33.80380673,  18.19619327],
        [ 24.70278184,  13.29721816],
        [ 19.50219619,  10.49780381],
        [ 16.25183016,   8.74816984],
        [ 12.35139092,   6.64860908],
        [ 18.20204978,   9.79795022],
        [  3.90043924,   2.09956076],
        [ 43.55490483,  23.44509517]]))

In [40]:
# Observed counts of each uniformity_of_cell_shape level within each class.
crosstab = pd.crosstab(index=df['uniformity_of_cell_shape'], columns=df['class'])
# Chi-square test of independence on that table.
chi2_contingency(crosstab)

(523.0709703815801,
 6.578447629313875e-107,
 9,
 array([[224.92532943, 121.07467057],
        [ 37.70424597,  20.29575403],
        [ 34.45387994,  18.54612006],
        [ 27.95314788,  15.04685212],
        [ 20.80234261,  11.19765739],
        [ 18.85212299,  10.14787701],
        [ 19.50219619,  10.49780381],
        [ 17.55197657,   9.44802343],
        [  4.55051245,   2.44948755],
        [ 37.70424597,  20.29575403]]))

In [41]:
# Observed counts of each marginal_adhesion level within each class.
crosstab = pd.crosstab(index=df['marginal_adhesion'], columns=df['class'])
# Chi-square test of independence on that table.
chi2_contingency(crosstab)

(390.0594997703958,
 1.80795747026511e-78,
 9,
 array([[255.47877013, 137.52122987],
        [ 37.70424597,  20.29575403],
        [ 37.70424597,  20.29575403],
        [ 21.45241581,  11.54758419],
        [ 14.95168375,   8.04831625],
        [ 13.65153734,   7.34846266],
        [  8.45095168,   4.54904832],
        [ 16.25183016,   8.74816984],
        [  2.60029283,   1.39970717],
        [ 35.75402635,  19.24597365]]))

In [42]:
# Observed counts of each single_epithelial_cell_size level within each class.
crosstab = pd.crosstab(index=df['single_epithelial_cell_size'], columns=df['class'])
# Chi-square test of independence on that table.
# NOTE(review): the recorded output has one expected frequency below 1 (about 0.70),
# so the chi-square approximation is weaker for this feature.
chi2_contingency(crosstab)

(447.86117522673453,
 8.217595317928255e-91,
 9,
 array([[ 28.60322108,  15.39677892],
        [244.42752562, 131.57247438],
        [ 46.15519766,  24.84480234],
        [ 31.20351391,  16.79648609],
        [ 25.35285505,  13.64714495],
        [ 26.00292826,  13.99707174],
        [  7.15080527,   3.84919473],
        [ 13.65153734,   7.34846266],
        [  1.30014641,   0.69985359],
        [ 20.1522694 ,  10.8477306 ]]))

In [43]:
# Observed counts of each bare_nuclei level within each class.
crosstab = pd.crosstab(index=df['bare_nuclei'], columns=df['class'])
# Chi-square test of independence on that table.
chi2_contingency(crosstab)

(489.00953068739193,
 1.295766516658567e-99,
 9,
 array([[261.32942899, 140.67057101],
        [ 19.50219619,  10.49780381],
        [ 18.20204978,   9.79795022],
        [ 12.35139092,   6.64860908],
        [ 19.50219619,  10.49780381],
        [  2.60029283,   1.39970717],
        [  5.20058565,   2.79941435],
        [ 13.65153734,   7.34846266],
        [  5.85065886,   3.14934114],
        [ 85.80966325,  46.19033675]]))

In [44]:
# Observed counts of each bland_chromatin level within each class.
crosstab = pd.crosstab(index=df['bland_chromatin'], columns=df['class'])
# Chi-square test of independence on that table.
chi2_contingency(crosstab)

(453.2097146977555,
 5.9059369624123696e-92,
 9,
 array([[ 97.51098097,  52.48901903],
        [104.01171303,  55.98828697],
        [104.66178624,  56.33821376],
        [ 25.35285505,  13.64714495],
        [ 22.10248902,  11.89751098],
        [  5.85065886,   3.14934114],
        [ 46.15519766,  24.84480234],
        [ 18.20204978,   9.79795022],
        [  7.15080527,   3.84919473],
        [ 13.00146413,   6.99853587]]))

In [45]:
# NOTE(review): this cell repeats the bland_chromatin test from the previous cell
# verbatim (same code, same recorded output); it adds no new information.
crosstab = pd.crosstab(df['bland_chromatin'],df['class'])
chi2_contingency(crosstab)

(453.2097146977555,
 5.9059369624123696e-92,
 9,
 array([[ 97.51098097,  52.48901903],
        [104.01171303,  55.98828697],
        [104.66178624,  56.33821376],
        [ 25.35285505,  13.64714495],
        [ 22.10248902,  11.89751098],
        [  5.85065886,   3.14934114],
        [ 46.15519766,  24.84480234],
        [ 18.20204978,   9.79795022],
        [  7.15080527,   3.84919473],
        [ 13.00146413,   6.99853587]]))

In [46]:
# Observed counts of each normal_nucleoli level within each class.
crosstab = pd.crosstab(index=df['normal_nucleoli'], columns=df['class'])
# Chi-square test of independence on that table.
chi2_contingency(crosstab)

(416.63060930531464,
 3.8638072907880614e-84,
 9,
 array([[280.83162518, 151.16837482],
        [ 23.40263543,  12.59736457],
        [ 27.30307467,  14.69692533],
        [ 11.70131772,   6.29868228],
        [ 12.35139092,   6.64860908],
        [ 14.30161054,   7.69838946],
        [ 10.4011713 ,   5.5988287 ],
        [ 14.95168375,   8.04831625],
        [  9.7510981 ,   5.2489019 ],
        [ 39.00439239,  20.99560761]]))

In [47]:
# Observed counts of each mitoses level within each class.
crosstab = pd.crosstab(index=df['mitoses'], columns=df['class'])
# Chi-square test of independence on that table.
# NOTE(review): in the recorded output 8 of the 18 expected frequencies are below 5,
# so the usual "at least 80% of cells >= 5" rule of thumb is not met for this feature.
chi2_contingency(crosstab)

(191.96819744862034,
 3.138523415624697e-37,
 8,
 array([[365.99121523, 197.00878477],
        [ 22.75256223,  12.24743777],
        [ 21.45241581,  11.54758419],
        [  7.80087848,   4.19912152],
        [  3.90043924,   2.09956076],
        [  1.95021962,   1.04978038],
        [  5.85065886,   3.14934114],
        [  5.20058565,   2.79941435],
        [  9.10102489,   4.89897511]]))

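# All the p-values are << 0.05, so we can reject the null hypothesis of independence and conclude that there is a statistically significant association between each feature and the output target class. Caveat: the expected-frequency rule is not met for single_epithelial_cell_size (one expected frequency is about 0.70, i.e. < 1) nor for mitoses (8 of the 18 expected frequencies are < 5), so the p-values for these two features are only approximate; they are so small that the conclusion is unlikely to change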

# To conclude, every feature is individually associated with the output target class, so we do not need to remove any features