# EXPLORATORY DATA ANALYSIS

In [11]:
# import libraries
import pandas as pd
from pandas_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import numpy as np

plt.style.use('ggplot')
%matplotlib inline

In [2]:
# load data
file = 'data/cancer.data'
df = pd.read_csv(file, header=None)

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [3]:
# summary statistics
df_description = df.describe()
print(df_description)

print('\n')

# DataFrame information
df_info = df.info()
print(df_info)

print('\n')

# inspect missing values
df.isnull().any()

                 0           1           2           3           4   \
count  6.990000e+02  699.000000  699.000000  699.000000  699.000000   
mean   1.071704e+06    4.417740    3.134478    3.207439    2.806867   
std    6.170957e+05    2.815741    3.051459    2.971913    2.855379   
min    6.163400e+04    1.000000    1.000000    1.000000    1.000000   
25%    8.706885e+05    2.000000    1.000000    1.000000    1.000000   
50%    1.171710e+06    4.000000    1.000000    1.000000    1.000000   
75%    1.238298e+06    6.000000    5.000000    5.000000    4.000000   
max    1.345435e+07   10.000000   10.000000   10.000000   10.000000   

               5           7           8           9           10  
count  699.000000  699.000000  699.000000  699.000000  699.000000  
mean     3.216023    3.437768    2.866953    1.589413    2.689557  
std      2.214300    2.438364    3.053634    1.715078    0.951273  
min      1.000000    1.000000    1.000000    1.000000    2.000000  
25%      2.000000   

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
dtype: bool

In [4]:
# Replace the '?'s with NaN
df = df.replace('?', np.nan)

In [5]:
# impute missing values with mean imputation
df.fillna(df.mean(), inplace=True)

# count the number of NANs to verify
df.isnull().sum()

0      0
1      0
2      0
3      0
4      0
5      0
6     16
7      0
8      0
9      0
10     0
dtype: int64

In [6]:
for col in df:
    if df[col].dtype == 'object':
        df = df.fillna(df[col].value_counts().index[0])
        
print(df.isnull().sum())

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
dtype: int64


> It appears there are no missing values in the data set.

> Column names need to be renamed appropriatley.


In [7]:
# rename columns
df.rename(columns = {0:'sample_code_number', 1:'clump_thickness', 2:'uniformity_cell_size', 
                    3:'uniformity_cell_shape', 4:'marginal_adhesion', 5:'single_epithelial_cell_size', 
                    6:'bare_nuclei', 7:'bland_chromatin', 8:'normal_nucleoli', 9:'mitoses', 10:'class'}, inplace = True) 

> Sample Code Number is not a neccesssary feature in predicting breast cancer, therefore it will need to be dropped.

In [8]:
# drop sample_code_number feature and convert dataframe to numpy array
df = df.drop(['sample_code_number'], axis=1)

# segregate features and lables into separate variables
X = df[['clump_thickness', 'uniformity_cell_size', 'uniformity_cell_shape', 'marginal_adhesion',
       'single_epithelial_cell_size', 'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses']]
y = df[['class']]

# split into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

print(y_train.shape, X_train.shape)
print(y_test.shape, X_test.shape)

(489, 1) (489, 9)
(210, 1) (210, 9)


# Preprocessing Data

In [9]:
# instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0,1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.fit_transform(X_test)

# Fit Model to Train Set

In [10]:
logreg = LogisticRegression()

logreg.fit(rescaledX_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

# Make Predictions and Evaluate Performance

In [12]:
# predict instances for test
y_pred = logreg.predict(rescaledX_test)

# accuracy score for logreg model
print('Accuracy of logistic regression classifier: ', logreg.score(rescaledX_test, y_test))

# confusion matrix of the model
print(confusion_matrix(y_test, y_pred))

Accuracy of logistic regression classifier:  0.9666666666666667
[[141   2]
 [  5  62]]
