In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff

In [4]:
data = pd.read_csv(r'C:\Users\theja\Downloads\Breast Cancer Prediction.csv')

In [5]:
data.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [6]:
data['Class'].unique()

array([2, 4], dtype=int64)

In [7]:
#We need to check if there are any missing values in the dataset that we have loaded
#This phase is called the data cleansing phase
null_feat = pd.DataFrame(len(data['Sample code number']) - data.isnull().sum(), columns = ['Count'])

trace = go.Bar(x = null_feat.index, y = null_feat['Count'] ,opacity = 0.8, marker=dict(color = 'lightgrey',
        line=dict(color='#000000',width=1.5)))

layout = dict(title =  "Missing Values")
                    
fig = dict(data = [trace], layout=layout)
py.iplot(fig)

In [8]:
#Since there are no missing values, we can continue with the next step 
print(len(data))

683


In [9]:
data.drop(['Sample code number'], axis = 1, inplace = True)    #Dropping the unwanted feature

In [10]:
#Here we are visualizing the dataset a bit before getting into modelling

In [11]:
data.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [12]:
data.describe()   #This describes all the statistical variables like mean, standard deviation for each of the attribute

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
count,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0
mean,4.442167,3.150805,3.215227,2.830161,3.234261,3.544656,3.445095,2.869693,1.603221,2.699854
std,2.820761,3.065145,2.988581,2.864562,2.223085,3.643857,2.449697,3.052666,1.732674,0.954592
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [23]:
M, B = 0, 0
for i in data['Class']:
    if i == 2:
        B += 1
    else:
        M += 1

In [24]:
print(M, B)

239 444


In [25]:
trace = go.Bar(x = (M, B), y = ['malignant', 'benign'], orientation = 'h', opacity = 0.8, marker=dict(
        color=[ 'gold', 'lightskyblue'],
        line=dict(color='#000000',width=1.5)))

layout = dict(title =  'Count of diagnosis variable')
                    
fig = dict(data = [trace], layout=layout)
py.iplot(fig)

trace = go.Pie(labels = ['benign','malignant'], values = [B, M], 
               textfont=dict(size=15), opacity = 0.8,
               marker=dict(colors=['lightskyblue', 'gold'], 
                           line=dict(color='#000000', width=1.5)))


layout = dict(title =  'Distribution of diagnosis variable')
           
fig = dict(data = [trace], layout=layout)
py.iplot(fig)

In [26]:
#Working on correlation matrix to check the relation between each attributes

In [27]:
correlation = data.corr()
matrix_cols = correlation.columns.tolist()
corr_array  = np.array(correlation)

In [28]:
trace = go.Heatmap(z = corr_array,
                   x = matrix_cols,
                   y = matrix_cols,
                   xgap = 2,
                   ygap = 2,
                   colorscale='Viridis',
                   colorbar   = dict() ,
                  )
layout = go.Layout(dict(title = 'Correlation Matrix for variables',
                        autosize = False,
                        height  = 720,
                        width   = 800,
                        margin  = dict(r = 0 ,l = 210,
                                       t = 25,b = 210,
                                     ),
                        yaxis   = dict(tickfont = dict(size = 9)),
                        xaxis   = dict(tickfont = dict(size = 9)),
                       )
                  )
fig = go.Figure(data = [trace],layout = layout)
py.iplot(fig)

In [40]:
#MinMaxNormalization

In [101]:
x = data.drop('Class', axis = 1)
y = data['Class']

In [105]:
from sklearn.preprocessing import MinMaxScaler

In [151]:
scaler = MinMaxScaler()

In [152]:
scaled_data = scaler.fit_transform(x)

In [153]:
from sklearn.model_selection import train_test_split

In [154]:
xtrain, xtest, ytrain, ytest = train_test_split(scaled_data, y, test_size = 0.2)

In [155]:
from sklearn import linear_model

In [156]:
log_clf = linear_model.LogisticRegression()

In [157]:
log_clf.fit(xtrain, ytrain)





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [158]:
preds = log_clf.predict(xtest)

In [159]:
type(ytest)

pandas.core.series.Series

In [160]:
from sklearn.metrics import confusion_matrix, r2_score

In [161]:
confusion_matrix(preds, ytest)

array([[90,  3],
       [ 1, 43]], dtype=int64)

In [162]:
r2_score(preds, ytest)

0.8660801564027371

In [182]:
log_clf.predict(np.array([5, 1, 1, 1, 2, 1, 3, 1, 1]).reshape(1, -1))

array([2], dtype=int64)