# Logistic Regression

## Importing the libraries

In [0]:
import pandas as pd
import numpy as np

## Importing the dataset

In [0]:
# Read the breast-cancer records from the CSV file (path relative to the
# notebook's working directory) into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer="breast_cancer.csv")

### Missing Data Handling

In [64]:
# Treat the "?" placeholder as a missing value. Build a new frame rather than
# mutating in place so the intermediate state is explicit.
dataset_marked = dataset.replace({"?": np.nan})

# Count ROWS holding at least one null (not individual null cells), so the
# figure printed below really is a percentage of rows even when a single row
# has several missing values.
rows_with_nulls = dataset_marked.isnull().any(axis=1).sum()
print("Percentage of null values containing rows: ",(rows_with_nulls/len(dataset_marked))*100)

# We drop these rows as they are very few in number.
dataset = dataset_marked.dropna(axis=0)
nulls = dataset.isnull().sum().sum()
print("Nulls after handling missing data: ",nulls)

# Features: every column except the first and the last (the first column is
# presumably an identifier -- confirm against the CSV). A column that contained
# "?" may have been parsed as text, so coerce the features to numeric
# explicitly; pd.to_numeric raises if a value still cannot be parsed.
X = dataset.iloc[:,1:-1].apply(pd.to_numeric)
# Target: the last column.
y = dataset.iloc[:,-1]

Percentage of null values containing rows:  2.28898426323319
Nulls after handling missing data:  0


## Splitting the dataset into the Training set and Test set

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Logistic Regression model on the Training set

In [66]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the library's default hyper-parameters.
classifier = LogisticRegression()
# Fit on the training split only; leaving the call as the cell's final
# expression displays the fitted estimator.
classifier.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Predicting the Test set results

In [67]:
# Predict a class label for every sample in the held-out test split.
y_pred = classifier.predict(X_test)

# Print the first 25 predictions, then the first 25 true labels, for a quick
# side-by-side sanity check.
for labels in (y_pred[:25], y_test.head(25)):
    print(labels)

[2 2 4 4 2 2 2 4 2 2 4 2 4 2 2 2 4 4 4 2 2 2 4 2 4]
115    2
392    2
316    4
519    4
313    2
505    2
77     2
506    4
485    2
573    2
14     4
161    2
187    4
533    2
584    2
148    2
237    4
253    4
344    4
70     2
199    2
81     2
146    4
554    2
329    4
Name: Class, dtype: int64


## Making the Confusion Matrix

In [68]:
from sklearn.metrics import confusion_matrix, accuracy_score

# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped the confusion matrix comes out transposed (rows would be predictions
# instead of true classes), which silently exchanges false positives and
# false negatives.
cm = confusion_matrix(y_test, y_pred)
# Accuracy is symmetric in its arguments, but the same ordering is used for
# consistency.
score = accuracy_score(y_test, y_pred)
print('Confusion Matix:','\n',cm,'\n','Accuracy score: ',score)

Confusion Matix: 
 [[84  3]
 [ 3 47]] 
 Accuracy score:  0.9562043795620438


## Computing the accuracy with k-Fold Cross Validation

In [69]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training split; yields one accuracy value
# per fold.
accuracies = cross_val_score(classifier, X_train, y=y_train, cv=10)

# Report each fold's accuracy as a percentage, numbering folds from 1.
for fold_number, fold_accuracy in enumerate(accuracies, start=1):
    print("Accuracy score for fold {} : {:.2f} %".format(fold_number, fold_accuracy*100))

Accuracy score for fold 1 : 94.55 %
Accuracy score for fold 2 : 96.36 %
Accuracy score for fold 3 : 96.36 %
Accuracy score for fold 4 : 100.00 %
Accuracy score for fold 5 : 94.55 %
Accuracy score for fold 6 : 100.00 %
Accuracy score for fold 7 : 96.30 %
Accuracy score for fold 8 : 96.30 %
Accuracy score for fold 9 : 98.15 %
Accuracy score for fold 10 : 94.44 %


## Final Results

In [70]:
# Summarise the cross-validation folds: mean accuracy and the spread
# (standard deviation) across folds, both shown as percentages.
mean_accuracy = accuracies.mean()
accuracy_spread = accuracies.std()
print("Accuracy score: {:.2f} %".format(mean_accuracy*100))
print("Standard Deviation between accuracies: {:.2f} %".format(accuracy_spread*100))

Accuracy score: 96.70 %
Standard Deviation between accuracies: 1.97 %
