## Logistic Regression

### Importing the libraries

In [74]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### Importing the dataset

In [75]:
df = pd.read_csv('breast_cancer.csv')
X = df.iloc[: , 1:-1].values  #excluded the sample code number column as no impact on class(dependent variable)
y = df.iloc[: , -1].values

In [76]:
print(df)

     Sample code number  Clump Thickness  Uniformity of Cell Size  \
0               1000025                5                        1   
1               1002945                5                        4   
2               1015425                3                        1   
3               1016277                6                        8   
4               1017023                4                        1   
..                  ...              ...                      ...   
678              776715                3                        1   
679              841769                2                        1   
680              888820                5                       10   
681              897471                4                        8   
682              897471                4                        8   

     Uniformity of Cell Shape  Marginal Adhesion  Single Epithelial Cell Size  \
0                           1                  1                            2   
1        

In [77]:
y.shape

(683,)

In [78]:
X.shape

(683, 9)

## Splitting the dataset into the Training and Test set

In [79]:
X_train , X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2 , random_state = 0)

## Training the Logistic Regression model on Training set

In [80]:
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

## Predicting the Test set results

In [81]:
y_pred = classifier.predict(X_test)
#print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

In [82]:
comparison_df = pd.DataFrame({
    'Predicted': y_pred,
    'Actual': y_test
})
print(comparison_df.head())

   Predicted  Actual
0          2       2
1          2       2
2          4       4
3          4       4
4          2       2


In [83]:
def highlight_wrong(row):
    if row['Predicted'] != row['Actual']:
        return ['background-color: salmon'] * len(row)
    else:
        return [''] * len(row)

comparison_df.style.apply(highlight_wrong, axis=1)


Unnamed: 0,Predicted,Actual
0,2,2
1,2,2
2,4,4
3,4,4
4,2,2
5,2,2
6,2,2
7,4,4
8,2,2
9,2,2


In [84]:
comparison_df[comparison_df['Predicted'] != comparison_df['Actual']]


Unnamed: 0,Predicted,Actual
57,4,2
87,4,2
94,2,4
95,4,2
107,2,4
117,2,4


## Making the Confusion Matrix

In [85]:
from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[84  3]
 [ 3 47]]


0.9562043795620438

## Accuracy with k-fold two approaches

### Computing the accuracy with k-Fold Cross Validatin

In [86]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(classifier, X_train, y_train , cv = 10)
print("Accuracy score for each fold: ",accuracies)
#print("Average Accuracy : {:.2f} % ".format(accuracies.mean() * 100))
print(f"Average accuracy: {accuracies.mean() * 100:.2f}%")
print(f"Average Standard Deviation: { accuracies.std() * 100:.2f}%")

Accuracy score for each fold:  [0.94545455 0.96363636 0.96363636 1.         0.94545455 1.
 0.96296296 0.96296296 0.98148148 0.94444444]
Average accuracy: 96.70%
Average Standard Deviation: 1.97%


### Manual k-fold with loop

In [87]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

kf = KFold(n_splits=10)
accuracies = []

for train_idx, test_idx in kf.split(X_train):
    X_tr, X_val = X_train[train_idx], X_train[test_idx]
    y_tr, y_val = y_train[train_idx], y_train[test_idx]
    
    classifier.fit(X_tr, y_tr)
    y_pred = classifier.predict(X_val)
    accuracies.append(accuracy_score(y_val, y_pred))

print("Average Accuracy: {:.2f} %".format(sum(accuracies) / len(accuracies) * 100))


Average Accuracy: 96.70 %
