#### Importing Required Modules

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import svm

#### Data Preprocessing

In [None]:
cancer = pd.read_excel("C:\\Users\\this pc\\Desktop\\SUMMER\\Cognizance_Machine_Learning\\Intern\\datasetrec.xlsx", sheet_name="Dataset 2")

In [None]:
df = pd.DataFrame(cancer)

In [None]:
df.shape

In [None]:
df=df.drop(columns=' 4 for malignant)')
df.rename(columns={'Class: (2 for benign,  4 for malignant)':'Target'}, inplace=True)

In [None]:
# Some 'Bare Nuclei' values are the placeholder '?'; convert them to NaN.
def changeVal(val):
    """Map the '?' placeholder to NaN and pass every other value through.

    Parameters
    ----------
    val : object
        A single cell value.

    Returns
    -------
    object
        ``float('nan')`` when ``val == '?'``, otherwise ``val`` unchanged.
    """
    return float('nan') if val == '?' else val
    
def toCorrect(df, feature_name):
    """Return column ``feature_name`` of ``df`` with ``changeVal`` applied to each value.

    ``df`` is not modified here; the caller assigns the returned Series back.
    """
    column = df[feature_name]
    return column.apply(changeVal)

In [None]:
df['Bare Nuclei'] = toCorrect(df, 'Bare Nuclei')

In [None]:
# Report how many 'Bare Nuclei' entries are missing before imputing them.
print(df['Bare Nuclei'].isna().sum())
# Impute with the column mean and assign the result back to the frame.
# Calling fillna(..., inplace=True) on df['Bare Nuclei'] acts on a column
# selection; under pandas copy-on-write that does not update df itself,
# which would leave the NaNs in place for the models below.
df['Bare Nuclei'] = df['Bare Nuclei'].fillna(df['Bare Nuclei'].mean())

In [None]:
def oneOrZero(val):
    """Encode a class label as binary: 1 when ``val == 4``, otherwise 0."""
    return 1 if val == 4 else 0
    
def toBool(df, feature_name):
    """Return column ``feature_name`` of ``df`` encoded element-wise with ``oneOrZero``.

    ``df`` is not modified here; the caller assigns the returned Series back.
    """
    labels = df[feature_name]
    return labels.apply(oneOrZero)

In [None]:
df['Target'] = toBool(df, 'Target')

In [None]:
print(df.loc[df.Target == 1, 'Target'].count())
print(df.loc[df.Target == 0, 'Target'].count())##unbalanced data

In [None]:
y = pd.DataFrame(df['Target'])
df.drop(columns = 'Target', inplace=True)

In [None]:
df.drop(columns = 'Sample code number', inplace=True)

In [None]:
# Hold out 20% of the rows for testing. The classes are unbalanced, so
# stratify on the target to keep the benign/malignant ratio the same in both
# splits, and fix the seed so every run evaluates all models on the same split.
x_train, x_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
x_train.shape, x_test.shape

#### Logistic Regression

In [None]:
clf = LogisticRegression()
clf.fit(x_train, y_train)
y_pred1 = clf.predict(x_test)

In [None]:
clf.score(x_test, y_test)

In [None]:
accuracy_score(y_test, y_pred1)

In [None]:
recall_score(y_test, y_pred1)

In [None]:
f1_score(y_test, y_pred1)

In [None]:
mat = confusion_matrix(y_test, y_pred1)

In [None]:
print("Number of False Negatives: ",mat[1][0])
print("Number of False Positives: ",mat[0][1])

#### Random Forest Classifier

In [None]:
clf1 = RandomForestClassifier()

In [None]:
clf1.fit(x_train, y_train)
y_pred2=clf1.predict(x_test)

In [None]:
accuracy_score(y_test, y_pred2)

In [None]:
recall_score(y_test, y_pred2)

In [None]:
f1_score(y_test, y_pred2)

In [None]:
mat = confusion_matrix(y_test, y_pred2)

In [None]:
print("Number of False Negatives: ",mat[1][0])
print("Number of False Positives: ",mat[0][1])

In [None]:
clf1.feature_importances_

#### Gradient Boosting

In [None]:
# Gradient-boosted trees via the scikit-learn wrapper of XGBoost.
# `nrounds` is the R-package name for the number of boosting rounds; the
# Python wrapper calls it `n_estimators`, so the intended 50 rounds were not
# being applied. `learning_rate` is the wrapper's name for `eta`.
clf2 = XGBClassifier(max_depth=3, learning_rate=0.3, gamma=1, n_estimators=50)
clf2.fit(x_train, y_train)

In [None]:
y_pred3 = clf2.predict(x_test)
y_pred3 = [round(val) for val in y_pred3]

In [None]:
accuracy_score(y_test, y_pred3)

In [None]:
recall_score(y_test, y_pred3)

In [None]:
f1_score(y_test, y_pred3)

In [None]:
mat = confusion_matrix(y_test, y_pred3)

In [None]:
print("Number of False Negatives: ",mat[1][0])
print("Number of False Positives: ",mat[0][1])

#### SVM

In [None]:
clf3 = svm.SVC()
clf3.fit(x_train, y_train)

In [None]:
y_pred4 = clf3.predict(x_test)

In [None]:
accuracy_score(y_test, y_pred4)

In [None]:
recall_score(y_test, y_pred4)

In [None]:
f1_score(y_test, y_pred4)

In [None]:
mat = confusion_matrix(y_test, y_pred4)

In [None]:
print("Number of False Negatives: ",mat[1][0])
print("Number of False Positives: ",mat[0][1])

#### NAIVE BAYES

In [None]:
clf4 = GaussianNB()
clf4.fit(x_train, y_train)

In [None]:
y_pred5 = clf4.predict(x_test)

In [None]:
clf4.score(x_test, y_test)

In [None]:
accuracy_score(y_test, y_pred5)

In [None]:
recall_score(y_test, y_pred5)

In [None]:
f1_score(y_test, y_pred5)

In [None]:
mat = confusion_matrix(y_test, y_pred5)

In [None]:
print("Number of False Negatives: ",mat[1][0])
print("Number of False Positives: ",mat[0][1])

## ANALYSIS

Since the dataset is about predicting breast cancer, false negatives matter more than false positives: a missed malignant case is far more costly than a false alarm. I therefore judge the suitability of each model/algorithm for this dataset primarily by its recall, and prefer models with a higher recall value.

#### VISUALIZATION

In [None]:
accuracy_score_y = [accuracy_score(y_test, y_pred1)*100, accuracy_score(y_test, y_pred2)*100, accuracy_score(y_test, y_pred3)*100, accuracy_score(y_test, y_pred4)*100, accuracy_score(y_test, y_pred5)*100]
f1_score_y = [f1_score(y_test, y_pred1)*100, f1_score(y_test, y_pred2)*100, f1_score(y_test, y_pred3)*100, f1_score(y_test, y_pred4)*100, f1_score(y_test, y_pred5)*100]
recall_score_y = [recall_score(y_test, y_pred1)*100, recall_score(y_test, y_pred2)*100, recall_score(y_test, y_pred3)*100, recall_score(y_test, y_pred4)*100, recall_score(y_test, y_pred5)*100]

In [None]:
x_labels = ['LR', 'RF', 'GB', 'SVM', 'NB']
#LR-Logistic Regression RF-Random Forest GB-Gradient Boosting SVM-Support Vector Machines NB-Naive Bayes

In [None]:
# Bar chart of test accuracy (%) per model.
plt.ylabel('Accuracy Score(%)')
plt.xlabel('Models')
plt.bar(x_labels,accuracy_score_y)
# Call show(): a bare `plt.show` only references the function and renders nothing.
plt.show()

In [None]:
# Bar chart of test F1 score (%) per model.
plt.ylabel('F1 Score(%)')
plt.xlabel('Models')
plt.bar(x_labels,f1_score_y)
# Call show(): a bare `plt.show` only references the function and renders nothing.
plt.show()

In [None]:
# Bar chart of test recall (%) per model.
plt.ylabel('Recall Score(%)')
plt.xlabel('Models')
plt.bar(x_labels,recall_score_y)
# Call show(): a bare `plt.show` only references the function and renders nothing.
plt.show()

In [None]:
plt.ylabel('Feature Importance(As per Gradient Boosting)')
plt.xlabel('Features')
x_label = ['CT', 'UCS', 'UCSh', 'MA', 'SEC', 'BN', 'BC','NN', 'MI']
plt.bar(x_label, clf2.feature_importances_)
plt.show()

In [None]:
'''CT - Clump Thickness
UCS - Uniformity of Cell Size
UCSh - Uniformity of Cell Shape
MA - Marginal Adhesion
SEC - Single Epithelial Cell Size
BN - Bare Nuclei
BC - Bland Chromatin
NN - Normal Nucleoli
MI - Mitoses'''

### Tuning the model further to increase sensitivity towards false negatives

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer

In [None]:
clf = RandomForestClassifier(n_jobs = -1)


In [None]:
param_grid = {
    'min_samples_split' : [3,5],
    'n_estimators' : [100, 200],
    'max_depth' : [3,5,15],
    'max_features' : [3,5,7]
}

In [None]:
scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}

In [None]:
def grid_search_wrapper(refit_score='recall_score'):
    """Grid-search the classifier and print its test-set confusion matrix.

    Runs a 10-fold stratified ``GridSearchCV`` over the module-level
    ``param_grid`` for the module-level ``clf``, scoring each candidate with
    every metric in ``scorers`` and refitting the best one according to
    ``refit_score``. The refit model is then evaluated on the module-level
    ``x_test`` / ``y_test``.

    Parameters
    ----------
    refit_score : str, default 'recall_score'
        Key of ``scorers`` used to select the best parameters and refit.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    skf = StratifiedKFold(n_splits=10)
    grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score,
                               cv=skf, return_train_score=True, n_jobs=-1)
    # y_train is built as a single-column DataFrame; flatten it to 1-D so the
    # estimator gets the (n_samples,) target it expects instead of emitting a
    # column-vector conversion warning on every fit of the search.
    grid_search.fit(x_train, np.ravel(y_train))

    y_pred = grid_search.predict(x_test)

    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score))
    # Pin the label order so the matrix is always 2x2 (neg=0, pos=1), even if
    # one class happens to be absent from y_test or the predictions.
    print(pd.DataFrame(confusion_matrix(y_test, y_pred, labels=[0, 1]),
                       columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))
    return grid_search

In [None]:
grid_search_clf = grid_search_wrapper(refit_score='precision_score')

In [None]:
results = pd.DataFrame(grid_search_clf.cv_results_)
results = results.sort_values(by='mean_test_precision_score', ascending=False)
results[['mean_test_precision_score', 'mean_test_recall_score', 'mean_test_accuracy_score', 'param_max_depth', 
         'param_max_features', 'param_min_samples_split', 'param_n_estimators']].round(3).head()

In [None]:
grid_search_clf = grid_search_wrapper(refit_score='recall_score')

In [None]:
# Cross-validation results of the recall-optimised search, best first.
results = pd.DataFrame(grid_search_clf.cv_results_)
# This search was refit for recall, so rank candidates by mean test recall
# (sorting by precision here just repeats the ordering of the precision run).
results = results.sort_values(by='mean_test_recall_score', ascending=False)
results[['mean_test_precision_score', 'mean_test_recall_score', 'mean_test_accuracy_score', 'param_max_depth', 
         'param_max_features', 'param_min_samples_split', 'param_n_estimators']].round(3).head()

In [None]:
# The results in both cases are almost the same, so we use precision_recall_curve
# and roc_curve to better analyse what the operating point should be.

In [None]:
y_scores = grid_search_clf.predict_proba(x_test)[:, 1]

In [None]:
p, r, thresholds = precision_recall_curve(y_test, y_scores)

In [None]:
p, r, thresholds

In [None]:
def adjusted_classes(y_scores, t):
    """Turn positive-class scores into 0/1 labels using threshold ``t``.

    A score greater than or equal to ``t`` becomes 1, anything else 0.
    Binary classification only.

    Returns
    -------
    list of int
        One label per entry of ``y_scores``, in the same order.
    """
    return [int(score >= t) for score in y_scores]

def precision_recall_threshold(p, r, thresholds, t=0.5):
    """Print the false-negative and false-positive counts at threshold ``t``.

    The module-level ``y_scores`` are converted to labels with
    ``adjusted_classes`` and compared against the module-level ``y_test``.

    Parameters
    ----------
    p, r, thresholds
        Accepted but not used by this function.
    t : float, default 0.5
        Decision threshold applied to ``y_scores``.
    """
    adjusted = adjusted_classes(y_scores, t)
    cm = confusion_matrix(y_test, adjusted)
    # Rows are true classes, columns are predicted classes.
    false_negatives = cm[1][0]
    false_positives = cm[0][1]
    print("Number of False Negatives: ", false_negatives)
    print("Number of False Positives: ", false_positives)

In [None]:
precision_recall_threshold(p, r, thresholds, 0.30)

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold on one figure.

    Parameters
    ----------
    precisions, recalls : sequence
        Score arrays; the last element of each is dropped before plotting
        (presumably so their lengths match ``thresholds`` -- confirm against
        the caller's ``precision_recall_curve`` output).
    thresholds : sequence
        Decision thresholds used as the x axis.
    """
    plt.figure(figsize=(8, 8))
    plt.title("Precision and Recall Scores as a function of the decision threshold")
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_format, name in curves:
        plt.plot(thresholds, values[:-1], line_format, label=name)
    plt.ylabel("Score")
    plt.xlabel("Decision Threshold")
    plt.legend(loc='best')

In [None]:
plot_precision_recall_vs_threshold(p, r, thresholds)