In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import scatter_matrix
from scipy import stats
from sklearn import naive_bayes
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, precision_score, recall_score, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for carpeta, _, archivos in os.walk('/kaggle/input'):
    for archivo in archivos:
        print(os.path.join(carpeta, archivo))

In [None]:
# Read the data set from the Kaggle input directory and display the resulting frame.
ruta_datos = "/kaggle/input/breast-cancer-wisconsin-data/data.csv"
datos = pd.read_csv(ruta_datos)
datos

In [None]:
# Number of distinct values in the 'id' column (missing values, if any, count as one).
datos['id'].nunique(dropna=False)

In [None]:
# Summary of the raw frame: column names, dtypes and non-null counts.
datos.info()

In [None]:
# Count of missing values in each column.
datos.isna().sum()

In [None]:
# Remove the 'Unnamed: 32' column. Reassigning (instead of inplace=True) together
# with errors='ignore' keeps the cell safe to re-run: a second execution is a no-op
# rather than a KeyError.
datos = datos.drop(columns='Unnamed: 32', errors='ignore')

In [None]:
# Remove the 'id' column so it is not used as a feature. Reassigning (instead of
# inplace=True) together with errors='ignore' keeps the cell safe to re-run: a
# second execution is a no-op rather than a KeyError.
datos = datos.drop(columns='id', errors='ignore')

In [None]:
# Descriptive statistics of the columns that remain after the drops above.
datos.describe()

In [None]:
# Correlation heatmap over the first 31 columns of the frame.
# 'diagnosis' is still compared against the string 'M' in a later cell, i.e. it is
# not numeric yet at this point. DataFrame.corr() in pandas >= 2.0 raises on
# non-numeric columns instead of silently skipping them, so the matrix is computed
# on the numeric columns only.
datos2 = datos.iloc[:, 0:31]
plt.figure(figsize=(18, 9))
sns.heatmap(datos2.select_dtypes(include='number').corr(), annot=True, cmap="Accent_r")

In [None]:
# Class balance of the target before it is encoded.
print(datos['diagnosis'].value_counts())

In [None]:
# Encode the target as 1 for 'M' and 0 for every other value.
# The guard makes the cell safe to re-run: once the column is numeric, comparing it
# with 'M' again would be False everywhere and silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(datos['diagnosis']):
    datos['diagnosis'] = (datos['diagnosis'] == 'M').astype(int)

In [None]:
# Class balance of the target after the 0/1 encoding.
print(datos['diagnosis'].value_counts())

In [None]:
# Absolute correlation of every column with the target, strongest first,
# rendered with a colour gradient.
corr = datos.corr().abs()
ranking_diagnosis = corr[['diagnosis']].sort_values(by='diagnosis', ascending=False)
ranking_diagnosis.style.background_gradient()

In [None]:
# Features removed before modelling (presumably the ones ranked lowest in the
# correlation table above -- confirm against that output).
columnas_descartadas = [
    'smoothness_mean',
    'symmetry_mean',
    'fractal_dimension_worst',
    'compactness_se',
    'concavity_se',
    'fractal_dimension_se',
    'smoothness_se',
    'fractal_dimension_mean',
    'texture_se',
    'symmetry_se',
]

# One drop call instead of ten in-place ones. errors='ignore' makes the cell
# idempotent: re-running it after the columns are gone no longer raises KeyError.
datos = datos.drop(columns=columnas_descartadas, errors='ignore')

In [None]:
# Columns whose distribution is inspected below.
lista_variables = [
    'concave points_worst',
    'perimeter_worst',
    'concave points_mean',
    'radius_worst',
    'perimeter_mean',
    'area_worst',
    'radius_mean',
    'area_mean',
    'concavity_mean',
    'concavity_worst',
    'compactness_mean',
    'compactness_worst',
    'radius_se',
    'perimeter_se',
    'area_se',
    'texture_worst',
    'smoothness_worst',
    'symmetry_worst',
    'texture_mean',
    'concave points_se',
]

# One histogram per listed column, titled with the column name.
for variable in lista_variables:
    plt.hist(datos[variable])
    plt.title(variable)
    plt.show()

In [None]:
# The target variable is stored in "y"; `datos` keeps only the feature columns.
# The membership check makes the cell safe to re-run: once 'diagnosis' has been
# removed, a second execution leaves `y` and `datos` untouched instead of raising
# KeyError.
if 'diagnosis' in datos.columns:
    y = datos['diagnosis']
    datos = datos.drop(columns='diagnosis')

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    datos,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
def saca_metricas(y1, y2, y_score=None):
    """Print a set of classification metrics and draw the ROC curve.

    Prints, in order, the confusion matrix, accuracy, precision, recall, F1 and
    AUC (each preceded by its Spanish label), then plots the ROC curve in blue
    against a red dashed diagonal on the current matplotlib axes.

    Parameters
    ----------
    y1 : array-like
        Reference labels (first argument of every metric call).
    y2 : array-like
        Predicted labels (second argument of every metric call).
    y_score : array-like, optional
        Continuous scores to build the ROC curve from, e.g. the positive-class
        column of ``predict_proba``. When omitted, ``y2`` is used, which keeps
        the previous behaviour: with hard class predictions the curve has a
        single operating point, so the AUC is coarser than a score-based one.

    Returns
    -------
    None
    """
    print('matriz de confusión')
    print(confusion_matrix(y1, y2))
    print('exactitud')
    print(accuracy_score(y1, y2))
    print('precision')
    print(precision_score(y1, y2))
    print('sensibilidad')
    print(recall_score(y1, y2))
    print('f1')
    print(f1_score(y1, y2))

    # Fall back to the hard predictions when no scores are supplied.
    roc_input = y2 if y_score is None else y_score
    false_positive_rate, recall, _ = roc_curve(y1, roc_input)
    roc_auc = auc(false_positive_rate, recall)
    print('AUC')
    print(roc_auc)

    plt.plot(false_positive_rate, recall, 'b')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlabel('tasa de falsos positivos')
    plt.ylabel('sensibilidad')
    plt.title('AUC = %0.2f' % roc_auc)

Logistic Regression

In [None]:
# Logistic regression on standardised features. Scaler and model live in a single
# pipeline, so the scaler is fitted only on the data handed to fit().
# (make_pipeline is imported in the notebook's import cell rather than here, so the
# cell no longer hides a dependency in the middle of the notebook.)
classifier = make_pipeline(StandardScaler(), LogisticRegression())
classifier.fit(x_train, y_train)

In [None]:
# Predicted labels of the logistic-regression pipeline for the held-out rows.
y_pred  = classifier.predict(x_test)

In [None]:
# Report metrics and ROC curve for the predictions stored in y_pred.
saca_metricas(y_test, y_pred)

Decision Tree

In [None]:
# Decision tree with default settings, fitted on the unscaled training features.
# random_state is pinned so that re-running the cell reproduces the same tree (and
# therefore the same metrics); 42 matches the seed used for the train/test split.
dt_classifier = DecisionTreeClassifier(random_state = 42).fit(x_train, y_train)
y_pred        = dt_classifier.predict(x_test)

In [None]:
# Report metrics and ROC curve for the predictions stored in y_pred.
saca_metricas(y_test, y_pred)

In [None]:
# Same decision tree, limited to depth 4. random_state is pinned so that the
# comparison with the unconstrained tree does not change from one run to the next.
dt_classifier = DecisionTreeClassifier(max_depth = 4, random_state = 42).fit(x_train, y_train)
y_pred        = dt_classifier.predict(x_test)
saca_metricas(y_test, y_pred)

Random Forest

In [None]:
# Random forest with default settings. Without a fixed random_state every run builds
# a different forest, so the reported metrics would not be reproducible.
rf_classifier = RandomForestClassifier(random_state = 42).fit(x_train, y_train)
y_pred        = rf_classifier.predict(x_test)
saca_metricas(y_test, y_pred)

In [None]:
# Random forest limited to depth 5, with a fixed seed so re-runs give the same forest.
# NOTE: despite its "dt_" prefix, dt_classifier is rebound here to a random forest,
# replacing the decision tree from the cells above. The "# Random Forest"
# cross-validation cell further down reads dt_classifier, so the name is kept.
dt_classifier = RandomForestClassifier(max_depth = 5, random_state = 42).fit(x_train, y_train)
y_pred        = dt_classifier.predict(x_test)
saca_metricas(y_test, y_pred)

SVM

In [None]:
# SVC is sensitive to the scale of its inputs and x_train is the unscaled feature
# frame, so the features are standardised inside a pipeline (the same approach the
# logistic-regression cell uses) instead of being fed to the SVM as they are.
svm_classifier = make_pipeline(StandardScaler(), SVC()).fit(x_train, y_train)
y_pred         = svm_classifier.predict(x_test)

saca_metricas(y_test, y_pred)

Naive Bayes

In [None]:
# Gaussian naive Bayes: build the model, fit it on the training split, then score
# its predictions for the held-out rows.
nb_classifier = naive_bayes.GaussianNB()
nb_classifier.fit(x_train, y_train)

y_pred = nb_classifier.predict(x_test)
saca_metricas(y_test, y_pred)

Grid Search

In [None]:
# Hyper-parameter grid explored by the random-forest grid search.
grid_param = dict(
    n_estimators=[100, 300, 500, 800, 1000],
    criterion=['gini', 'entropy'],
    bootstrap=[True, False],
)

In [None]:
# 5-fold grid search over grid_param, scored by accuracy and run on all cores.
# The base forest gets a fixed random_state so that score differences between
# candidates come from the hyper-parameters rather than from different random draws.
model_grid = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)

In [None]:
# Tune on the training split only. Fitting the search on the full data set would let
# the held-out test rows influence the chosen hyper-parameters, which makes the
# later evaluation on x_test optimistic (data leakage).
model_grid.fit(x_train, y_train)

In [None]:
# Hyper-parameter combination with the best mean cross-validated score.
print(model_grid.best_params_)

In [None]:
# Random forest built from the hyper-parameters the grid search actually selected,
# instead of values copied by hand from one particular run (which go stale as soon
# as the search is re-executed). The seed is fixed so the metrics are reproducible.
# The "dt_" prefix is historical; a later cross-validation cell reads this name.
dt_classifier_grid = RandomForestClassifier(random_state = 42, **model_grid.best_params_).fit(x_train, y_train)
y_pred        = dt_classifier_grid.predict(x_test)
saca_metricas(y_test, y_pred)

In [None]:
# Mean cross-validated accuracy of the best hyper-parameter combination.
print(model_grid.best_score_)

Cross Validation test on the finalist models:

In [None]:
# Logistic Regression: 5-fold cross-validation of the scaling + regression pipeline
# on the training split.
results = cross_val_score(
    estimator=classifier,
    X=x_train,
    y=y_train,
    cv=5,
)

In [None]:
# Per-fold scores of the cross-validation run in the previous cell.
results

In [None]:
# Mean and standard deviation of the fold scores, one per line.
print(results.mean(), results.std(), sep='\n')

In [None]:
# Random Forest: when the notebook is run top to bottom, dt_classifier is the
# max_depth=5 random forest at this point (it was rebound in the Random Forest
# section), not a decision tree.
results = cross_val_score(
    estimator=dt_classifier,
    X=x_train,
    y=y_train,
    cv=5,
)

In [None]:
# Per-fold scores of the cross-validation run in the previous cell.
results

In [None]:
# Mean and standard deviation of the fold scores, one per line.
print(results.mean(), results.std(), sep='\n')

In [None]:
# Grid Search - Random Forest: 5-fold cross-validation of the tuned forest on the
# training split.
results = cross_val_score(
    estimator=dt_classifier_grid,
    X=x_train,
    y=y_train,
    cv=5,
)

In [None]:
# Per-fold scores of the cross-validation run in the previous cell.
results

In [None]:
# Mean and standard deviation of the fold scores, one per line.
print(results.mean(), results.std(), sep='\n')

Winning model predictions

In [None]:
# Final model: a random forest fitted on all labelled rows, using the
# hyper-parameters selected by the grid search rather than values copied by hand
# from one run, and a fixed seed so the fitted model is reproducible.
modelo_final = RandomForestClassifier(random_state = 42, **model_grid.best_params_)
modelo_final.fit(datos, y)

In [None]:
# Predicted labels of the final model for every row of `datos`.
# NOTE(review): these are the same rows the model was fitted on in the previous
# cell, so the predictions are in-sample and say nothing about generalisation.
predicciones_final = modelo_final.predict(datos)

In [None]:
# Single-column frame with the final predictions, with the numeric classes replaced
# by their Spanish labels (1 -> 'maligno', 0 -> 'benigno').
predicciones = (
    pd.Series(predicciones_final, name='diagnosis')
    .to_frame()
    .replace({'diagnosis': {1: 'maligno', 0: 'benigno'}})
)
predicciones