In [None]:
# Installing dependencies
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install lazypredict

In [None]:
# Import the modules
import numpy as np
import pandas as pd
import matplotlib .pyplot
import seaborn as sns

In [None]:
# Installing the kaggle library
! pip install kaggle
#Making a directory 
! mkdir ~/.kaggle
# copy the "kaggle.json" 
!cp kaggle.json ~/.kaggle/
# allocating the required permision for this file
! chmod 600 ~/.kaggle/kaggle.json


In [None]:
# Download the uciml/breast-cancer-wisconsin-data dataset archive with the Kaggle CLI
! kaggle datasets download uciml/breast-cancer-wisconsin-data

In [None]:
!unzip /content/breast-cancer-wisconsin-data.zip

In [None]:
# Load the extracted CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='/content/data.csv')

In [None]:
# Preview the first five rows
df.head(n=5)

In [None]:
# Total number of rows and columns, shown as a (rows, columns) tuple
df.shape

In [None]:
# Print a summary of the frame: column names, non-null counts and data types
df.info()

In [None]:
# Count the missing values in every column
df.isna().sum()

In [None]:
# Drop only the columns in which every value is missing.
# how='all' matches that intent; the default (how='any') would also discard
# a feature column that contains just a single NaN.
df = df.dropna(axis=1, how='all')

In [None]:
# Re-check the (rows, columns) shape after dropping columns
df.shape

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Frequency of each distinct value in the 'diagnosis' column
df.loc[:, 'diagnosis'].value_counts()

In [None]:
# Count plot (bar per class) of the number of cases for each diagnosis.
# The column is selected by keyword: from seaborn 0.12 on, the first positional
# argument is `data` and x/y are keyword-only, so passing the Series
# positionally no longer plots its value counts.
sns.countplot(x='diagnosis', data=df, label='count')

In [None]:
# Encoder used to turn the categorical target into 0 and 1 (Y values only)
from sklearn.preprocessing import LabelEncoder
LabelEncoder_Y = LabelEncoder()

In [None]:
# Categorical to numerical: encode the 'diagnosis' column (column 1) as integer codes.
# Assigning by column label replaces the whole column, so it takes the encoder's
# integer dtype. `df.iloc[:, 1] = ...` writes into the existing object-dtype
# column on recent pandas, leaving Python objects that scikit-learn classifiers
# can reject as an unknown label type.
df['diagnosis'] = LabelEncoder_Y.fit_transform(df['diagnosis'])

In [None]:
# Show the contents of column 1 ("diagnosis"), which now holds 0 and 1
df.iloc[:, 1].to_numpy()

In [None]:
# Pair plot of columns 1-6 (diagnosis plus the next five columns), coloured by diagnosis
sns.pairplot(data=df.iloc[:, 1:7], hue='diagnosis')

In [None]:
# Heatmap of the pairwise correlations among columns 1-10, drawn with the
# 'YlGnBu' colour map; every cell is annotated with its value as a whole-number percentage.
correlations = df.iloc[:, 1:11].corr()
sns.heatmap(correlations, annot=True, fmt='.0%', cmap='YlGnBu')

In [None]:
# Split the data set into independent and dependent variables
# Independent = X: every column from position 2 onward.
# The open-ended slice keeps the final feature column; the earlier `2:31` slice
# stopped one column short (assuming the 32-column frame this dataset yields
# once the empty column is dropped -- confirm with df.shape).
X = df.iloc[:, 2:].values
# Dependent = Y: column 1 ("diagnosis")
Y = df.iloc[:, 1].values

In [None]:
# Hold out 20% of the samples for testing (80:20 split) with a fixed random seed
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Shapes of the training split (features, labels) and of the test split
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)  # previously printed Y_train a second time

In [None]:
# Preprocessing: standardize features by removing the mean and scaling to unit variance.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Learn the mean and variance from the training data only ...
X_train = sc.fit_transform(X_train)
# ... and apply those same statistics to the test data. Re-fitting on the test
# set would scale it differently from the data the models are trained on.
X_test = sc.transform(X_test)

In [None]:
# Fit and compare many classifiers in one call with lazypredict
import lazypredict
from lazypredict.Supervised import LazyClassifier
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
LP_models, predictions = clf.fit(X_train, X_test, Y_train, Y_test)
LP_models

In [None]:
# Print the lazypredict results as plain text
print(LP_models)

In [None]:
# Bar chart of the "Accuracy" column of LP_models, one bar per model;
# the x axis uses the model names held in the index of LP_models.
# To change the colours see https://seaborn.pydata.org/tutorial/color_palettes.html
# Reference notebook: https://github.com/dataprofessor/python/blob/main/lazypredict.ipynb

import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
sns.set_theme(style='whitegrid')
ax = sns.barplot(data=LP_models, x=LP_models.index, y="Accuracy", palette='flare')
# Rotate the model names so they do not overlap
plt.xticks(rotation=90)

In [None]:
# Creation of the three types of model
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


# Assumes binary yes/no classification response
def models(X_train, Y_train):
  """Fit three classifiers on the training data and print their training accuracy.

  Args:
    X_train: Feature matrix used to fit the models.
    Y_train: Target labels matching the rows of ``X_train``.

  Returns:
    A tuple of the fitted LogisticRegression, DecisionTreeClassifier and
    RandomForestClassifier, in that order.
  """
  log = LogisticRegression(random_state=0)
  log.fit(X_train, Y_train)

  # Named decision_tree so it does not shadow the `sklearn.tree` module imported above
  decision_tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
  decision_tree.fit(X_train, Y_train)

  forest = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
  forest.fit(X_train, Y_train)

  # score() is evaluated on the same data the models were fitted on, so these
  # figures are training accuracies, not performance on unseen data.
  print('The accuracy of the Logistic Regression: ', log.score(X_train, Y_train))
  print('The accuracy of the Decision Tree: ', decision_tree.score(X_train, Y_train))
  print('The accuracy of the Random Forest: ', forest.score(X_train, Y_train))

  return log, decision_tree, forest


In [None]:
# Train the models; `model` is the returned tuple of fitted classifiers
# (logistic regression, decision tree, random forest, in that order)
model = models(X_train=X_train, Y_train=Y_train)

In [None]:
# Confusion matrix of the first model (model[0]) on the test set, to see how
# accurate its predictions are.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, model[0].predict(X_test))
# scikit-learn lays the matrix out with rows = true class and columns =
# predicted class, classes in sorted order, so for 0/1 labels it reads
# [[tn, fp], [fn, tp]]. The previous code had tp and tn swapped.
tn, fp, fn, tp = cm.ravel()
print(cm)
print('Accuracy: ', (tp+tn)/(tp+tn+fp+fn))

In [None]:
# Alternative evaluation: classification report and accuracy score of every
# trained model on the test set
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

for i, fitted in enumerate(model):
  # Predict once and reuse the result for both metrics
  test_predictions = fitted.predict(X_test)
  print('Model: ',i)
  print(classification_report(Y_test, test_predictions))
  print(accuracy_score(Y_test, test_predictions))
  print()

In [None]:
# The model's predictions versus the actual values (uses the third model, model[2])
pred = model[2].predict(X_test)
print('Our model prediction: ', pred, sep='\n')
print()
print('Actual prediction: ', Y_test, sep='\n')

In [None]:
# Report, sample by sample, whether the prediction equals the true label
for predicted, actual in zip(pred, Y_test):
  print("Correct" if predicted == actual else "False")
