In [None]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix



%matplotlib inline

#  Cancer  Data Set  

### Breast Cancer Wisconsin (Diagnostic) Data Set

**Abstract:** Diagnostic Wisconsin Breast Cancer Database
Source:

Creators:

1. Dr. William H. Wolberg, General Surgery Dept.
University of Wisconsin, Clinical Sciences Center
Madison, WI 53792
wolberg '@' eagle.surgery.wisc.edu

2. W. Nick Street, Computer Sciences Dept.
University of Wisconsin, 1210 West Dayton St., Madison, WI 53706
street '@' cs.wisc.edu 608-262-6619

3. Olvi L. Mangasarian, Computer Sciences Dept.
University of Wisconsin, 1210 West Dayton St., Madison, WI 53706
olvi '@' cs.wisc.edu

### Información del conjunto de datos:

Las características se calculan a partir de una imagen digitalizada de una punción-aspiración con aguja fina (PAAF) de una masa mamaria. Describen las características de los núcleos celulares presentes en la imagen (más detalles en http://pages.cs.wisc.edu/~street/images/).

Attribute Information:

1. ID number
2. Diagnosis (M = malignant, B = benign) 

**Ten real-valued features are computed for each cell nucleus:**

a) radius (mean of distances from center to points on the perimeter)

b) texture (standard deviation of gray-scale values)

c) perimeter

d) area

e) smoothness (local variation in radius lengths)

f) compactness (perimeter^2 / area - 1.0)

g) concavity (severity of concave portions of the contour)

h) concave points (number of concave portions of the contour)

i) symmetry

j) fractal dimension ("coastline approximation" - 1)

In [None]:
# Read the Breast Cancer Wisconsin (Diagnostic) CSV into a DataFrame.
cancer_df = pd.read_csv(filepath_or_buffer='../../data/06_breast-cancer-wisconsin-data.csv')

# Summarise the loaded frame: column names, dtypes and non-null counts.
cancer_df.info()

## Preparamos Datos

In [None]:
# Encode the textual `diagnosis` label as integers in a new `diagnosis_cod`
# column. The fitted encoder `le` is kept so integer codes can be mapped back
# to the original labels with `le.inverse_transform`.
# NOTE: this cell rebinds `cancer_df` without the 'diagnosis' column, so
# running it a second time on the same kernel raises a KeyError.
le = preprocessing.LabelEncoder()
cancer_df = (
    cancer_df
    .assign(diagnosis_cod=le.fit_transform(cancer_df['diagnosis']))
    # Remove the identifier, the raw label and the 'Unnamed: 32' column.
    .drop(columns=['Unnamed: 32', 'id', 'diagnosis'])
)
cancer_df

In [None]:
# Split the prepared frame into the label vector (target) and the feature
# matrix (every remaining column).
cancer_target = cancer_df['diagnosis_cod']
cancer_data = cancer_df.drop(columns=['diagnosis_cod'])

In [None]:
# Hold out 33% of the cancer samples for testing.
# - shuffle must be a real boolean: recent scikit-learn releases validate the
#   argument and reject None.
# - a fixed random_state makes the split, and therefore every accuracy figure
#   and plot computed from it, reproducible under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(cancer_data, cancer_target,
                                                    test_size=0.33,
                                                    random_state=42,
                                                    shuffle=True)

# Sanity-check the sizes of both partitions.
print('Set de datos para Entrenamiento =',len(X_train))
print('Set de datos para Test',len(X_test))
print('Total',len(X_test)+len(X_train))
print('Data Shape=',X_test.shape)
print('Target Shape =',y_test.shape)

# Preview the first training rows.
X_train.head()

# ======== Knn Cancer Data Set =========

In [None]:
# Candidate values of k: 1 through 19 (the upper bound of range is exclusive).
neighbors_settings = range(1,20)

# Accuracy on each split, one entry per candidate k.
training_accuracy, test_accuracy = [], []

# Fit one KNN model per k and record how it scores on both splits.
# After the loop `clf` is the model fitted with the last k.
for n_neighbors in neighbors_settings:
    clf = KNeighborsClassifier(weights='uniform',
                               algorithm='auto',
                               n_neighbors=n_neighbors).fit(X_train, y_train)
    training_accuracy += [clf.score(X_train, y_train)]
    test_accuracy += [clf.score(X_test, y_test)]

# Plot both accuracy curves against k to help choose n_neighbors.
plt.plot(neighbors_settings, training_accuracy, marker='o', label='Accuracy of the training set')
plt.plot(neighbors_settings, test_accuracy, marker='o', label='Accuracy of the test set')
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.legend()


In [None]:
# Fit the cancer classifier with k=5 and report its accuracy on both splits.
clf = KNeighborsClassifier(algorithm='auto', weights='uniform', n_neighbors=5).fit(X_train, y_train)
print("Score with data Tes", clf.score(X_test, y_test))
print("Score with data Train", clf.score(X_train, y_train))

#  Alguna Predicción 

In [None]:
# Pick one sample and compare its true label with the classifier's output.
# NOTE(review): row 78 is taken from the full data set, so it may be a row the
# model was trained on.
ind = 78
print(cancer_data.iloc[ind])
print('specie',cancer_target.iloc[ind], le.inverse_transform(cancer_target.iloc[[ind]]))

# Keep the sample as a one-row DataFrame (double brackets) so the column names
# travel with it. The classifier was fitted on a DataFrame, and handing it a
# bare array is what previously required silencing every warning globally with
# warnings.filterwarnings('ignore').
x_new = cancer_data.iloc[[ind]]

print('\n======== PREDICTION ========')
prediction = clf.predict(x_new)
prediction_pb = clf.predict_proba(x_new)
# Map the predicted integer code back to its original label with `le`.
print('Specie prediction',prediction, le.inverse_transform(prediction))
print('Probability Specie prediction',prediction_pb)

# Classification Report

In [None]:
# Predict the held-out test split and summarise the classifier's quality.
p = clf.predict(X_test)

print('Accuracy:', accuracy_score(y_true=y_test, y_pred=p))
print('\nConfusion Matrix:\n', confusion_matrix(y_true=y_test, y_pred=p))
print('\nClassification Report:', classification_report(y_true=y_test, y_pred=p))


Diabetes Data Set

Michael Kahn, MD, PhD, Universidad de Washington, St. Louis, MO
Información del conjunto de datos:

Los registros de pacientes con diabetes se obtuvieron de dos fuentes: a) un dispositivo de registro electrónico automático y b) registros en papel.

El dispositivo automático tenía un reloj interno para los eventos de marca de tiempo, mientras que los registros en papel solo proporcionaban espacios de "tiempo lógico" (desayuno, almuerzo, cena, hora de acostarse).

Para los registros en papel, se asignaron horarios fijos para el desayuno (08:00), el almuerzo (12:00), la cena (18:00) y la hora de acostarse (22:00). Por lo tanto, los registros en papel tienen tiempos de grabación uniformes ficticios, mientras que los registros electrónicos tienen marcas de tiempo más realistas.

Los archivos de diabetes constan de cuatro campos por registro. Cada campo está separado por una tabulación y cada registro está separado por una nueva línea.

Nombres de archivos y formato

(1) Fecha en formato MM-DD-YYYY (2) Hora en formato XX:YY (3) Código (4) Valor

El campo Código se descifra de la siguiente manera:

33 = Dosis de insulina regular

34 = dosis de insulina NPH

35 = dosis de insulina UltraLente

48 = medición de glucosa en sangre no especificada

57 = medición de glucosa en sangre no especificada

58 = medición de glucosa en sangre antes del desayuno

59 = medición de glucosa en sangre después del desayuno

60 = medición de glucosa en sangre antes del almuerzo

61 = medición de glucosa en sangre después del almuerzo

62 = medición de glucosa en sangre antes de la cena

63 = medición de glucosa en sangre después de la cena

64 = Medición de glucosa en sangre antes del aperitivo

65 = Síntomas de hipoglucemia

66 = Ingestión típica de comida

67 = Ingestión de comida más de lo habitual

68 = ingesta de comida menos de lo usual

69 = actividad típica de ejercicio

70 = Actividad de ejercicio más de lo habitual

71 = actividad de ejercicio menos de lo habitual


In [None]:
# Read the diabetes CSV into a DataFrame.
diabetes_df = pd.read_csv(filepath_or_buffer='../../data/06_diabetes.csv')

# Display the loaded frame.
diabetes_df

In [None]:
# Split the frame into the label vector (`diagnosis`) and the feature matrix
# (every remaining column).
diabetes_target = diabetes_df['diagnosis']
diabetes_data = diabetes_df.drop(columns=['diagnosis'])

In [None]:
# Hold out 33% of the diabetes samples for testing. This rebinds X_train,
# X_test, y_train and y_test, replacing the cancer split.
# - train_test_split is already imported in the notebook's import cell, so it
#   is not re-imported here.
# - shuffle must be a real boolean: recent scikit-learn releases validate the
#   argument and reject None.
# - a fixed random_state makes the split, and therefore every accuracy figure
#   and plot computed from it, reproducible under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(diabetes_data, diabetes_target,
                                                    test_size=0.33,
                                                    random_state=42,
                                                    shuffle=True)

# Sanity-check the sizes of both partitions.
print('Set de datos para Entrenamiento =',len(X_train))
print('Set de datos para Test',len(X_test))
print('Total',len(X_test)+len(X_train))
print('Data Shape=',X_test.shape)
print('Target Shape =',y_test.shape)

# Preview the first training rows.
X_train.head()

In [None]:
# Candidate values of k: 1 through 19 (the upper bound of range is exclusive).
neighbors_settings = range(1,20)

# Accuracy on each split, one entry per candidate k.
training_accuracy, test_accuracy = [], []

# Fit one KNN model per k and record how it scores on both splits.
# After the loop `clf` is the model fitted with the last k.
for n_neighbors in neighbors_settings:
    clf = KNeighborsClassifier(weights='uniform',
                               algorithm='auto',
                               n_neighbors=n_neighbors).fit(X_train, y_train)
    training_accuracy += [clf.score(X_train, y_train)]
    test_accuracy += [clf.score(X_test, y_test)]

# Plot both accuracy curves against k to help choose n_neighbors.
plt.plot(neighbors_settings, training_accuracy, marker='o', label='Accuracy of the training set')
plt.plot(neighbors_settings, test_accuracy, marker='o', label='Accuracy of the test set')
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.legend()

In [None]:
# Fit the diabetes classifier with k=15 and report its accuracy on both splits.
clf = KNeighborsClassifier(algorithm='auto', weights='uniform', n_neighbors=15).fit(X_train, y_train)
print("Score with data Tes", clf.score(X_test, y_test))
print("Score with data Train", clf.score(X_train, y_train))

# ========= Alguna Predicción ========

In [None]:
# Pick one sample and compare its true label with the classifier's output.
# NOTE(review): row 78 is taken from the full data set, so it may be a row the
# model was trained on.
ind = 78
print(diabetes_data.iloc[ind])
# The label is printed as stored. The only LabelEncoder in this notebook (`le`)
# was fitted on the cancer diagnosis column, so it must not be used to decode
# diabetes labels.
print('specie',diabetes_target.iloc[ind])

# Positional lookup with .iloc: DataFrame.ix was removed in pandas 1.0.
# Double brackets keep the sample as a one-row DataFrame so the column names
# the classifier was fitted with travel with it.
x_new = diabetes_data.iloc[[ind]]

print('\n======== PREDICTION ========')
prediction = clf.predict(x_new)
prediction_pb = clf.predict_proba(x_new)
print('Specie prediction',prediction)
print('Probability Specie prediction',prediction_pb)

# ============ Classification Report============

In [None]:
# Predict the held-out test split and summarise the classifier's quality.
p = clf.predict(X_test)

print('Accuracy:', accuracy_score(y_true=y_test, y_pred=p))
print('\nConfusion Matrix:\n', confusion_matrix(y_true=y_test, y_pred=p))
print('\nClassification Report:', classification_report(y_true=y_test, y_pred=p))
