# Cargar Librerías Pandas / Scikit-Learn

In [7]:
import pandas as pd
# Importar algoritmos de la librería sklearn
from sklearn.neighbors import KNeighborsClassifier

# Importamos datos en un Dataframe

In [8]:
df_data = pd.read_excel(r'C:\Users\Iván\Dropbox\Creación de MOCs\MOC Machine Learning con Python\Casos Prácticos ML\Clasificación\iris.xlsx')

In [9]:
df_data.head()

Unnamed: 0,long_sepalo,anch_sepalo,long_petalo,anch_petalo,clase
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
long_sepalo    150 non-null float64
anch_sepalo    150 non-null float64
long_petalo    150 non-null float64
anch_petalo    150 non-null float64
clase          150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB


# Crear un array (dataframe) con las variables de entrada (X) y otro para la variable de salida (y)

In [12]:
X = df_data.drop('clase', axis=1).values
y = df_data['clase'].values

# Dividir datos en conjunto de "Training" (ej: 80%) y conjunto de "Test" (ej:20%)

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify=y) #stratify --> datos_etiquetados

# Crear Clasificador (Ejemplo KNN)

In [14]:
knn = KNeighborsClassifier(n_neighbors=8) #por defecto n_neighbors=5

# Entrenar el modelo en base a los datos

In [15]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=8, p=2,
           weights='uniform')

# Verificar precisión del modelo a partir del conjunto de TEST

In [16]:
print(knn.score(X_test, y_test)) #Precisión solo en un subconjunto del 20% sin hacer cross-validation

0.9666666666666667


# Predecir Resultados de salida (y_prediction) a partir de nuevos datos de entrada (X_new)

In [19]:
df_new = pd.read_excel(r'C:\Users\Iván\Dropbox\Creación de MOCs\MOC Machine Learning con Python\Casos Prácticos ML\Clasificación\iris_nuevos_datos.xlsx')
X_new = df_new.values

In [20]:
y_prediction = knn.predict(X_new)
print("Prediccion: {}".format(y_prediction))

Prediccion: ['Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa']


# Optimizar modelo

In [21]:
import numpy as np
from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors': np.arange(1, 50)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv=5) #CV = cross validation en 5 partes
knn_cv.fit(X, y)
knn_cv.best_params_
knn_cv.best_score_

0.98

In [22]:
knn_cv.best_params_

{'n_neighbors': 6}

# Flujo Completo CLASIFICACIÓN

In [23]:
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Importamos datos en un Dataframe

In [26]:
df_data = pd.read_excel(r'C:\Users\Iván\Dropbox\Creación de MOCs\MOC Machine Learning con Python\Casos Prácticos ML\Clasificación\iris.xlsx')

In [27]:
df_data.head()

Unnamed: 0,long_sepalo,anch_sepalo,long_petalo,anch_petalo,clase
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# Crear un array (dataframe) con las variables de entrada (X) y otro para la variable de salida (y)

In [28]:
X = df_data.drop('clase', axis=1).values
y = df_data['clase'].values

# Dividir datos en conjunto de "Training" (ej: 80%) y conjunto de "Test" (ej:20%)

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify=y) #stratify --> datos_etiquetados

# Construir Modelos en base a los diferentes algoritmos

In [30]:
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

# Evaluar cada modelo

In [31]:
results = []
names = []
for name, model in models:
 kfold = model_selection.KFold(n_splits=10, random_state=42)
 cv_results = model_selection.cross_val_score(model, X, y, cv=kfold) #No sería incluso necesaria haber separado previamente en training y test
 results.append(cv_results)
 names.append(name)
 msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
 print(msg)



LR: 0.880000 (0.148474)
LDA: 0.966667 (0.061464)
KNN: 0.933333 (0.084327)
CART: 0.946667 (0.071802)
NB: 0.946667 (0.058119)
SVM: 0.953333 (0.052068)




# Seleccionar mejor modelo tras benchmarking

In [33]:
svc = SVC()

# Optimizar modelo

In [34]:
import numpy as np
from sklearn.model_selection import GridSearchCV

Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas}
svc_cv = GridSearchCV(svc, param_grid, cv=5)
svc_cv.fit(X, y)
svc_cv.best_params_
svc_cv.best_score_

0.98

# Predecir Resultados de salida (y_prediction) a partir de nuevos datos de entrada (X_new)

In [35]:
df_new = pd.read_excel(r'C:\Users\Iván\Dropbox\Creación de MOCs\MOC Machine Learning con Python\Casos Prácticos ML\Clasificación\iris_nuevos_datos.xlsx')
X_new = df_new.values

In [36]:
y_prediction = svc_cv.predict(X_new)
print("Prediccion: {}".format(y_prediction))

Prediccion: ['Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa'
 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa']
