# Import Libraries

In [36]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score,recall_score, f1_score, confusion_matrix
from sklearn.utils import resample
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection

# Load Datasets

In [3]:
data_train = pd.read_parquet('../data/processed/unbalanced_data_train.parquet')

#### Split data in 30/70 

In [4]:
label = {'Style'}
columns_set = set(data_train.columns.values)
x = data_train[list(columns_set-label)]
y = data_train[list(label)]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Machine Learning
* Select technique 

In [16]:
model_params = {
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,50,100]
        }
    },
    'kneighbors_classifier':{
        'model': KNeighborsClassifier(),
        'params': {
            'algorithm': ['ball_tree', 'kd_tree']
        }
    }
}

Before we do any experiment with the classifiers, we need to treat the unbalanced classes

In [17]:
scores = []

for model_name, mp in model_params.items():
    clf1 =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=True)
    print(f'---- training {model_name} ----')
    clf1.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf1.best_score_,
        'best_params': clf1.best_params_
        
    })
    
models_info_data_frame = pd.DataFrame(scores,columns=['model','best_score','best_params'])
models_info_data_frame

---- training random_forest ----


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


---- training kneighbors_classifier ----


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


Unnamed: 0,model,best_score,best_params
0,random_forest,0.47495,{'n_estimators': 100}
1,kneighbors_classifier,0.308449,{'algorithm': 'ball_tree'}


After seing that the two algorithms that we have here, are having the best score less than 50% we decided to balanced the target column categories. 

## Resampling the Target column

In [23]:
count_df = data_train.groupby(['Style'])['Style'].count()

styles_unbalanced = [k for k, v in count_df.items() if v <= 100]

In [26]:
from sklearn.utils import resample


df_sampled =pd.DataFrame()
for j in styles_unbalanced:
    
    df_minority_j = data_train[data_train.Style==j]
    df_minority_upsampled = resample(df_minority_j, 
                                 replace=True,     
                                 n_samples=400,    
                                 stratify= df_minority_j,
                                 random_state=123)
    df_sampled = pd.concat([df_sampled, df_minority_upsampled])
     

In [28]:
data_train = pd.concat([data_train, df_sampled])


In [29]:
label = {'Style'}
columns_set = set(data_train.columns.values)
x = data_train[list(columns_set-label)]
y = data_train[list(label)]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [30]:
scores = []

for model_name, mp in model_params.items():
    clf1 =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=True)
    print(f'---- training {model_name} ----')
    clf1.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf1.best_score_,
        'best_params': clf1.best_params_
        
    })
    
models_info_data_frame = pd.DataFrame(scores,columns=['model','best_score','best_params'])
models_info_data_frame

---- training random_forest ----


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


---- training kneighbors_classifier ----


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


Unnamed: 0,model,best_score,best_params
0,random_forest,0.732217,{'n_estimators': 100}
1,kneighbors_classifier,0.610904,{'algorithm': 'ball_tree'}


After trating the unbalanced data, we can see the algorithms score increase significantly. Also we choose the random_forest classifier.

In [35]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
score = accuracy_score(y_test, y_pred)
print('Accuracy: {}'.format(score))

print('Rand accuracy score is {:4f}'.format(accuracy_score(y_test, y_pred)))
print('Rand Precision score: ', precision_score(y_test, y_pred, average='micro'))
print('Rand Recall score: ', recall_score(y_test, y_pred, average='micro'))

  clf.fit(X_train, y_train)


Accuracy: 0.7536603064839715
Rand accuracy score is 0.753660
Rand Precision score:  0.7536603064839715
Rand Recall score:  0.7536603064839715


# k-fold Cross Validation

Se hace seleccion de los mejores modelos usando el Training Set y k-fold Cross Validation

In [37]:
kFold = model_selection.KFold(n_splits=10)
scoring = 'accuracy'
score = (model_selection.cross_val_score(clf, X_train, y_train,  scoring = scoring, cv = kFold))
print (f"( {score.mean()}, {score.std()})")

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


( 0.7389132821075741, 0.00548752888553776


# Evaluacion final del modelo con el Test set

Tomar los parametros obtenidos en el paso anterior, se crea el modelo con esos pararmetros y se entrena el modelo con todos los datos del **Train -set**

Finalmente se realiza la evaluacion (segun su problema si es de regresion o de clasificacion) usando el **Test - set** para definir si el modelo obtenido esta bien. Compare los resultados con el **Train -set** vs los resultados con el **Test - set**

In [38]:
X = data_train.drop('Style', axis=1)
y = data_train['Style']

In [None]:
clf_final = RandomForestClassifier(n_estimators=100)
clf.fit(X, y)

y_pred = clf.predict(X_test)
score = accuracy_score(y_test, y_pred)
print('Accuracy: {}'.format(score))

print('Rand accuracy score is {:4f}'.format(accuracy_score(y_test, y_pred)))
print('Rand Precision score: ', precision_score(y_test, y_pred, average='micro'))
print('Rand Recall score: ', recall_score(y_test, y_pred, average='micro'))

# Implementacion del Modelo (Deploying)
Con el análisis básico y el ajuste hecho, comienza el trabajo real (ingeniería).

El último paso para poner en produccion el modelo de prediccion sera:
1. Entrenarlo en todo el conjunto de datos nuevamente, para hacer un uso completo de todos los datos disponibles. 
2. Usar los mejores parámetros encontrados mediante la validación cruzada, por supuesto. Esto es muy similar a lo que hicimos al principio, pero esta vez teniendo una idea de su comportamiento y estabilidad. La evaluación se realizó con honestidad, en divisiones distintas de entrenamiento / prueba.

El predictor final se puede serializar y grabar en el disco, de modo que la próxima vez que lo usemos, podemos omitir todo el entrenamiento y usar el modelo capacitado directamente:

In [7]:
#import pickle # Esta es una libreria de serializacion nativa de python, puede tener problemas de seguridad
from joblib import dump # libreria de serializacion

# garbar el modelo en un archivo
#dump(Modelo_final, 'Nombre_Archivo_Modelo.joblib')

# Comunicacion de Resultados (Data Story Telling)

# Conclusiones

# Ayudas Y Referencias

- https://medium.com/@joserzapata/paso-a-paso-en-un-proyecto-machine-learning-bcdd0939d387
- [Proyecto de Principio a Final sobre readmision de pacientes con Diabetes](https://github.com/JoseRZapata/Readmission-ML-Project)

- [a-complete-machine-learning-walk-through-in-python-part-one](https://towardsdatascience.com/a-complete-machine-learning-walk-through-in-python-part-one-c62152f39420)


- [a-starter-pack-to-exploratory-data-analysis-with-python-pandas-seaborn-and-scikit-learn](https://towardsdatascience.com/a-starter-pack-to-exploratory-data-analysis-with-python-pandas-seaborn-and-scikit-learn-a77889485baf#249d)

- [a-data-science-for-good-machine-learning-project-walk-through-in-python-part-one](https://towardsdatascience.com/a-data-science-for-good-machine-learning-project-walk-through-in-python-part-one-1977dd701dbc)

- [Ejemplos de Kaggle](https://www.kaggle.com/kernels?sortBy=hotness&group=everyone&pageSize=20&language=Python&kernelType=Notebook)

- [END to END ML from data colletion to deployment](https://medium.com/datadriveninvestor/end-to-end-machine-learning-from-data-collection-to-deployment-ce74f51ca203)

Docente: [Jose R. Zapata](https://joserzapata.github.io)
- https://joserzapata.github.io
- https://twitter.com/joserzapata
- https://www.linkedin.com/in/jose-ricardo-zapata-gonzalez/   