![](data/banner.png)
# Objetivos
+ Crear un modelo exportable de aprendizaje supervisado usando clasificación con el algoritmo Naive Bayes

# Librerias

In [1]:
# Declaración de la semilla
seed = 161

# Para trabajar los datos
import pandas as pd
import numpy as np

# Para visualizar los datos
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# Vectorizar
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#Split our dataset
from sklearn.model_selection import train_test_split

# Metricas
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix

# Para búsqueda de hiperparámetros
from sklearn.model_selection import GridSearchCV
# Para la validación cruzada
from sklearn.model_selection import KFold 
# Para el uso de pipelines
from imblearn.pipeline import Pipeline

# Importar/ Exportar modelos
import joblib

# Modelo

Cargamos los datos

In [2]:
# Cargamos el dataset
file = "train"
df=pd.read_csv(f'data/{file}_preprocessed.csv', sep=',', encoding = 'utf-8')
df = df[["Emotion","processed_message"]]

In [3]:
df.head()

Unnamed: 0,Emotion,processed_message
0,sadness,humy humiliate
1,sadness,go hopeless damn hop around someon car awak go...
2,anger,grab minut post greedy wrong grab minute post ...
3,love,ev nostalg fireplac know stil property ever no...
4,anger,grouchy grouchy


Separamos el dataset

In [4]:
X = df["processed_message"]
Y = df['Emotion']

In [5]:
# Creamos el modelo básico
model = Pipeline(steps = [
    ("tf_idv",TfidfVectorizer()),
    ("ovr",OneVsRestClassifier(SVC(kernel="poly")))
])
model

Pipeline(steps=[('tf_idv', TfidfVectorizer()),
                ('ovr', OneVsRestClassifier(estimator=SVC(kernel='poly')))])

In [6]:
param_grid ={
    "ovr__estimator__C": [1,2,3],
    "ovr__estimator__kernel": ["poly","rbf"],
    "ovr__estimator__degree":[1,2],
}

In [7]:
grid_selector = GridSearchCV(estimator = model, param_grid = param_grid, cv=10, n_jobs=-1)

In [8]:
grid_selector.fit(X,Y)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('tf_idv', TfidfVectorizer()),
                                       ('ovr',
                                        OneVsRestClassifier(estimator=SVC(kernel='poly')))]),
             n_jobs=-1,
             param_grid={'ovr__estimator__C': [1, 2, 3],
                         'ovr__estimator__degree': [1, 2],
                         'ovr__estimator__kernel': ['poly', 'rbf']})

In [9]:
final_model = grid_selector.best_estimator_
final_model

Pipeline(steps=[('tf_idv', TfidfVectorizer()),
                ('ovr',
                 OneVsRestClassifier(estimator=SVC(C=1, degree=1,
                                                   kernel='poly')))])

In [10]:
final_model.predict(["it amaz impress"])

array(['surprise'], dtype='<U8')

# Guardamos el modelo

In [11]:
pipeline_file = open("pipeline_OneVsRest.joblib","wb")
joblib.dump(final_model,pipeline_file)
pipeline_file.close()