In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

# Preprocessing
from sklearn.preprocessing import MinMaxScaler

# Train, Test
from sklearn.model_selection import train_test_split

# Clasificadores
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier, NearestCentroid
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Metricas para Clasificadores
from sklearn.metrics import jaccard_score, accuracy_score, recall_score, confusion_matrix, roc_auc_score, f1_score, precision_score

# Metodos de validacion
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the raw dataset from the semicolon-separated CSV export.
df = pd.read_csv("Data/MeneosCompleto.csv", sep=";")

# Preview the first three rows.
df.head(n=3)

In [None]:
def limpieza_datos(df, output = "train"):
    """Clean the raw frame and return either the labelled or the unlabelled rows.

    Steps, in order:
      1. Drop exact duplicate rows.
      2. Drop rows with a missing value in any column other than "Clicks".
      3. Add "Delay" = "Publicado" - "Enviado" (time between submission and
         publication; the range filter below assumes this is numeric).
      4. Drop the columns "Medio", "Positivos", "Anonimos", "Publicado", "Enviado".
      5. Split on whether "Clicks" is missing.
      6. For the labelled rows only, keep rows inside fixed value ranges.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is not modified; every step works on a new frame.
    output : str, default "train"
        "train" returns the rows that have a "Clicks" value, after the range
        filter. Any other value returns the rows whose "Clicks" is missing,
        with the "Clicks" column removed and no range filter applied.

    Returns
    -------
    pandas.DataFrame
        An independent copy, so callers can add or overwrite columns freely.
    """
    # drop_duplicates() returns a new frame; the result has to be kept.
    df = df.drop_duplicates()

    # "Clicks" is allowed to be missing: those rows are the ones to predict.
    columnas_requeridas = [columna for columna in df.columns if columna != "Clicks"]
    df = df.dropna(subset = columnas_requeridas)

    # assign() builds a new frame instead of writing into a possible slice.
    df = df.assign(Delay = df["Publicado"] - df["Enviado"])

    # Columns not used downstream.
    df = df.drop(columns = ["Medio", "Positivos", "Anonimos", "Publicado", "Enviado"])

    sin_clicks = df["Clicks"].isna()

    if output == "train":
        df_tt = df[~sin_clicks]

        # Outlier removal: keep only rows inside these fixed ranges.
        dentro_de_rango = (
            df_tt["Meneos"].between(50, 3000)
            & (df_tt["Negativos"] <= 13)
            & df_tt["Comentarios"].between(10, 150)
            & df_tt["Karma"].between(230, 700)
            & df_tt["Delay"].between(155, 69761)
        )
        return df_tt[dentro_de_rango].copy()

    # Prediction set: unlabelled rows, without the (all-NaN) target column.
    return df[sin_clicks].drop(columns = "Clicks")

In [None]:
# Run the cleaning step and keep the labelled rows (the train/test set).
df_tt = limpieza_datos(df, output="train")

# Preview the first three cleaned rows.
df_tt.head(n=3)

In [None]:
# Visualise the "Clicks" column (histogram with a box-plot margin) to decide
# how many groups to split it into.

px.histogram(df_tt["Clicks"], marginal="box")

In [None]:
# Summary statistics of the "Clicks" column.
df_tt.loc[:, "Clicks"].describe()

In [None]:
# Threshold used to binarise "Clicks": the 50% quantile reported by describe().
stats_clicks = df_tt["Clicks"].describe()
median = stats_clicks.loc["50%"]

def clusters_clicks(x, umbral = None):
    """Binarise a click count: 0 when ``x <= umbral``, otherwise 1.

    Parameters
    ----------
    x : number
        Click count to classify.
    umbral : number, optional
        Threshold. When omitted, the notebook-level ``median`` is used, which
        is the original behaviour.

    Returns
    -------
    int
        0 or 1. NaN compares false against any threshold and so yields 1.
    """
    if umbral is None:
        # Backward-compatible default: read the threshold from the notebook global.
        umbral = median

    return 0 if x <= umbral else 1

In [None]:
# Replace the raw click counts with their 0/1 class.
# NOTE: this overwrites "Clicks" in place, so running the cell a second time
# would threshold the 0/1 values again; re-create df_tt before re-running.
df_tt["Clicks"] = df_tt["Clicks"].apply(clusters_clicks)

In [None]:
# Display the frame after "Clicks" has been replaced by its 0/1 class.
df_tt

## Clasificación ##

In [None]:
# Candidate classifiers, compared with default hyper-parameters unless noted.
# Estimators with internal randomness are seeded so the ranking is reproducible.
clasificadores = [KNeighborsClassifier(),
                  RadiusNeighborsClassifier(radius=0.5),
                  NearestCentroid(),
                  LogisticRegression(),
                  GaussianNB(),
                  DecisionTreeClassifier(random_state = 42),
                  RandomForestClassifier(random_state = 42),
                  SVC(),
                  AdaBoostClassifier(algorithm = "SAMME", random_state = 42),
                  GradientBoostingClassifier(random_state = 42)]

# Features: every column from the third one on, minus the target.
X = df_tt.iloc[:, 2:].drop(columns = "Clicks")
y = df_tt["Clicks"].to_numpy()

# Split BEFORE scaling so the scaler never sees the test rows (no data leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

# Learn min/max on the training part only, then apply the same mapping to the test part.
x_scaler_class = MinMaxScaler()
X_train = x_scaler_class.fit_transform(X_train)
X_test = x_scaler_class.transform(X_test)

# Keep X as a scaled array, as this cell did before.
X = x_scaler_class.transform(X)

datos_clf = list()

for clf in clasificadores:

    clf.fit(X_train, y_train)

    yhat = clf.predict(X_test)

    # All metrics are computed from the hard 0/1 predictions.
    jac = jaccard_score(y_test, yhat)
    acc = accuracy_score(y_test, yhat)
    rec = recall_score(y_test, yhat)
    cmx = confusion_matrix(y_test, yhat)
    # With hard labels instead of scores this equals (TPR + TNR) / 2.
    roc = roc_auc_score(y_test, yhat)
    f1_ = f1_score(y_test, yhat)
    pre = precision_score(y_test, yhat)

    datos_clf.append([str(clf), clf, jac, acc, rec, cmx, roc, f1_, pre])

# One row per classifier; the fitted estimator itself is kept in the "clf" column.
df_metrics = pd.DataFrame(data = datos_clf, columns = ["name", "clf", "jaccard", "accuracy", "recall",
                                                       "confusion_matrix", "roc_auc", "f1_score", "precision"])

df_metrics.sort_values("roc_auc", ascending = False)

## Validación ##

In [None]:
# Preprocessing: rebuild the feature matrix and labels from the labelled frame.
# Features are every column from the third one on, minus the target "Clicks".
X = df_tt.iloc[:, 2:].drop("Clicks", axis = 1)
y = np.array(df_tt["Clicks"])

# x_scaler_class keeps the per-feature min/max learned from the labelled data.
x_scaler_class = MinMaxScaler()
X = x_scaler_class.fit_transform(X)

# NOTE: this hold-out split is not consumed by the search below, which is
# fitted on the full X, y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

# Model. Seeded so that repeated runs of the search give the same result.
GradientBoosting = GradientBoostingClassifier(random_state = 42)

# Hyper-parameter grid to search (2 * 3 * 3 * 2 * 2 = 72 combinations).
parametros = {
    'n_estimators': [200, 300],
    'learning_rate': [0.01, 0.05, 0.2],
    'max_depth': [4, 5, 6],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4]    
}

# Metrics recorded for every combination.
scorers = ["accuracy", "jaccard", "f1", "precision", "recall", "roc_auc"]

# GridSearchCV: 2-fold cross-validation; the best combination by "roc_auc"
# is refitted on all of X, y and exposed as best_estimator_.
grid_solver = GridSearchCV(estimator  = GradientBoosting,
                           param_grid = parametros,
                           scoring    = scorers,
                           cv         = 2,
                           refit      = "roc_auc",
                           n_jobs     = -1,
                           verbose    = 3)

# Run the search; fit() returns the fitted search object.
model_result = grid_solver.fit(X, y)

In [None]:
# Keep the estimator that the grid search refitted with its best parameters.
# (fit() returned the search object itself, so grid_solver and model_result
# refer to the same fitted search.)
best_GradientBoosting = grid_solver.best_estimator_

## Predicción ##

In [None]:
# Clean the raw data again, this time keeping the rows without a "Clicks" value.
df_pred = limpieza_datos(df, "pred")
df_pred.head(5)

In [None]:
# Preprocessing: same feature columns as in training (every column from the
# third one on). The output columns added at the end of this cell are excluded
# so the cell can be re-run.
X = df_pred.iloc[:, 2:].drop(columns = ["Clicks_clase", "Clicks"], errors = "ignore")

# Reuse the scaler already fitted on the labelled data. Fitting a new scaler
# here would map the features with a different min/max than the one the model
# was trained with.
X = x_scaler_class.transform(X)

# Prediction
y_hat_final = best_GradientBoosting.predict(X)

# Add the prediction to the frame: the 0/1 class and a readable label.
df_pred["Clicks_clase"] = y_hat_final
# The condition must test the value of x; a bare `if 0` is always false and
# labelled every row as "Más de ...".
df_pred["Clicks"] = df_pred["Clicks_clase"].apply(lambda x: f"{int(median)} clicks o menos" if x == 0 else f"Más de {int(median)} clicks")
df_pred.head()