Probamos de nuevo incluyendo la columna Delay, pero aplicando primero una transformación logarítmica a las columnas con distribución asimétrica.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Preprocessing
from sklearn.preprocessing import MinMaxScaler

# Train, Test
from sklearn.model_selection import train_test_split

# Metricas para regresiones
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Regresores
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Validacion
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold

In [None]:
# Show floats with 7 decimal places in DataFrame output.
pd.options.display.precision = 7

# Load the raw dataset; the file uses ';' as its field separator.
df = pd.read_csv("Data/MeneosCompleto.csv", sep=";")

# Quick look at the first rows.
df.head(3)

In [None]:
def limpieza_datos(df):
    """Clean the raw dataset and split it into train/test and prediction sets.

    Steps, in order:
      1. Drop exact duplicate rows.
      2. Drop rows with NaN in any column other than "Clicks".
      3. Add "Delay" = "Publicado" - "Enviado".
      4. Drop the columns "Medio", "Publicado", "Enviado", "Positivos"
         and "Anonimos".
      5. Write the rows whose "Clicks" is missing to "Data/Datos_Pred.csv".
      6. Range-filter the remaining rows on "Negativos", "Comentarios" and
         "Karma" and write them to "Data/Datos_TrainTest.csv".

    The frame passed in is not modified.

    Args:
        df: DataFrame containing at least the columns named above.

    Returns:
        The filtered train/test DataFrame (rows with a known "Clicks" value).
    """
    # drop_duplicates returns a new frame, so the result has to be kept.
    df = df.drop_duplicates()

    # "Clicks" is allowed to be NaN (those rows are the ones to predict);
    # every other column must be present.
    columnas_obligatorias = [col for col in df.columns if col != "Clicks"]
    df = df.dropna(subset=columnas_obligatorias)

    # Delay: difference between the publication and submission values.
    df = df.assign(Delay=df["Publicado"] - df["Enviado"])

    # Columns not used downstream.
    df = df.drop(columns=["Medio", "Publicado", "Enviado", "Positivos", "Anonimos"])

    # Rows without a target go to the prediction file, the rest to train/test.
    sin_clicks = df["Clicks"].isna()
    df_pred = df[sin_clicks]
    df_pred.to_csv("Data/Datos_Pred.csv", index=False)
    df_tt = df[~sin_clicks]

    # Outlier removal by fixed ranges. "Meneos" and "Clicks" are not
    # range-filtered here.
    dentro_de_rango = (
        (df_tt["Negativos"] <= 13)
        & df_tt["Comentarios"].between(10, 150)
        & df_tt["Karma"].between(230, 700)
    )
    df_tt = df_tt[dentro_de_rango]

    df_tt.to_csv("Data/Datos_TrainTest.csv", index=False)

    return df_tt

In [None]:
# Clean the raw data and keep the train/test subset. As a side effect this
# writes Data/Datos_Pred.csv and Data/Datos_TrainTest.csv.
df_tt = limpieza_datos(df)

In [None]:
df_tt.head()

## OUTLIERS ##

In [None]:
def graficas_box(df_tt, feature, transform = None):
    """Return a Plotly histogram of one column with a marginal box plot.

    Args:
        df_tt: DataFrame that holds the column to plot.
        feature: Name of the column to plot.
        transform: Optional callable applied to the column (via
            ``Series.apply``) before plotting, e.g. ``np.log``.

    Returns:
        The figure produced by ``px.histogram``.
    """
    # Plot the frame that was passed in, not the notebook-level raw `df`,
    # so the figure reflects whatever filtering the caller has applied.
    serie = df_tt[feature]
    if transform is not None:
        serie = serie.apply(transform)
    return px.histogram(serie, marginal="box")

In [None]:
def outliers_log(feature, df=None):
    """Log-transform one column and drop the rows outside its 1.5*IQR fences.

    The natural log of ``feature`` is computed, the quartiles of the logged
    values define the fences ``q1 - 1.5*IQR`` and ``q3 + 1.5*IQR``, and only
    rows whose logged value lies inside them (inclusive) are kept.

    Args:
        feature: Name of the column to transform and filter.
        df: DataFrame to work on. When omitted, the notebook-level ``df_tt``
            is used, which keeps existing ``outliers_log("col")`` calls
            working.

    Returns:
        A new DataFrame in which ``feature`` holds the logged values and the
        out-of-fence rows are removed. The input frame is left untouched.

    Note:
        ``np.log`` maps 0 to ``-inf`` (and negatives to NaN), so columns that
        can contain zeros should be checked before using this function.
    """
    if df is None:
        # Backward-compatible default: the notebook-level training frame.
        df = df_tt

    # Compute the log once and reuse it for both the fences and the result.
    valores_log = df[feature].apply(np.log)

    q1 = valores_log.quantile(0.25)
    q3 = valores_log.quantile(0.75)
    ric = q3 - q1

    lim_inf = q1 - 1.5*ric
    lim_sup = q3 + 1.5*ric

    # Work on a copy so that the caller's frame is not modified in place.
    transformado = df.copy()
    transformado[feature] = valores_log
    return transformado[valores_log.between(lim_inf, lim_sup)]

# Meneos #

In [None]:
graficas_box(df_tt, "Meneos")

In [None]:
graficas_box(df_tt, "Meneos", transform=np.log)

In [None]:
df_tt.shape

In [None]:
# Replace "Meneos" by its natural log and keep only the rows whose logged
# value lies within 1.5*IQR of the log quartiles.
df_tt = outliers_log("Meneos")

In [None]:
df_tt.shape

# Negativos #

In [None]:
graficas_box(df_tt, "Negativos")

In [None]:
graficas_box(df_tt, "Negativos", transform = np.log)

In [None]:
# Replace "Negativos" by its natural log and keep only the rows whose logged
# value lies within 1.5*IQR of the log quartiles.
# NOTE(review): limpieza_datos only enforces Negativos <= 13, so zeros may be
# present; np.log(0) is -inf — confirm how those rows should be handled.
df_tt = outliers_log("Negativos")

# Comentarios # 

In [None]:
graficas_box(df_tt, "Comentarios")

In [None]:
graficas_box(df_tt, "Comentarios", transform = np.log)

In [None]:
# Replace "Comentarios" by its natural log and keep only the rows whose
# logged value lies within 1.5*IQR of the log quartiles.
df_tt = outliers_log("Comentarios")

In [None]:
df_tt.shape

# Karma #

In [None]:
graficas_box(df_tt, "Karma")

In [None]:
graficas_box(df_tt, "Karma", transform = np.log)

In [None]:
# Replace "Karma" by its natural log and keep only the rows whose logged
# value lies within 1.5*IQR of the log quartiles.
df_tt = outliers_log("Karma")

In [None]:
df_tt.shape

## Clicks ##

In [None]:
graficas_box(df_tt, "Clicks")

In [None]:
graficas_box(df_tt, "Clicks", transform = np.log)

In [None]:
# Replace the target "Clicks" by its natural log and keep only the rows whose
# logged value lies within 1.5*IQR of the log quartiles. From here on the
# target is on a log scale; the evaluation cell applies np.exp to undo it.
df_tt = outliers_log("Clicks")

In [None]:
df_tt.shape

In [None]:
df_tt["Negativos"].value_counts()

## ***VISUALIZACIONES***

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
# select_dtypes is the public API; it replaces the private
# DataFrame._get_numeric_data(), which selects numeric and boolean columns.
plt.figure(figsize = (12, 8))
sns.heatmap(data = df_tt.select_dtypes(include = ["number", "bool"]).corr(), annot = True)
plt.show()

## PREPROCESAMIENTO ##

In [None]:
df_tt.head(3)

In [None]:
# Candidate regressors, each built with its default constructor arguments.
# RadiusNeighborsRegressor is currently excluded from the comparison.
modelos = [
    regresor()
    for regresor in (
        LinearRegression,
        KNeighborsRegressor,
        DecisionTreeRegressor,
        RandomForestRegressor,
        SVR,
        AdaBoostRegressor,
        GradientBoostingRegressor,
    )
]

In [None]:
# Train/test split, scaling, and evaluation of every candidate model.

# Features: every column from position 2 onwards except the target.
X = df_tt.iloc[:, 2:].drop(columns = ["Clicks"])
# Target kept as a one-column DataFrame so the scaler receives 2-D input.
y = df_tt[["Clicks"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

# Scalers are fitted on the training split only and then applied to the
# test split.
x_scaler = MinMaxScaler()
X_train = x_scaler.fit_transform(X_train)
X_test = x_scaler.transform(X_test)

y_scaler = MinMaxScaler()
y_train = y_scaler.fit_transform(y_train)
y_test = y_scaler.transform(y_test)

# The true targets do not depend on the model, so undo the scaling and the
# earlier log transform (np.exp) once, outside the loop.
y_test_inv = np.exp(y_scaler.inverse_transform(y_test.reshape(-1, 1)))

datos_modelos = list()

for model in modelos:

    model.fit(X_train, y_train.ravel())
    yhat = model.predict(X_test)

    # Bring the predictions back to the same scale as y_test_inv.
    yhat_inv = np.exp(y_scaler.inverse_transform(yhat.reshape(-1, 1)))

    # Metrics
    mae = mean_absolute_error(y_test_inv, yhat_inv)
    mse = mean_squared_error(y_test_inv, yhat_inv)
    r2 = r2_score(y_test_inv, yhat_inv)

    # Row layout: [name, fitted model, mae, mse, r2].
    datos_modelos.append([str(model).strip("()"), model, mae, mse, r2])

df_modelo = pd.DataFrame(data = datos_modelos, columns = ["name", "model", "mae", "mse", "r2"])

# Best R2 first.
df_modelo.sort_values("r2", ascending = False)



In [None]:
np.exp(0.1254725)

In [None]:
y_train.shape

In [None]:
type(y_train)

In [None]:
# Show the fitted estimator stored in each results row
# (row layout: [name, model, mae, mse, r2]).
# Iterating yields the rows themselves, so they must not be used as indices.
for fila in datos_modelos:
    print(fila[1])

In [None]:
import pickle
import os

# exist_ok tolerates an already-existing directory without hiding other
# failures (e.g. permissions) the way a bare `except: pass` would.
os.makedirs("modelos_clicks", exist_ok=True)

# Save every fitted model, one pickle file per model, named after the
# estimator's string representation.
# NOTE: the scalers (x_scaler, y_scaler) are not saved by this cell.
for fila in datos_modelos:
    modelo = fila[1]
    ruta = f"modelos_clicks/{str(modelo)}.pkl"

    with open(file = ruta, mode = "bw") as file:
        pickle.dump(modelo, file)

    print(ruta)

In [None]:
str(model)