In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

# Preprocessing
from sklearn.preprocessing import MinMaxScaler

# Train, Test
from sklearn.model_selection import train_test_split

# Clasificadores
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier, NearestCentroid
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Metricas para Clasificadores
from sklearn.metrics import jaccard_score, accuracy_score, recall_score, confusion_matrix, f1_score, precision_score

In [2]:
# Cargamos nuestro dataframe

df = pd.read_csv("Data/MeneosCompleto.csv", delimiter=";")

df.head(3)

Unnamed: 0,Titular,Medio,URL,Enviado,Publicado,Meneos,Clicks,Positivos,Anonimos,Negativos,Comentarios,Karma
0,Que arresten a los líderes de Israel por críme...,ctxt.es,https://old.meneame.net/story/arresten-lideres...,1715372000.0,1715379000.0,81,111.0,51.0,30.0,4.0,11.0,463.0
1,El pelotazo de los curas salesianos con una pi...,elsaltodiario.com,https://old.meneame.net/story/pelotazo-curas-s...,1715368000.0,1715378000.0,73,336.0,43.0,30.0,3.0,20.0,441.0
2,Países Bajos no participa en el segundo ensayo...,hoy.es,https://old.meneame.net/story/paises-bajos-no-...,1715374000.0,1715376000.0,123,1275.0,72.0,51.0,3.0,39.0,460.0


In [3]:
def limpieza_datos(df):
    #Limpieza de duplicados
    df.drop_duplicates()

    #Elimina las filas con valores NaN de las columnas que no son Clicks.
    lista_columnas = df.columns.to_list()
    lista_columnas.remove("Clicks")

    for columna in lista_columnas:
        lista_indices = df[df[columna].isna() == True].index.tolist()
        df = df.drop(index = lista_indices)

    # Creamos la columna delay que es el tiempo que transcurre entre que se envía el meneo y se publica
    df["Delay"] = df["Publicado"] - df["Enviado"]
    
    #Eliminación de columnas innecesarias
    df = df.drop(columns=["Medio", "Positivos", "Anonimos", "Publicado", "Enviado"], axis=1)
    
    # Separamos nuestro dataframe en uno de train, test y otro de predicción
    df_pred = df[df["Clicks"].isna() == True]
    df_tt = df[df["Clicks"].isna() == False]

    # Eliminación de outliers
    df_tt = df_tt[df_tt["Meneos"].between(50, 3000)]
    df_tt = df_tt[df_tt["Negativos"] <= 13]
    df_tt = df_tt[df_tt["Comentarios"].between(10, 150)]
    df_tt = df_tt[df_tt["Karma"].between(230, 700)]
    df_tt = df_tt[df_tt["Delay"].between(155, 69761)]
    
    return(df_tt)

In [4]:
# Le aplicamos la función de limpieza de datos 

df_tt = limpieza_datos(df)

df_tt.head(3)

Unnamed: 0,Titular,URL,Meneos,Clicks,Negativos,Comentarios,Karma,Delay
0,Que arresten a los líderes de Israel por críme...,https://old.meneame.net/story/arresten-lideres...,81,111.0,4.0,11.0,463.0,7655.0
1,El pelotazo de los curas salesianos con una pi...,https://old.meneame.net/story/pelotazo-curas-s...,73,336.0,3.0,20.0,441.0,10093.0
2,Países Bajos no participa en el segundo ensayo...,https://old.meneame.net/story/paises-bajos-no-...,123,1275.0,3.0,39.0,460.0,1683.0


In [None]:
# Visualizacion de la columna clicks para decidir en cuántos grupos dividirla

px.histogram(df_tt["Clicks"], marginal="box")

In [8]:
df_tt["Clicks"].describe()

count    137866.000000
mean       4612.925979
std        4578.209910
min         111.000000
25%        1683.000000
50%        3057.000000
75%        5782.000000
max      239917.000000
Name: Clicks, dtype: float64

In [9]:
# Definimos una función para dividir la categoría clicks

stats_clicks = df_tt["Clicks"].describe()

def clusters_clicks(x):
    if stats_clicks["min"] <= x < stats_clicks["25%"]:
        return(0)
    elif stats_clicks["25%"] <= x < stats_clicks["50%"]:
        return(1)
    else:
        return(2)

In [10]:
# Le aplicamos la función a la columna "Clicks"

df_tt["Clicks"] = df_tt["Clicks"].apply(lambda x: clusters_clicks(x))

In [11]:
df_tt

Unnamed: 0,Titular,URL,Meneos,Clicks,Negativos,Comentarios,Karma,Delay
0,Que arresten a los líderes de Israel por críme...,https://old.meneame.net/story/arresten-lideres...,81,0,4.0,11.0,463.0,7655.0
1,El pelotazo de los curas salesianos con una pi...,https://old.meneame.net/story/pelotazo-curas-s...,73,0,3.0,20.0,441.0,10093.0
2,Países Bajos no participa en el segundo ensayo...,https://old.meneame.net/story/paises-bajos-no-...,123,0,3.0,39.0,460.0,1683.0
3,Todos los israelíes están entrenados en hasbar...,https://old.meneame.net/story/todos-israelies-...,117,0,5.0,24.0,410.0,13713.0
4,Cuando Einstein llamó “fascistas” a los gobern...,https://old.meneame.net/story/cuando-einstein-...,260,0,2.0,17.0,461.0,26628.0
...,...,...,...,...,...,...,...,...
193672,"Yu Chan, la primera tortuga biónica devuelta a...",https://old.meneame.net/story/yu-chan-primera-...,290,2,1.0,37.0,536.0,31884.0
193674,Un mayordomo que aparca la bici y la deja niqu...,https://old.meneame.net/story/mayordomo-aparca...,667,2,0.0,59.0,655.0,5404.0
193675,¿Un melenudo nueva imagen de Pantene?,https://old.meneame.net/story/melenudo-nueva-i...,1220,2,4.0,113.0,605.0,17777.0
193678,En todo el Sistema Solar,https://old.meneame.net/story/en-todo-el-siste...,400,2,0.0,18.0,654.0,8895.0


## CLASIFICACION ##

In [12]:
# Creamos una lista con los modelos que vamos a utilizar

clasificadores = [KNeighborsClassifier(),
                  RadiusNeighborsClassifier(radius=0.5),
                  NearestCentroid(),
                  LogisticRegression(),
                  GaussianNB(),
                  DecisionTreeClassifier(),
                  RandomForestClassifier(),
                  SVC(),
                  AdaBoostClassifier(algorithm = "SAMME"),
                  GradientBoostingClassifier()]

# Definimos nuestro conjunto de datos de entrenamiento 

X = df_tt.iloc[:, 2:].drop("Clicks", axis = 1)
y = np.array(df_tt["Clicks"])

# Normalización de los datos y separación en train, test
x_scaler_class = MinMaxScaler()
X = x_scaler_class.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

datos_clf = list()

# Bucle con todos los modelos y guardado de las métricas

for clf in clasificadores:
    
    clf.fit(X_train, y_train)
    
    yhat = clf.predict(X_test)
    
    jac = jaccard_score(y_test, yhat, average = "macro")
    acc = accuracy_score(y_test, yhat)
    rec = recall_score(y_test, yhat, average = "macro")
    cmx = confusion_matrix(y_test, yhat)
    f1_ = f1_score(y_test, yhat, average = "macro")
    pre = precision_score(y_test, yhat, average = "macro")
    
    datos_clf.append([str(clf), clf, jac, acc, rec, cmx, f1_, pre])
    
df_metrics = pd.DataFrame(data = datos_clf, columns = ["name", "clf", "jaccard", "accuracy", "recall",
                                                       "confusion_matrix", "f1_score", "precision"])

df_metrics.sort_values("accuracy", ascending = False)


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.





Unnamed: 0,name,clf,jaccard,accuracy,recall,confusion_matrix,f1_score,precision
9,GradientBoostingClassifier(),([DecisionTreeRegressor(criterion='friedman_ms...,0.278133,0.544317,0.432752,"[[2544, 246, 4101, 0], [1562, 346, 4986, 0], [...",0.397325,0.479994
8,AdaBoostClassifier(),"(DecisionTreeClassifier(max_depth=1, random_st...",0.276682,0.537354,0.431498,"[[2672, 300, 3919, 0], [1737, 358, 4799, 0], [...",0.396503,0.462459
7,SVC(),SVC(),0.249948,0.536121,0.406702,"[[1995, 31, 4865, 0], [1172, 44, 5678, 0], [10...",0.353454,0.481117
6,RandomForestClassifier(),"(DecisionTreeClassifier(max_features='sqrt', r...",0.285789,0.516719,0.429706,"[[2441, 1070, 3380, 0], [1643, 1084, 4167, 0],...",0.420796,0.438928
3,LogisticRegression(),LogisticRegression(),0.241144,0.515703,0.394518,"[[1980, 47, 4864, 0], [1474, 118, 5302, 0], [1...",0.346368,0.43137
1,RadiusNeighborsClassifier(),RadiusNeighborsClassifier(),0.166691,0.500073,0.333333,"[[0, 0, 6891, 0], [0, 0, 6894, 0], [0, 0, 1378...",0.222244,0.166691
4,GaussianNB(),GaussianNB(),0.251537,0.499347,0.420496,"[[3594, 47, 3250, 0], [2750, 28, 4116, 0], [36...",0.360901,0.413676
2,NearestCentroid(),NearestCentroid(),0.274782,0.467034,0.435471,"[[3907, 1160, 1824, 0], [3110, 1225, 2559, 0],...",0.416362,0.427209
0,KNeighborsClassifier(),KNeighborsClassifier(),0.273432,0.465402,0.420558,"[[2959, 1515, 2417, 0], [2253, 1602, 3039, 0],...",0.416325,0.415687
5,DecisionTreeClassifier(),DecisionTreeClassifier(),0.244193,0.42366,0.382202,"[[2257, 1970, 2664, 0], [1973, 1869, 3052, 0],...",0.382081,0.382228
