In [30]:
import pandas as pd
import re

# NLP
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Train, Test
from sklearn.model_selection import train_test_split

# Regresores
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Metricas para regresiones
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
df_completo = pd.read_csv("Data/100178_Comentarios.csv", sep=";")

In [20]:
df = df_completo[["texto", "karma"]]
df = df.dropna()
df.reset_index(inplace=True, drop=True)
df["karma"] = df["karma"].astype("int32")
df.head()

Unnamed: 0,texto,karma
0,"O que les den de hostias hasta hartarse, son t...",77
1,"#1 Sea cual sea la opción, seguro que preferir...",35
2,"Un saludo Mysto, ¡hermoso!",51
3,Es tal es desproposito y la barbarie que no ca...,52
4,"#1 Es el problema, uno no quiere rebajarse. Pe...",33


In [21]:
stopwords = nltk.corpus.stopwords.words("spanish")

def limpieza_comentarios(comentario):
    comentario = re.sub(r'[^a-zA-Z0-9\sáéíóúñçÁÉÍÓÚäëïöüàèìòù]', ' ', comentario)
    comentario = re.sub(r'\d+', '', comentario)    
    tokens_limpios = [palabra for palabra in comentario.split(" ") if palabra.lower() not in stopwords and len(palabra) > 2]
    return ' '.join(tokens_limpios)

In [22]:
df["texto"] = df["texto"].apply(limpieza_comentarios)
df.head()

Unnamed: 0,texto,karma
0,den hostias hartarse tan despreciables consegu...,77
1,opción seguro preferirán muertos cerdo llega s...,35
2,saludo Mysto hermoso,51
3,tal desproposito barbarie cabe cosa disolucion...,52
4,problema quiere rebajarse llega punto piensas ...,33


In [23]:
# Inicializamos un objeto CountVecrtorizer()
count_vectorizer = CountVectorizer()

# Entrenamos el modelo y transformamos los datos.
bag = count_vectorizer.fit_transform(df["texto"])

bag

<99360x103226 sparse matrix of type '<class 'numpy.int64'>'
	with 1570030 stored elements in Compressed Sparse Row format>

In [24]:
# Inicializamos un objeto Tfidf
tfidf = TfidfTransformer()

# Entrenamos el Tfidf y transformamos la variable bag
bag_tfidf = tfidf.fit_transform(bag)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(bag_tfidf, # X
                                                    df["karma"].values, # y
                                                    test_size = 0.3, random_state = 42)

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape},  y_test: {y_test.shape}")

X_train: (69552, 103226), y_train: (69552,)
X_test: (29808, 103226),  y_test: (29808,)


In [31]:
modelos = [LinearRegression(),
            KNeighborsRegressor(),
            # RadiusNeighborsRegressor(),
            DecisionTreeRegressor(),
            # RandomForestRegressor(),
            SVR(),
            AdaBoostRegressor(),
            GradientBoostingRegressor()]

In [32]:
# model = LinearRegression()
datos_modelos = list()

for model in modelos:
    print (f"#### Modelo: {str(model)} ####")
    model.fit(X_train, y_train)
    yhat = model.predict(X_test)

    #Metricas
    mae = mean_absolute_error(y_test, yhat)
    mse = mean_squared_error(y_test, yhat)
    r2 = r2_score(y_test, yhat)
    datos_modelos.append([str(model).strip("()"), model, mae, mse, r2])

df_modelo = pd.DataFrame(data = datos_modelos, columns = ["name", "model", "mae", "mse", "r2"])
df_modelo.sort_values("r2", ascending = False)

#### Modelo: LinearRegression() ####
#### Modelo: KNeighborsRegressor() ####
#### Modelo: DecisionTreeRegressor() ####
#### Modelo: SVR() ####
#### Modelo: AdaBoostRegressor() ####
#### Modelo: GradientBoostingRegressor() ####


Unnamed: 0,name,model,mae,mse,r2
5,GradientBoostingRegressor,([DecisionTreeRegressor(criterion='friedman_ms...,28.533331,3734.574064,-0.008733
3,SVR,SVR(),22.818303,3924.834136,-0.060124
1,KNeighborsRegressor,KNeighborsRegressor(),25.74068,3957.799159,-0.069028
2,DecisionTreeRegressor,DecisionTreeRegressor(),33.581051,6629.940295,-0.790791
0,LinearRegression,LinearRegression(),127.199373,47521.64685,-11.835911
4,AdaBoostRegressor,"(DecisionTreeRegressor(max_depth=3, random_sta...",450.312302,206366.135646,-54.740856
