### Modelos a entrenar

1. MAquinas de soporte vectorial SVM
2. Bosques Aleatorios RF

In [2]:
from pprint import pprint
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer

from sklearn.model_selection import train_test_split

from sklearn.metrics \
import classification_report, recall_score, accuracy_score,precision_score, make_scorer,confusion_matrix
from sklearn.model_selection import GridSearchCV


In [87]:

VIOLENT = "violent"
NONVIOLENT = "non-violent"

df = pd.read_csv("../master_data/data.csv")
# mapping label to 1 -> violent 0 -> non-violent
df["label"] = df["label"].astype(str).apply(lambda x: VIOLENT if x == "1" else NONVIOLENT)



In [88]:
df.head()

Unnamed: 0,feature,label,tweet_id,origin
0,es sexy.,violent,,rnn
1,eres mía.,violent,,rnn
2,la gorda.,violent,,rnn
3,a fregar.,violent,,rnn
4,ca- gona.,violent,,rnn


In [89]:
df.label.value_counts()

label
violent        8689
non-violent    3028
Name: count, dtype: int64

The dataset is imbalanced, lets balance it out. 

In [90]:
violent = df.loc[df["label"] == VIOLENT]
nonviolent = df.loc[df["label"] == NONVIOLENT]
violent_patched = violent.sample(nonviolent.shape[0],random_state=0)

In [91]:
#balanced dataset
bdf = pd.concat([violent_patched,nonviolent])
bdf.label.value_counts()

label
violent        3028
non-violent    3028
Name: count, dtype: int64

Split into test and train data. 

In [116]:
X_train, X_test, y_train, y_test =\
train_test_split(df.loc[:,["feature"]],df["label"], test_size= 0.3, random_state= 1)

Define pipelines, 
bdf is balanced dataset

In [117]:
# numeric_features = ["length", "punct"]
categorical_features = ["feature"]

# numeric_transformer = Pipeline(
#     steps=[("scaler", StandardScaler())]
# )

categorical_transformer = Pipeline(
    steps=[
        ("squeez", FunctionTransformer(lambda x: x.squeeze())), # make sure you pass a series
        ("tfidf",TfidfVectorizer())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        # ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

classifierRF = Pipeline(
    [
        ("preprocessor",preprocessor),
        ("classifier",RandomForestClassifier(n_estimators=100,random_state=0))
    ]
)

classifierSVC = Pipeline(
    [
        ("preprocessor",preprocessor),
        ("svc",svm.SVC(random_state=0))
    ]
)


In [123]:
param_grid_svc = {
    "svc__C": [1, 10, 100],
    "svc__kernel": ['linear', 'rbf', 'sigmoid'],
    "svc__gamma": ["auto","scale"]
}
best_svc = GridSearchCV(classifierSVC, param_grid_svc,n_jobs=8,cv=5,scoring=make_scorer(precision_score,pos_label=VIOLENT,zero_division="warn"))

Unnamed: 0,feature
5964,"A rey muerto, rey puesto."
5377,Un caballero se avergüenza de que sus palabras...
4664,La vida es como un arca inmensa llena de posib...
7970,@InesArrimadas Ahh pero montapollos aún no ha ...
5390,"Las falsedades no sólo se oponen a la verdad, ..."
...,...
5297,"La razón me dice que Dios existe, pero también..."
3630,"Mañana es una excusa maravillosa, ¿No crees?"
10030,@unworthyunicor2 @michiki_ta No con cualquier ...
10584,@LaFallaras Si eso se les da de puta madre que...


In [None]:
best_svc.fit(X_train, y_train)
print("using precission as the Best parameter to evaluate on (CV score=%0.3f):" % best_svc.best_score_)
print(best_svc.best_params_)

In [None]:
y_pred_svc = best_svc.predict(X_test)
y_pred_svc_train = best_svc.predict(X_train)
report_svc_train = classification_report(y_train, y_pred_svc_train)
print(report_svc_train)
report_svc = classification_report(y_test, y_pred_svc)
print(report_svc)

In [121]:
confusion_matrix(y_train, y_pred_svc_train)

array([[2038,   77],
       [   2, 6084]])

In [122]:
pprint(confusion_matrix(y_test, y_pred_svc))

array([[ 650,  263],
       [  60, 2543]])
