In [73]:
# Basics
import numpy as np
import pandas as pd

# Visualización
import seaborn as sns
import matplotlib.pyplot as plt

import re
from nltk.tokenize import TweetTokenizer

import joblib
from scipy import stats
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report

In [4]:
def colapsar_repeticion(match):
    elemento = match.groups()
    assert len(match.groups()) == 1
    return elemento[0]

def procesar_tweet(tweet):
    tweet = tweet.lower()
    regex_usuario = re.compile(r"@[\w\d]+")
    tweet = regex_usuario.sub("@usuario", tweet)
    
    regex_link = re.compile(r"\b(?:https?://|www\.)\S+\b")
    tweet = regex_link.sub("<link>", tweet)
    
    tokenizer = TweetTokenizer(reduce_len=True)
    tokens = tokenizer.tokenize(tweet)
    
    regex_collapse = re.compile(r"(\w)\1{2}")
    
    tokens = [regex_collapse.sub(colapsar_repeticion, token) for token in tokens]
    
    return tokens

In [None]:
def add_feature(X, feature_to_add):
    """
    Returns sparse feature matrix with added feature.
    feature_to_add can also be a list of features.
    """
    from scipy.sparse import csr_matrix, hstack
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')

In [5]:
df_train = pd.read_csv("../datos/training/EXIST2021_training.tsv", sep="\t")
df_train

Unnamed: 0,test_case,id,source,language,text,task1,task2
0,EXIST2021,1,twitter,en,"She calls herself ""anti-feminazi"" how about sh...",sexist,ideological-inequality
1,EXIST2021,2,twitter,en,"Now, back to these women, the brave and the be...",non-sexist,non-sexist
2,EXIST2021,3,twitter,en,"@CurvyBandida @Xalynne_B Wow, your skirt is ve...",sexist,objectification
3,EXIST2021,4,twitter,en,@AurelieGuiboud Incredible! Beautiful!But I l...,non-sexist,non-sexist
4,EXIST2021,5,twitter,en,i find it extremely hard to believe that kelly...,non-sexist,non-sexist
...,...,...,...,...,...,...,...
6972,EXIST2021,6973,twitter,es,"Estamos igual sin pareja, pero puedes besar a ...",non-sexist,non-sexist
6973,EXIST2021,6974,twitter,es,2020 hijo de re mil putas,non-sexist,non-sexist
6974,EXIST2021,6975,twitter,es,SEGURAMENTE ESTA CHICA NO COBRA EL DINERO QUE ...,non-sexist,non-sexist
6975,EXIST2021,6976,twitter,es,@safetyaitana mi madre dice q va fea y i agree,sexist,objectification


In [43]:
tweets = df_train["text"].apply(procesar_tweet)
labels1 = df_train["task1"].map({"sexist":1, "non-sexist":0}).values

# Train - test

In [44]:
train_tweets, test_tweets, train_labels, test_labels = train_test_split(tweets, labels1, test_size=0.2)

In [45]:
len(train_tweets)

5581

In [46]:
vectorizer = TfidfVectorizer(min_df=2, tokenizer=lambda x:x, preprocessor=lambda x:x)
X_train = vectorizer.fit_transform(train_tweets)
y_train = train_labels
X_test = vectorizer.transform(test_tweets)
y_test = test_labels

In [48]:
def entrenar_modelo(Modelo, model_params, X_train, X_test, y_train, y_test, 
                    search_space, n_iter):
    # Entrenando y optimizando
    model = Modelo(**model_params)
    tunned_model = RandomizedSearchCV(model, search_space, n_iter=50, n_jobs=-1)
    tunned_model.fit(X_train, y_train)
    print(tunned_model.best_params_)
    
    # Evaluación
    y_pred = tunned_model.predict(X_test)
    print(classification_report(y_test, y_pred))
    
    return tunned_model

In [None]:
dataset = [X_train, X_test, y_train, y_test]

In [80]:
search_space = {"penalty":["l1", "l2"], 
                "C":stats.uniform(1, 99)}
model_lr = entrenar_modelo(LogisticRegression, {"max_iter":500, "random_state":42}, 
                            *dataset, search_space, n_iter=50)

{'C': 4.693727617375979, 'penalty': 'l2'}
              precision    recall  f1-score   support

           0       0.75      0.75      0.75       763
           1       0.70      0.70      0.70       633

    accuracy                           0.73      1396
   macro avg       0.73      0.73      0.73      1396
weighted avg       0.73      0.73      0.73      1396



In [81]:
joblib.dump(modelo_lr, "../modelos/logistic_regression.joblib")

['../modelos/logistic_regression.joblib']

In [66]:
search_space = {"kernel":["rbf", "sigmoid", "poly"], 
                "gamma":["scale", "auto"], 
                "degree":[2,3,4],
                "C":stats.uniform(0.1, 99.9)}
model_svc = entrenar_modelo(SVC, {"max_iter":500, "random_state":42}, 
                            *dataset, search_space, n_iter=50)



{'C': 33.49372824812848, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.64      0.62      0.63       763
           1       0.56      0.58      0.57       633

    accuracy                           0.60      1396
   macro avg       0.60      0.60      0.60      1396
weighted avg       0.61      0.60      0.60      1396



In [69]:
search_space = {"criterion" : ["gini", "entropy"], 
                "max_depth" : stats.randint(50, 201), 
                "max_features" : [None, "sqrt", "log2"],
                "min_samples_split" : stats.randint(2, 11)}
model_dtc = entrenar_modelo(DecisionTreeClassifier, {"random_state":42}, *dataset, search_space, n_iter=50)

{'criterion': 'entropy', 'max_depth': 59, 'max_features': None, 'min_samples_split': 2}
              precision    recall  f1-score   support

           0       0.70      0.73      0.71       763
           1       0.66      0.61      0.63       633

    accuracy                           0.68      1396
   macro avg       0.68      0.67      0.67      1396
weighted avg       0.68      0.68      0.68      1396



In [71]:
search_space = {"n_estimators" : stats.randint(80, 201),
                "criterion" : ["gini", "entropy"], 
                "max_depth" : stats.randint(50, 201), 
                "max_features" : [None, "sqrt", "log2"],
                "min_samples_split" : stats.randint(2, 11)}
model_rfc = entrenar_modelo(RandomForestClassifier, {"random_state":42}, *dataset, search_space, n_iter=50)

{'criterion': 'entropy', 'max_depth': 137, 'max_features': 'sqrt', 'min_samples_split': 3, 'n_estimators': 159}
              precision    recall  f1-score   support

           0       0.74      0.75      0.74       763
           1       0.69      0.68      0.69       633

    accuracy                           0.72      1396
   macro avg       0.72      0.71      0.71      1396
weighted avg       0.72      0.72      0.72      1396



In [76]:
joblib.dump(model_rfc, "../modelos/random_forest.joblib")

['../modelos/random_forest.joblib']

In [47]:
%%time
# Entrenamos modelo
Modelo = LogisticRegression
model_params = {"max_iter":500}
model = Modelo(**model_params)

# Búsqueda de hiperparámetros
search_space = {"penalty":["l1", "l2"], 
                "C":stats.uniform(1, 99)}
tunned_model = RandomizedSearchCV(model, search_space, n_iter=50, n_jobs=-1)
tunned_model.fit(X_train, y_train)
print(tunned_model.best_params_)

# Evaluación en conjunto de prueba
y_pred = tunned_model.predict(X_test)
print(classification_report(y_test, y_pred))

{'C': 7.831687642388317, 'penalty': 'l2'}
              precision    recall  f1-score   support

           0       0.75      0.75      0.75       763
           1       0.70      0.69      0.69       633

    accuracy                           0.72      1396
   macro avg       0.72      0.72      0.72      1396
weighted avg       0.72      0.72      0.72      1396

CPU times: user 984 ms, sys: 28 ms, total: 1.01 s
Wall time: 6.98 s


---------------

In [34]:
tweets_training, tweets_test, y_training, y_test = train_test_split(tweets, labels1, test_size=0.2)
tweets_train, tweets_val, y_train, y_val = train_test_split(tweets_training, y_training, test_size=0.25)

In [35]:
vectorizer = TfidfVectorizer(min_df=2, tokenizer=lambda x:x, preprocessor=lambda x:x)
X_train = vectorizer.fit_transform(tweets_train)
X_val = vectorizer.transform(tweets_val)
X_test = vectorizer.transform(tweets_test)

In [42]:
%%time
# Entrenamos modelo
Modelo = LogisticRegression
model_params = {"max_iter":500}
model = Modelo(**model_params)
model.fit(X_train, y_train)
print("Terminó entrenamiento")

# Búsqueda de hiperparámetros
search_space = {"penalty":["l1", "l2"], 
                "C":stats.uniform(1, 99)}
tunned_model = RandomizedSearchCV(model, search_space, n_iter=50, n_jobs=-1)
tunned_model.fit(X_val, y_val)
print(tunned_model.best_params_)

# Evaluación en conjunto de prueba
y_pred = tunned_model.predict(X_test)
print(classification_report(y_test, y_pred))

Terminó entrenamiento
{'C': 4.367401011018379, 'penalty': 'l2'}
              precision    recall  f1-score   support

           0       0.69      0.69      0.69       704
           1       0.69      0.69      0.69       692

    accuracy                           0.69      1396
   macro avg       0.69      0.69      0.69      1396
weighted avg       0.69      0.69      0.69      1396

CPU times: user 1.15 s, sys: 76 µs, total: 1.15 s
Wall time: 3.47 s
