# Aprendizaje profundo para detección de sexismo
- Óscar Alvarado
- Dante Bermúdez

In [54]:
import numpy as np
import pandas as pd

import re
from nltk.tokenize import TweetTokenizer

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer

from itertools import islice as take
import torch
from torch import nn
# !pip install torchinfo
from torchinfo import summary
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F

# barras de progreso
from tqdm import trange

In [2]:
# Pick the compute device: the first CUDA GPU when one is visible, the CPU otherwise.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda', index=0)

In [3]:
def colapsar_repeticion(match):
    """Replacement callback that returns the match's only captured group.

    Intended for ``re.sub`` with a pattern that has exactly one capturing
    group: the whole matched text is replaced by that group.

    Args:
        match: an ``re.Match`` object.

    Returns:
        The text captured by the single group.

    Raises:
        AssertionError: if the pattern does not have exactly one group.
    """
    grupos = match.groups()
    assert len(grupos) == 1
    return grupos[0]

def procesar_tweet(tweet):
    """Normalise a raw tweet and split it into a list of tokens.

    Steps, in order:
      1. lowercase the text;
      2. replace every ``@handle`` with the placeholder ``@usuario``;
      3. replace every URL starting with ``http://``, ``https://`` or
         ``www.`` with the placeholder ``<link>``;
      4. tokenise with NLTK's ``TweetTokenizer(reduce_len=True)``;
      5. inside each token, collapse any run of three identical word
         characters into a single one.

    Args:
        tweet: the tweet text as a string.

    Returns:
        A list of token strings.
    """
    texto = tweet.lower()
    texto = re.sub(r"@[\w\d]+", "@usuario", texto)
    texto = re.sub(r"\b(?:https?://|www\.)\S+\b", "<link>", texto)

    tokens = TweetTokenizer(reduce_len=True).tokenize(texto)

    # Three identical consecutive word characters -> one character.
    patron_triple = re.compile(r"(\w)\1{2}")
    return [patron_triple.sub(colapsar_repeticion, tok) for tok in tokens]

In [4]:
def add_feature(X, feature_to_add):
    """Append one or more feature columns to a sparse feature matrix.

    ``feature_to_add`` is converted to a sparse matrix and transposed, so a
    flat sequence of per-row values becomes one new column and a list of
    such sequences becomes several columns.

    Args:
        X: feature matrix with one row per sample.
        feature_to_add: values for the new feature, or a list of features.

    Returns:
        A CSR sparse matrix: ``X`` followed by the new column(s).
    """
    from scipy.sparse import csr_matrix, hstack

    columnas_nuevas = csr_matrix(feature_to_add).T
    return hstack([X, columnas_nuevas], format='csr')

In [5]:
# Load the tab-separated EXIST2021 training file and preview its first rows.
ruta_entrenamiento = "../datos/training/EXIST2021_training.tsv"
df_train = pd.read_csv(ruta_entrenamiento, sep="\t")
df_train.head()

Unnamed: 0,test_case,id,source,language,text,task1,task2
0,EXIST2021,1,twitter,en,"She calls herself ""anti-feminazi"" how about sh...",sexist,ideological-inequality
1,EXIST2021,2,twitter,en,"Now, back to these women, the brave and the be...",non-sexist,non-sexist
2,EXIST2021,3,twitter,en,"@CurvyBandida @Xalynne_B Wow, your skirt is ve...",sexist,objectification
3,EXIST2021,4,twitter,en,@AurelieGuiboud Incredible! Beautiful!But I l...,non-sexist,non-sexist
4,EXIST2021,5,twitter,en,i find it extremely hard to believe that kelly...,non-sexist,non-sexist


In [6]:
# Tokenise every tweet, then encode the task-1 label as 1 (sexist) / 0 (non-sexist).
tweets = df_train["text"].apply(procesar_tweet)
codigos_task1 = {"sexist": 1, "non-sexist": 0}
labels1 = df_train["task1"].map(codigos_task1).values

## Partición en entrenamiento y prueba

In [7]:
# Hold out 20% of the tweets for testing. The split is seeded so that the
# vocabulary and the maximum sequence length derived from the training part
# are the same on every run of the notebook.
train_tweets, test_tweets, train_labels, test_labels = train_test_split(
    tweets, labels1, test_size=0.2, random_state=42
)

In [8]:
# The tweets are already token lists, so both the tokenizer and the
# preprocessor are identity functions. Fitting on the training tweets only
# builds the vocabulary (min_df=2).
vect = CountVectorizer(min_df=2, tokenizer=lambda tokens: tokens, preprocessor=lambda tokens: tokens)
vect.fit_transform(train_tweets)

<5581x8424 sparse matrix of type '<class 'numpy.int64'>'
	with 135091 stored elements in Compressed Sparse Row format>

In [9]:
# `CountVectorizer.get_feature_names` was removed in scikit-learn 1.2 in favour
# of `get_feature_names_out`; use whichever one this installation provides.
if hasattr(vect, "get_feature_names_out"):
    vocabulario = vect.get_feature_names_out()
else:
    vocabulario = vect.get_feature_names()

# Vocabulary ids start at 2 because ids 0 and 1 are reserved just below.
word2idx = {word: idx for idx, word in enumerate(vocabulario, 2)}
word2idx["<PAD>"] = 0
word2idx["<START>"] = 1
# Tokens missing from the vocabulary fall back to id 1, i.e. the "<START>"
# entry also acts as the unknown-token id.
X_train = [[word2idx.get(word, 1) for word in tweet] for tweet in train_tweets]
print(X_train[0])

[5714, 8240, 1645, 4982, 4189, 6389, 5405, 1, 7529, 8234, 4183, 7533, 5429, 5425, 6243, 5462, 3990, 2155, 327]


In [10]:
# Encode the held-out tweets with the same vocabulary; tokens it does not contain get id 1.
X_test = [
    [word2idx.get(token, 1) for token in tokens]
    for tokens in test_tweets
]
print(X_test[0])

[6794, 397, 7632, 4395, 5108, 7585, 6261, 7469, 4615, 4989, 2365, 2213, 5588, 2313, 2927, 6957, 6308, 418, 276, 770, 5261, 276, 8314, 3594, 2213, 2365, 1, 2618, 8312, 7638, 4615, 6261, 6748, 6308, 1, 2227, 7469, 4615, 4989, 2365, 276, 7011, 657, 6748, 5099, 5946, 7648, 2084, 1985]


In [11]:
# Length of the longest encoded training tweet; used below as the padded sequence length.
max_len = max(map(len, X_train))
max_len

110

In [12]:
# Vocabulary size: number of entries in word2idx
len(word2idx)

8426

In [13]:
# Padding
def pad_left(seq, length, pad_value=0):
    """Return ``seq`` as a list of exactly ``length`` items, padded on the left.

    Sequences longer than ``length`` keep only their first ``length`` items,
    so every result has the same size. An empty sequence becomes a row made
    entirely of ``pad_value``; the slice assignment ``zeros[-0:] = []`` would
    instead produce an empty row.

    Args:
        seq: iterable of token ids.
        length: target length of the returned list.
        pad_value: id used to fill the missing leading positions.

    Returns:
        A new list of ``length`` items.
    """
    recorte = list(seq)[:length]
    return [pad_value] * (length - len(recorte)) + recorte


# Build new lists instead of mutating rows in place: re-running the cell is
# harmless and every row is guaranteed to have exactly max_len items.
X_train = [pad_left(seq, max_len) for seq in X_train]
X_test = [pad_left(seq, max_len) for seq in X_test]

## Arquitectura

In [43]:
# Architecture definition
class CNN(nn.Module):
    """1-D convolutional classifier over fixed-length token-id sequences.

    Pipeline: embedding -> Conv1d -> ReLU -> MaxPool1d -> flatten -> linear.

    Args:
        num_labels: number of output classes (size of the logits vector).
        num_embeddings: number of rows of the embedding table; it must be
            larger than the biggest token id fed to the model. The default
            is the value that used to be hard-coded; pass the real
            vocabulary size when it is known.
        seq_len: length of every input sequence. It fixes the input size of
            the final linear layer, so inputs must have exactly this length.
            The default is the value that used to be hard-coded.

    Raises:
        ValueError: if ``seq_len`` is too short for the convolution and
            pooling windows to produce at least one output position.
    """
    def __init__(self, num_labels=2, num_embeddings=8438, seq_len=110):
        super().__init__()
        # Fixed hyper-parameters
        embedding_dim = 50
        kernels = 30
        k_cnn = 9
        pad_cnn = 0
        dilation_cnn = 1
        step_cnn = 1
        k_pool = 9
        pad_pool = 0
        dilation_pool = 1
        step_pool = 1

        # Output lengths of the convolution and of the pooling, using the
        # formula floor((L + 2*pad - dilation*(k - 1) - 1) / stride) + 1.
        out_cnn = (seq_len + 2*pad_cnn - dilation_cnn*(k_cnn - 1) - 1)//step_cnn + 1
        out_pool = (out_cnn + 2*pad_pool - dilation_pool*(k_pool - 1) - 1)//step_pool + 1
        if out_cnn < 1 or out_pool < 1:
            raise ValueError(
                f"seq_len={seq_len} is too short for the convolution/pooling windows"
            )

        # Embedding layer
        self.emb = nn.Embedding(num_embeddings, embedding_dim)

        # Convolutional block
        self.cnn = nn.Sequential(
            nn.Conv1d(in_channels=embedding_dim, out_channels=kernels, kernel_size=k_cnn,
                      padding=pad_cnn, dilation=dilation_cnn, stride=step_cnn),
            # Activation
            nn.ReLU(),
            # Pooling: uses k_pool so the layer and the size formula above
            # can never disagree.
            nn.MaxPool1d(kernel_size=k_pool, padding=pad_pool,
                         dilation=dilation_pool, stride=step_pool))

        # Flattening
        self.flatten = nn.Flatten()

        # Number of features entering the classifier
        self.num_features = kernels*out_pool

        # Classification layer
        self.cls = nn.Linear(self.num_features, num_labels)

    # Forward pass
    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: integer tensor indexed as (batch, position).

        Returns:
            Tensor of logits with one row per sequence and ``num_labels``
            columns.
        """
        x = self.emb(x)
        # Conv1d expects channels before positions: move the embedding
        # dimension to axis 1.
        x = x.permute(0, 2, 1)
        x = self.cnn(x)
        x = self.flatten(x)
        x = self.cls(x)
        return x

In [44]:
# Instantiate the network with its default arguments and print its layer structure.
model = CNN()
print(model)

CNN(
  (emb): Embedding(8438, 50)
  (cnn): Sequential(
    (0): Conv1d(50, 30, kernel_size=(9,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=9, stride=1, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (cls): Linear(in_features=2820, out_features=2, bias=True)
)


In [45]:
# Smoke test: forward pass on the first padded training sequence (a batch of one), printing input and output shapes
x = torch.tensor([X_train[0]])
y = model(x)
print(f'{x.shape} => {y.shape}')

torch.Size([1, 110]) => torch.Size([1, 2])


In [46]:
# Build the tensors directly as int64. `torch.Tensor(...)` would first create
# a float32 tensor, which cannot represent every integer exactly.
X_train_t = torch.tensor(X_train, dtype=torch.int64)
y_train_t = torch.tensor(train_labels, dtype=torch.int64)

trn_dataset = TensorDataset(X_train_t, y_train_t)  # pairs each sequence with its label
# No batch_size/shuffle is given, so the DataLoader defaults apply
# (batch_size=1, shuffle=False).
trn_dl = DataLoader(trn_dataset)

In [47]:
# Build the tensors directly as int64. `torch.Tensor(...)` would first create
# a float32 tensor, which cannot represent every integer exactly.
X_test_t = torch.tensor(X_test, dtype=torch.int64)
y_test_t = torch.tensor(test_labels, dtype=torch.int64)

tst_dataset = TensorDataset(X_test_t, y_test_t)  # pairs each sequence with its label
# No batch_size/shuffle is given, so the DataLoader defaults apply
# (batch_size=1, shuffle=False).
tst_dl = DataLoader(tst_dataset)

## Modelo paralelo

In [50]:
def train_epoch_GPU(dl, model, opt):
    """Run one optimisation pass over every batch of ``dl``.

    Each batch is moved to the device that holds the model's parameters, so
    the same code works whether the model lives on the CPU or on a GPU.

    Args:
        dl: iterable yielding ``(x, y_true)`` batches; ``x`` holds token ids
            and ``y_true`` holds class indices.
        model: module mapping ``x`` to logits; it must have at least one
            parameter. It is switched to training mode.
        opt: optimizer already bound to the model's parameters.

    Returns:
        None. The model's parameters are updated in place.
    """
    # Batches must live where the weights live.
    device = next(model.parameters()).device
    model.train()

    # for every batch
    for x, y_true in dl:
        x = x.to(device=device, dtype=torch.int64)
        y_true = y_true.to(device)

        # compute the logits
        y_lgts = model(x)

        # compute the loss
        loss = F.cross_entropy(y_lgts, y_true)

        # clear the gradients left by the previous step
        opt.zero_grad()

        # back-propagate
        loss.backward()

        # update the parameters
        opt.step()


def eval_epoch_GPU(dl, model, num_batches=None):
    """Measure loss and accuracy of ``model`` over the batches of ``dl``.

    Batches are moved to the device that holds the model's parameters. The
    model is put in evaluation mode for the measurement and its previous
    mode is restored afterwards. Both metrics are averaged per sample, so
    batches of different sizes are weighted correctly.

    Args:
        dl: iterable yielding ``(x, y_true)`` batches; ``y_true`` holds
            class indices.
        model: module mapping ``x`` to logits; it must have at least one
            parameter.
        num_batches: evaluate only the first ``num_batches`` batches;
            ``None`` uses all of them.

    Returns:
        ``(loss, acc)`` as Python floats: the mean cross-entropy multiplied
        by 100 and the accuracy as a percentage. Both are NaN when no
        sample was evaluated.
    """
    device = next(model.parameters()).device
    was_training = model.training
    model.eval()

    total_loss, total_correct, total_seen = 0.0, 0, 0
    try:
        # do not record operations in the computation graph
        with torch.no_grad():
            for x, y_true in take(dl, num_batches):
                x = x.to(device)
                y_true = y_true.to(device)

                # compute the logits
                y_lgts = model(x)

                # predicted class; softmax is monotonic, so the argmax of
                # the logits equals the argmax of the probabilities
                y_pred = torch.argmax(y_lgts, 1)

                # accumulate summed loss and hit count for a per-sample mean
                total_loss += F.cross_entropy(y_lgts, y_true, reduction='sum').item()
                total_correct += (y_true == y_pred).sum().item()
                total_seen += y_true.numel()
    finally:
        # leave the model in the mode the caller had it in
        model.train(was_training)

    if total_seen == 0:
        return float('nan'), float('nan')

    loss = total_loss / total_seen * 100
    acc = total_correct / total_seen * 100
    return loss, acc
        
        
def train_GPU(model, trn_dl, tst_dl, lr=1e-3, epochs=20,
          trn_batches=None, tst_batches=None):
    """Train ``model`` with SGD and report metrics after every epoch.

    After each training pass the model is evaluated on the training and on
    the test loader, the metrics are stored and a one-line summary is
    printed.

    Args:
        model: network to optimise.
        trn_dl: loader with the training batches.
        tst_dl: loader with the test batches.
        lr: learning rate of the SGD optimizer.
        epochs: number of passes over ``trn_dl``.
        trn_batches: number of training batches used for evaluation
            (``None`` means all of them).
        tst_batches: number of test batches used for evaluation
            (``None`` means all of them).

    Returns:
        ``(loss_hist, acc_hist)``: two lists with one ``[train, test]`` pair
        per epoch.
    """
    # optimizer
    optimizador = optim.SGD(model.parameters(), lr=lr)

    # per-epoch histories
    loss_hist = []
    acc_hist = []

    for epoch in trange(epochs):
        # one optimisation pass over the training data
        train_epoch_GPU(trn_dl, model, optimizador)

        # metrics on the training data, then on the test data
        trn_loss, trn_acc = eval_epoch_GPU(trn_dl, model, trn_batches)
        tst_loss, tst_acc = eval_epoch_GPU(tst_dl, model, tst_batches)

        loss_hist.append([trn_loss, tst_loss])
        acc_hist.append([trn_acc, tst_acc])

        # progress line
        resumen = (f'E{epoch:02} '
                   f'loss=[{trn_loss:6.2f},{tst_loss:6.2f}] '
                   f'acc=[{trn_acc:5.2f},{tst_acc:5.2f}]')
        print(resumen)

    return loss_hist, acc_hist

In [55]:
%%time
# instanciamos un modelo
model = CNN()
# entrenamos
loss_hist, acc_hist = train_GPU(model, trn_dl, tst_dl, epochs=5)

 20%|██        | 1/5 [00:08<00:33,  8.26s/it]

E00 loss=[ 65.68, 72.80] acc=[57.43,51.36]


 40%|████      | 2/5 [00:15<00:23,  7.84s/it]

E01 loss=[ 52.20, 74.37] acc=[73.89,54.80]


 60%|██████    | 3/5 [00:23<00:15,  7.59s/it]

E02 loss=[ 40.03, 83.67] acc=[81.87,54.23]


 80%|████████  | 4/5 [00:30<00:07,  7.67s/it]

E03 loss=[ 30.13, 97.88] acc=[87.49,53.65]


100%|██████████| 5/5 [00:38<00:00,  7.70s/it]

E04 loss=[ 19.79,108.58] acc=[92.56,53.51]
CPU times: user 2min 22s, sys: 2min 36s, total: 4min 59s
Wall time: 38.5 s



