In [1]:
# Tools
import os
import time
import shutil
import random
from argparse import Namespace
import matplotlib.pyplot as plt

# Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk import ngrams
from nltk.tokenize import TweetTokenizer
from nltk import FreqDist
import pandas as pd
import numpy as np

# PyTorch
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F

# scikit-learn
from sklearn.metrics import accuracy_score

In [2]:
# Seed every random number generator the notebook touches so that
# Restart & Run All reproduces the same results.
# NOTE: the name `sedd` is kept unchanged in case later cells reference it.
sedd = 1111
random.seed(sedd)
np.random.seed(sedd)
torch.manual_seed(sedd)
# cuDNN: turn off the benchmark auto-tuner (it may select different
# algorithms from one run to the next) and request deterministic kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


In [3]:
def read_lines(path: str) -> list:
    """Return the non-blank lines of a UTF-8 text file, stripped of whitespace.

    The corpus files hold one tweet per line. Reading them directly (instead
    of going through ``pd.read_csv`` with a regex separator) keeps every line
    a ``str``: the CSV reader's default NA handling would turn a tweet whose
    whole text is e.g. "NA" or "null" into a float NaN.

    Args:
        path: Location of the text file.

    Returns:
        One string per non-blank line, in file order.
    """
    # "utf-8-sig" also swallows a leading byte-order mark if the file has one.
    with open(path, encoding="utf-8-sig") as handle:
        stripped = (line.strip() for line in handle)
        return [line for line in stripped if line]


x_train = read_lines("data/mex/mex20_train.txt")
x_val = read_lines("data/mex/mex20_val.txt")
print("Número de ejemplos de entrenamiento:", len(x_train))
print(x_train[:10])
print("Número de ejemplos de validación:", len(x_val))
print(x_val[:10])

Número de ejemplos de entrenamiento: 5278
['@USUARIO @USUARIO @USUARIO Q se puede esperar del maricon de closet de la Ya√±ez aun recuerdo esa ves q lo vi en zona rosa viendo quien lo levantada', '@USUARIO La piel nueva siempre arde un poquito los primeros d√≠as... y m√°s con este puto clima', 'Ustedes no se enamoran de m√≠‚Ä¶ por tontas.', 'Me las va a pagar esa puta gorda roba tuits...', '@USUARIO LA GENTE ES TONTA PORQUE NO SE DAN CUENTA QUE T√ö HACES A BATMAN AZUL', 'Estoy muy encabronada con las pseudo feministas por tontas e iletradas, a veces me averg√ºenza ser mujer; preferir√≠a tener un falo. #NiUnaMas', 'Anden putos, recuerdan el #noerapenal #Holanda fuera de #Rusia2018, esto se llama #karma ehhhhhhhh #puuuuuutos', 'Si no tienen chichis no traten de ense√±ar se ven muy mal y m√°s cuando son prietas.', 'Ojal√° asi me agarrars cuando te digo que me voy en lugar de correrme a la verga cada 5 minutos.', '@USUARIO @USUARIO @USUARIO @USUARIO Es solo un HDP aprovechado y que su "Dio

In [4]:
# Shared hyper-parameter container; N is the n-gram order
# (N - 1 context tokens are used to predict the N-th one).
args = Namespace(N=4)

In [5]:
# Tokens that are never admitted into the vocabulary: ASCII punctuation,
# Spanish opening marks, typographic dashes/quotes and placeholder tags.
# Non-ASCII entries are written as escapes so that they survive any
# re-encoding of the notebook file.
lista_excluidas = {
    '.', ',', ';', ':', '!', '?',
    '\u00bf', '\u00a1',                      # inverted question / exclamation marks
    '"', "'", '(', ')', '[', ']', '{', '}', '-', '_',
    '\u2014',                                # em dash
    '...',
    '@', '#', '$', '%', '^', '&', '*', '/', '|', '~', '`', '<', '>',
    '\u00ab', '\u00bb',                      # angle quotation marks
    '\u201c', '\u201d', '\u2018', '\u2019',  # curly double / single quotes
    '<url>', '<@usuario>',
}

class NgramData:
    """Vocabulary builder and n-gram encoder for an N-gram language model.

    ``fit`` learns a lower-cased vocabulary of at most ``vocab_size`` entries
    (three of them reserved for the ``<unk>``, ``<s>`` and ``</s>`` markers)
    and assigns an integer id to each entry. ``transform`` slides a window of
    ``N`` tokens over every document and returns the ids of the first
    ``N - 1`` tokens (context) and of the last token (target).
    """

    def __init__(self, N: int, vocab_size: int, tokenizer = None, embeddinds_model = None):
        """Store the configuration; nothing is learned until ``fit`` is called.

        Args:
            N: Order of the n-grams (window length). Must be at least 1.
            vocab_size: Maximum vocabulary size, special tokens included.
            tokenizer: Callable ``str -> list[str]``. Defaults to whitespace
                splitting.
            embeddinds_model: Optional pre-trained embedding lookup. It must
                expose ``vector_size`` and support ``word in model`` and
                ``model[word]``.

        Raises:
            ValueError: If ``N`` is smaller than 1.
        """
        if N < 1:
            raise ValueError(f"N must be >= 1, got {N}")
        self.tokenizer = tokenizer if tokenizer is not None else self.default_tokenizer
        self.punct = lista_excluidas
        self.N = N
        self.vocab_size = vocab_size
        self.UNK = "<unk>"
        self.SOS = "<s>"
        self.EOS = "</s>"
        self.embeddinds_model = embeddinds_model

    def default_tokenizer(self, text: str) -> list:
        """Split ``text`` on whitespace."""
        return text.split()

    def remove_word(self, word: str) -> bool:
        """Return True when ``word`` is an excluded token or consists only of digits."""
        lowered = word.lower()
        return lowered in self.punct or lowered.isdigit()

    def get_vocab(self, corpus: list) -> set:
        """Return the most frequent lower-cased tokens of ``corpus``.

        At most ``vocab_size - 3`` words are returned because three slots are
        reserved for the special tokens. Excluded tokens (see ``remove_word``)
        and literal occurrences of the special tokens are not counted, so they
        cannot take up one of the regular slots.
        """
        specials = {self.UNK, self.SOS, self.EOS}
        freq_dist = FreqDist()
        for sentence in corpus:
            freq_dist.update(
                token.lower()
                for token in self.tokenizer(sentence)
                if not self.remove_word(token) and token.lower() not in specials
            )
        most_common = freq_dist.most_common(max(self.vocab_size - 3, 0))
        return {word for word, _ in most_common}

    def fit(self, corpus: list) -> None:
        """Learn the vocabulary and the word <-> id mappings from ``corpus``.

        Regular words receive ids in order of first appearance in the corpus;
        ``<unk>``, ``<s>`` and ``</s>`` receive the last three ids, in that
        order. When an embedding model was supplied, ``embedding_matriz`` is
        built with one row per id: the pre-trained vector when the model knows
        the word, a standard-normal random vector otherwise.
        """
        special_tokens = (self.UNK, self.SOS, self.EOS)
        regular_words = self.get_vocab(corpus) - set(special_tokens)
        self.vocab = regular_words | set(special_tokens)

        self.word_to_id = {}
        self.id_to_word = {}

        next_id = 0
        for doc in corpus:
            for word in self.tokenizer(doc):
                word_ = word.lower()
                # Only regular words are numbered here; numbering a special
                # token in this loop would hand it a second id further down.
                if word_ in regular_words and word_ not in self.word_to_id:
                    self.word_to_id[word_] = next_id
                    self.id_to_word[next_id] = word_
                    next_id += 1

        for token in special_tokens:
            self.word_to_id[token] = next_id
            self.id_to_word[next_id] = token
            next_id += 1

        if self.embeddinds_model is not None:
            dim = self.embeddinds_model.vector_size
            # Start from random rows so that every id (special tokens
            # included) has a defined vector, then overwrite known words.
            self.embedding_matriz = np.random.normal(size=(len(self.word_to_id), dim))
            for word_, idx in self.word_to_id.items():
                if word_ in self.embeddinds_model:
                    self.embedding_matriz[idx] = self.embeddinds_model[word_]

    def replace_unk(self, doc_tokens: list) -> list:
        """Return a copy of ``doc_tokens`` with out-of-vocabulary tokens set to ``<unk>``.

        Membership is checked on the lower-cased token; in-vocabulary tokens
        keep their original casing. The input list is left untouched.
        """
        return [
            token if token.lower() in self.vocab else self.UNK
            for token in doc_tokens
        ]

    def get_ngram_doc(self, doc: str) -> list:
        """Return the list of ``N``-token tuples for one document.

        The document is tokenized, out-of-vocabulary tokens become ``<unk>``
        (this includes the punctuation/digit tokens left out of the
        vocabulary: they are replaced, not dropped), everything is
        lower-cased, and the sequence is padded with ``N - 1`` ``<s>`` markers
        in front and one ``</s>`` at the end.
        """
        tokens = [w.lower() for w in self.replace_unk(self.tokenizer(doc))]
        padded = [self.SOS] * (self.N - 1) + tokens + [self.EOS]
        return list(ngrams(padded, self.N))

    def transform(self, corpus: list) -> tuple[np.ndarray, np.ndarray]:
        """Encode ``corpus`` as context/target id arrays.

        Returns:
            ``(x, y)`` where ``x`` has shape ``(n_windows, N - 1)`` and ``y``
            has shape ``(n_windows,)``, both int64. An empty corpus yields
            arrays with zero rows and the same number of columns.
        """
        x_ngrams = []
        y = []
        for doc in corpus:
            for window in self.get_ngram_doc(doc):
                ids = [self.word_to_id[w] for w in window]
                x_ngrams.append(ids[:-1])
                y.append(ids[-1])

        # The explicit reshape keeps the 2-D layout even when there are no windows.
        x_arr = np.array(x_ngrams, dtype=np.int64).reshape(len(y), self.N - 1)
        return x_arr, np.array(y, dtype=np.int64)

    # =========== PROPERTIES ===========
    @property
    def size(self) -> int:
        """Number of entries in the fitted vocabulary, special tokens included."""
        return len(self.vocab)
        
                    
    

In [6]:
# Tokenize with NLTK's TweetTokenizer, then learn a vocabulary capped at
# 5,000 entries (special tokens included) from the training tweets only.
tk = TweetTokenizer()

ngram_data = NgramData(N=args.N, vocab_size=5_000, tokenizer=tk.tokenize)
ngram_data.fit(corpus=x_train)


In [7]:
# Report the fitted vocabulary size (special tokens included).
print(f"Tamaño del vocabulario: {ngram_data.size:,}")

Tamaño del vocabulario: 5,000


In [8]:
# Encode both splits with the vocabulary fitted on the training set;
# each call returns a (contexts, targets) pair of id arrays.
(x_ngram_train, y_ngram_train), (x_ngram_val, y_ngram_val) = (
    ngram_data.transform(split) for split in (x_train, x_val)
)


In [9]:
# Inspect the context ids: one row per n-gram window, N - 1 ids per row.
x_ngram_train

array([[4998, 4998, 4998],
       [4998, 4998,    0],
       [4998,    0,    0],
       ...,
       [4997,  937,   32],
       [ 937,   32, 2524],
       [  32, 2524, 4997]])

In [10]:
# Inspect the target ids: the id of the last token of each window.
y_ngram_train

array([   0,    0,    0, ..., 2524, 4997, 4999])

In [11]:
# Shapes of the n-gram context/target arrays for both splits
x_train_shape = x_ngram_train.shape
y_train_shape = y_ngram_train.shape

x_val_shape = x_ngram_val.shape
y_val_shape = y_ngram_val.shape
print("TAMAÑO DE LOS NGRAMS DE ENTRENAMIENTO")
print(f"x_ngram_train: {x_train_shape}")
print(f"y_ngram_train: {y_train_shape}")
print("TAMAÑO DE LOS NGRAMS DE VALIDACIÓN")
print(f"x_ngram_val: {x_val_shape}")
print(f"y_ngram_val: {y_val_shape}")

TAMAÑO DE LOS NGRAMS DE ENTRENAMIENTO
x_ngram_train: (102751, 3)
y_ngram_train: (102751,)
TAMAÑO DE LOS NGRAMS DE VALIDACIÓN
x_ngram_val: (11558, 3)
y_ngram_val: (11558,)


Nota: creo que los tamaños varían según la lista de palabras excluidas que tengo.

In [12]:
# Decode the first 22 training contexts back into words as a sanity check.
lista_palabras = []
for ventana in x_ngram_train[:22]:
    lista_palabras.append([ngram_data.id_to_word[idx] for idx in ventana])

for numero, palabras in enumerate(lista_palabras, start=1):
    print(f"{numero}: {palabras}")

1: ['<s>', '<s>', '<s>']
2: ['<s>', '<s>', '@usuario']
3: ['<s>', '@usuario', '@usuario']
4: ['@usuario', '@usuario', '@usuario']
5: ['@usuario', '@usuario', 'q']
6: ['@usuario', 'q', 'se']
7: ['q', 'se', 'puede']
8: ['se', 'puede', 'esperar']
9: ['puede', 'esperar', 'del']
10: ['esperar', 'del', 'maricon']
11: ['del', 'maricon', 'de']
12: ['maricon', 'de', 'closet']
13: ['de', 'closet', 'de']
14: ['closet', 'de', 'la']
15: ['de', 'la', 'yañez']
16: ['la', 'yañez', 'aun']
17: ['yañez', 'aun', 'recuerdo']
18: ['aun', 'recuerdo', 'esa']
19: ['recuerdo', 'esa', 'ves']
20: ['esa', 'ves', 'q']
21: ['ves', 'q', 'lo']
22: ['q', 'lo', 'vi']


In [13]:
# Target ids once more (same array as shown earlier), displayed next to the
# decoding cell that follows.
y_ngram_train

array([   0,    0,    0, ..., 2524, 4997, 4999])

In [14]:
# Decode the first 22 targets; entry k is the word that follows context k.
lista_palbras_en_sus_ys = []
for idx in y_ngram_train[:22]:
    lista_palbras_en_sus_ys.append(ngram_data.id_to_word[idx])

for numero, palabra in enumerate(lista_palbras_en_sus_ys, start=1):
    print(f"{numero}: {palabra}")

1: @usuario
2: @usuario
3: @usuario
4: q
5: se
6: puede
7: esperar
8: del
9: maricon
10: de
11: closet
12: de
13: la
14: yañez
15: aun
16: recuerdo
17: esa
18: ves
19: q
20: lo
21: vi
22: en


In [15]:
# DataLoader settings: examples per batch and number of loading workers.
vars(args).update(batch_size=64, num_workers=2)
# Token ids are stored as 64-bit integers.
DTYPE = torch.int64


def tensor_dataset(x: np.ndarray, y: np.ndarray) -> TensorDataset:
    """Wrap context ids ``x`` and target ids ``y`` in a ``TensorDataset``.

    Both arrays are copied into new tensors of dtype ``DTYPE``.
    """
    contexts, targets = (torch.tensor(arr, dtype=DTYPE) for arr in (x, y))
    return TensorDataset(contexts, targets)
    
def data_loader(dataset: TensorDataset, shuffle: bool,
                batch_size=None, num_workers=None) -> DataLoader:
    """Build a ``DataLoader`` over ``dataset``.

    Args:
        dataset: Dataset to iterate over.
        shuffle: Whether to reshuffle the examples at every epoch.
        batch_size: Examples per batch. Defaults to ``args.batch_size``.
        num_workers: Loader worker processes. Defaults to ``args.num_workers``.

    Returns:
        The configured ``DataLoader``.
    """
    # Fall back to the notebook-wide configuration only when no explicit
    # value is given, so the helper can also be used without `args`.
    if batch_size is None:
        batch_size = args.batch_size
    if num_workers is None:
        num_workers = args.num_workers
    return DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
    )
    
    
# Crear los DataLoaders
# Wrap each split in a TensorDataset ...
train_dataset = tensor_dataset(x=x_ngram_train, y=y_ngram_train)
val_dataset = tensor_dataset(x=x_ngram_val, y=y_ngram_val)

# ... and build the loaders: training batches are reshuffled every epoch,
# validation batches keep the original order.
train_loader = data_loader(dataset=train_dataset, shuffle=True)
val_loader = data_loader(dataset=val_dataset, shuffle=False)