## Trabajo final módulo NLP

#### Clasificación multietiqueta de los tweets del archivo sem_eval_train_es.csv respecto a 11 sentimientos:
- anger
- anticipation
- disgust
- fear
- joy
- love
- optimism
- pessimism
- sadness
- surprise
- trust

#### Cada tweet puede pertenecer a varias clases

### En este notebook probaré un modelo sencillo usando una RNN como clasificador multietiqueta.

In [5]:
import pandas as pd
import re, string, spacy
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, SpatialDropout1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [6]:
# Number of units passed to the recurrent layer of the model.
RNN_DIM = 50
# Keras recurrent layer class chosen for the experiment (an alias of LSTM here).
RNN_layer = LSTM

In [7]:
# Read the tweet dataset from CSV; the 'ID' column becomes the frame's index.
df = pd.read_csv('sem_eval_train_es.csv', index_col='ID')

# Bare expression so the notebook renders the loaded frame.
df

Unnamed: 0_level_0,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-Es-01643,@aliciaenp Ajajjaa somos del clan twitteras pe...,False,False,False,False,True,False,False,False,False,False,False
2018-Es-05142,@AwadaNai la mala suerte del gato fichame la c...,False,False,False,True,False,False,False,True,False,False,False
2018-Es-05379,@audiomano A mí tampoco me agrado mucho eso. E...,True,False,False,False,False,False,False,False,False,False,False
2018-Es-00208,Para llevar a los bebes de un lugar a otro deb...,False,False,False,False,True,False,False,False,False,False,False
2018-Es-01385,@DalasReview me encanta la terrible hipocresia...,True,False,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
2018-Es-06340,Ahorita quisiera que mi preocupación más grand...,False,False,False,True,False,False,False,True,True,False,False
2018-Es-00439,El mayor criminal del país diciéndole “delincu...,True,False,True,False,False,False,False,False,False,False,False
2018-Es-04919,Mi prima de 4 años se ha enfadado conmigo porq...,True,False,False,False,False,False,False,False,False,True,False
2018-Es-02703,@lennycia Jajaja... Ya seee,False,False,False,False,True,False,False,False,False,False,False


In [8]:
# Spanish spaCy pipeline; clean_text uses it to tokenise and lemmatise tweets.
nlp = spacy.load('es_core_news_md')

# Character class matching any single character of string.punctuation
# (escaped so that regex metacharacters are taken literally).
pattern2 = re.compile('[{}]'.format(re.escape(string.punctuation)))

def clean_text(text, lemas=False):
    """Normalise a tweet into a lowercase, space-separated token string.

    Mentions (``@user``) and URLs are removed first; the remaining text is
    tokenised with the module-level spaCy pipeline ``nlp``. Punctuation and
    whitespace tokens are skipped, punctuation characters left inside a token
    are stripped with ``pattern2``, and tokens that end up empty are dropped
    so they do not leave stray separators in the result.

    Args:
        text: Raw tweet text.
        lemas: If True, emit each token's lowercased lemma; otherwise its
            lowercased surface form.

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when nothing
        survives the cleaning.
    """
    text = re.sub(r'@[\w_]+|https?://[\w_./]+', '', text)

    words = []
    for tok in nlp(text):
        # Deleting mentions/URLs leaves runs of spaces that the tokenizer
        # returns as whitespace tokens; they carry no content.
        if tok.is_punct or tok.is_space:
            continue
        word = tok.lemma_.lower() if lemas else tok.lower_
        word = pattern2.sub('', word).strip()
        # Symbol-only tokens (e.g. '+', '$') become empty once stripped.
        if word:
            words.append(word)

    return ' '.join(words)

In [9]:
# Normalise every tweet (lemmatised form).
df.Tweet = df.Tweet.apply(clean_text, lemas = True)
# Drop tweets with no content left after cleaning. The filtered frame has to
# be assigned back, otherwise no row is removed; .copy() gives an independent
# frame so the label-column assignment below does not act on a view.
df = df[df['Tweet'].str.strip() != ''].copy()

label_columns = ['anger', 'anticipation', 'disgust', 'fear', 'joy',
                 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']
# Boolean flags -> 0/1 integers, one column per label.
df[label_columns] = df[label_columns].astype(int)
Y = df[label_columns].values

# Split the data into training and test sets (20% held out, fixed seed).
tweets_train, tweets_test, Y_train, Y_test = train_test_split(df.Tweet, Y,
    test_size=0.2, random_state=0)

In [10]:
# Count the distinct terms that occur in at least two training tweets;
# this count is used further down to size the model's vocabulary.
cv = CountVectorizer(min_df=2).fit(tweets_train)
max_features = len(cv.vocabulary_)
max_features

2003

In [11]:
# Vocabulary budget for the tokenizer: the CountVectorizer term count plus two
# extra ids (presumably the reserved id 0 and the OOV token — see Keras docs).
MAX_NB_WORDS = max_features + 2
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, split=' ', oov_token='OOV')
tokenizer.fit_on_texts(tweets_train.values)

word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

# Training tweets -> integer id sequences, zero-padded at the end to the
# length of the longest training sequence.
train_sequences = tokenizer.texts_to_sequences(tweets_train.values)
X_train = pad_sequences(train_sequences, padding='post')
MAX_SEQUENCE_LENGTH = X_train.shape[1]

# Test tweets are encoded with the same tokenizer and forced to the same length.
test_sequences = tokenizer.texts_to_sequences(tweets_test.values)
X_test = pad_sequences(test_sequences, padding='post', maxlen=MAX_SEQUENCE_LENGTH)

Found 6378 unique tokens.


In [12]:
# Build the pretrained embedding matrix from the spaCy pipeline `nlp` that was
# already loaded for text cleaning (same 'es_core_news_md' model, so it is not
# loaded a second time here).
EMBREDDING_DIM = nlp.vocab.vectors_length
# Row i holds the vector of the word with tokenizer id i; rows stay all-zero
# for ids whose word has no pretrained vector.
embedding_matrix = np.zeros((MAX_NB_WORDS, EMBREDDING_DIM))
vectores = 0  # number of vocabulary words for which a vector was found

for word, i in word_index.items():
    # Ids at or above MAX_NB_WORDS have no row in the matrix.
    if i >= MAX_NB_WORDS:
        continue
    lexeme = nlp.vocab[word]  # look the word up once and reuse the lexeme
    if lexeme.has_vector:
        embedding_matrix[i] = lexeme.vector
        vectores += 1

In [13]:
# Frozen embedding layer initialised from the pretrained matrix; id 0 is
# treated as padding and masked out for the downstream layers.
embedding_layer = Embedding(MAX_NB_WORDS,
                            EMBREDDING_DIM,
                            weights = [embedding_matrix],
                            input_length = MAX_SEQUENCE_LENGTH,
                            trainable = False,
                            mask_zero = True)

model = Sequential()
model.add(embedding_layer)
model.add(SpatialDropout1D(0.4))
# Use the recurrent layer class and size chosen in the configuration cell.
model.add(RNN_layer(RNN_DIM, dropout=0.2, recurrent_dropout=0.2))
# One sigmoid output per label, so several labels can be active for a tweet.
model.add(Dense(len(label_columns), activation='sigmoid'))
# 'binary_crossentropy' scores each label independently (multi-label setup).
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 32, 300)           601500    
                                                                 
 spatial_dropout1d (SpatialD  (None, 32, 300)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 50)                70200     
                                                                 
 dense (Dense)               (None, 11)                561       
                                                                 
Total params: 672,261
Trainable params: 70,761
Non-trainable params: 601,500
_________________________________________________________________
None


2024-09-05 19:41:34.390250: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-09-05 19:41:34.390404: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-09-05 19:41:34.390459: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2024-09-05 19:41:34.390496: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2024-09-05 19:41:34.390530: W tensorflow/c