## NLP - Avaliação Final - Twitter Sentiment Analysis

### Parte 1: Produto e base de dados

In [None]:
# Importando bibliotecas

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import joblib
import time

In [None]:
# First read of the data

# Column names are supplied explicitly through `names=`, and the same list is
# reused so both frames are guaranteed to share one schema.
colunas = ['id','theme','sentiment','text']
df_training = pd.read_csv('twitter_sentiment/twitter_training.csv', names = colunas)
df_test = pd.read_csv('twitter_sentiment/twitter_validation.csv', names = colunas)
df_training.tail()

In [None]:
# Stack the training and validation CSVs into one frame used by every approach below.
frames = [df_training, df_test]
# NOTE(review): concat is called without ignore_index, so each frame keeps its own
# row labels and index values repeat in df_join.
df_join = pd.concat(frames)
# NOTE(review): df_cleaned is not referenced again in the visible notebook; the later
# cells keep reading df_join, i.e. rows with missing values are still included.
df_cleaned = df_join.dropna()
# Shape of the joined frame (before dropping missing values).
df_join.shape

### Parte 2: Estratégias de Machine Learning

#### 2A - Abordagem tradicional 'baseline'

In [None]:
# Classifier + vectorizer: bag-of-words baseline (CountVectorizer -> LogisticRegression)

n_components = 3  # NOTE(review): not used in the visible notebook; kept in case later cells rely on it

# astype('U') coerces every value to a string, so a missing text becomes the literal 'nan'.
# random_state pins the 70/30 split so the reported accuracy is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_join['text'].values.astype('U'),
    df_join['sentiment'].values.astype('U'),
    train_size=0.70,
    random_state=42,
)
classificador = Pipeline([
                        ('meu_vetorizador', CountVectorizer(stop_words='english')),
                        ('meu_classificador', LogisticRegression(penalty='l2', solver='lbfgs', max_iter=10000))
                        ])

# Only the fit is timed; persistence and evaluation happen after the clock stops.
start_time = time.time()

classificador.fit(X_train,y_train)

end_time = time.time()

elapsed_time = end_time - start_time

print("Time taken by the pipeline: {:.2f} seconds".format(elapsed_time))

# Persist the fitted pipeline so it can be reloaded without retraining.
joblib.dump(classificador, 'NLP_AF_A.joblib')
y_pred = classificador.predict(X_test)
# scikit-learn's signature is accuracy_score(y_true, y_pred).
acc = accuracy_score(y_test, y_pred)
acc

In [None]:
# Reload the persisted baseline and classify one sentence by picking the most probable class.
# NOTE(review): joblib files are pickle-based; only load artefacts produced by this notebook.
model = joblib.load("NLP_AF_A.joblib")
probabilidades = model.predict_proba(['I hate Brazil'])[0]
y_pred = list(probabilidades)
# argmax returns the first position of the maximum, like list.index(max(...)).
classification = int(np.argmax(probabilidades))
return_class = model.classes_[classification]
return_class

In [None]:
# Time a single-sentence inference with the fitted baseline pipeline.
start_time = time.time()

y_pred = classificador.predict_proba(["I enjoyed my weekend"])

# The print sits inside the timed region, so the measurement includes console output.
print(y_pred)

end_time = time.time()

elapsed_time = end_time - start_time

print("Time taken by the pipeline: {:.2f} seconds".format(elapsed_time))

In [None]:
# Token -> column-index mapping and the coefficient matrix of the fitted pipeline.
vocabulario = classificador['meu_vetorizador'].vocabulary_
pesos = classificador['meu_classificador'].coef_
print(pesos.shape)

classe_alvo = 1
classe_alvo_str = classificador.classes_[classe_alvo]

# One (coefficient, token) tuple per vocabulary entry for the target class.
palavras_e_pesos = [(pesos[classe_alvo, j], palavra) for palavra, j in vocabulario.items()]

# reverse=True asks for descending order, so the largest weights come first.
tuplas_ordenadas = sorted(palavras_e_pesos, reverse=True)
palavras = [palavra for _, palavra in tuplas_ordenadas]
contagens = [coeficiente for coeficiente, _ in tuplas_ordenadas]

n_palavras = 10
eixo_x = np.arange(n_palavras)


def _plotar_pesos(titulo, rotulos, alturas):
    """Draw one wide bar chart of logistic-regression weights, one bar per token."""
    plt.figure(figsize=(14,1))
    plt.title(titulo)
    plt.bar(eixo_x, alturas)
    plt.xticks(eixo_x, rotulos, rotation=20, fontsize = 12)
    plt.ylabel('Pesos do regressor\nlogístico')
    plt.show()


# Head of the ranking: tokens with the largest weights for the target class.
_plotar_pesos('Palavras que mais levam a {}'.format(classe_alvo_str),
              palavras[:n_palavras], contagens[:n_palavras])

# Tail of the ranking: tokens with the smallest (most negative) weights.
_plotar_pesos('Palavras que mais afastam de {}'.format(classe_alvo_str),
              palavras[-n_palavras:], contagens[-n_palavras:])

In [None]:
# Show the class labels in the order used by coef_ rows and predict_proba columns.
classificador.classes_

#### 2B - Abordagem com Deep Learning treinada in-house

In [None]:
# Importando bibliotecas - rede neural

from tensorflow.keras.layers import Input, Dense, Activation, TimeDistributed, Lambda, Softmax, TextVectorization, Reshape, RepeatVector, GRU, Conv1D, Bidirectional, AveragePooling1D, UpSampling1D, Embedding, Concatenate, GlobalAveragePooling1D, LSTM, Multiply
from tensorflow.keras.models import Model
import pandas as pd
import os
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
import matplotlib.pyplot as plt


In [None]:
# Splitting the datasets into minibatches
# `df` is a second name for the joined frame (same object, not a copy).
df = df_join
# Root folder of the on-disk text dataset read by text_dataset_from_directory below.
DATASET_DIR = './twitter_dataset'

In [None]:
# # Não rodar novamente
# os.mkdir(DATASET_DIR)
# os.mkdir(DATASET_DIR + "/train")
# os.mkdir(DATASET_DIR + "/train/Positive")
# os.mkdir(DATASET_DIR + "/train/Negative")
# os.mkdir(DATASET_DIR + "/train/Irrelevant")
# os.mkdir(DATASET_DIR + "/train/Neutral")
# os.mkdir(DATASET_DIR + "/test")
# os.mkdir(DATASET_DIR + "/test/Positive")
# os.mkdir(DATASET_DIR + "/test/Negative")
# os.mkdir(DATASET_DIR + "/test/Irrelevant")
# os.mkdir(DATASET_DIR + "/test/Neutral")

In [None]:
# reviews = df['text']
# labels = df['sentiment']
# x_train, x_test, y_train, y_test = train_test_split(reviews, labels, train_size=0.8)

# n_texto = 0
# for i in tqdm(range(len(y_train))):
#     texto = x_train.iloc[i]
#     fname = 'review_' + str(n_texto) + '.txt'
#     with open(DATASET_DIR + "/train/" + y_train.iloc[i] + "/" + fname, 'w', encoding = 'utf-8') as f:
#         f.write(str(texto))
#         n_texto += 1

# for i in tqdm(range(len(y_test))):
#     texto = x_test.iloc[i]
#     fname = 'review_' + str(n_texto) + '.txt'
#     with open(DATASET_DIR + "/test/" + y_test.iloc[i] + "/" + fname, 'w', encoding = 'utf-8') as f:
#         f.write(str(texto))
#         n_texto += 1

# print("Criei textos:", n_texto)

In [None]:
# Ler um dataset e fazer batches
from tensorflow.keras.utils import text_dataset_from_directory

# Both splits are read with exactly the same options; only the folder differs.
# Labels are inferred from the sub-folder names and one-hot encoded ('categorical').
_opcoes_dataset = dict(
    labels='inferred',
    label_mode='categorical',
    class_names=None,
    batch_size=1024,
    max_length=None,
    shuffle=True,
    seed=None,
    validation_split=None,
    subset=None,
    follow_links=False,
)

dataset_train = text_dataset_from_directory(DATASET_DIR + '/train', **_opcoes_dataset)

dataset_test = text_dataset_from_directory(DATASET_DIR+ '/test', **_opcoes_dataset)

In [None]:
from keras.layers import Input, TextVectorization
from keras.models import Model

def remover_label(x,label):
    """Drop the label of an `(x, label)` pair and return only `x`.

    Mapped over the training dataset so that the vectorizer is adapted on the
    texts alone.
    """
    return x

# Vocabulary is capped at vocab_size tokens and every text is padded/truncated to 256 ids.
vocab_size = 10000
vectorize_layer = TextVectorization(max_tokens=vocab_size, output_sequence_length=256)

# The vocabulary is learned from the training texts only (labels stripped first).
textos_treino = dataset_train.map(remover_label)
vectorize_layer.adapt(textos_treino)

In [None]:
def convolve_and_downsample(input_n_samples, input_embedding_size, n_filters, kernel_size=3, **kwargs):
    """Build a sub-model: Conv1D -> AveragePooling1D(2) -> ELU.

    Args:
        input_n_samples: length of the input sequence.
        input_embedding_size: number of channels per sequence position.
        n_filters: number of convolution filters (output channels).
        kernel_size: width of the convolution window.
        **kwargs: forwarded to the `Model` constructor (e.g. `name`).

    Returns:
        A `Model` mapping `(input_n_samples, input_embedding_size)` inputs to
        the pooled, activated feature map.
    """
    entrada = Input(shape=(input_n_samples,input_embedding_size))
    # 'same' padding keeps the sequence length through the convolution; no bias term is used.
    convolucao = Conv1D(filters=n_filters,
                        kernel_size=kernel_size,
                        padding='same',
                        use_bias=False)
    reducao = AveragePooling1D(pool_size=2)
    ativacao = Activation('elu')
    saida = ativacao(reducao(convolucao(entrada)))
    return Model(entrada, saida, **kwargs)

def deep_cnn_embedding_softmax_model(vectorize_layer, vocab_size=vocab_size, number_of_ngrams=16, n_gram_size=3):
    """Build the text classifier: vectorizer -> 2-d embedding -> 8 conv/pool stages -> softmax.

    Args:
        vectorize_layer: adapted text-vectorization layer applied to the raw strings.
        vocab_size: input dimension of the embedding layer.
        number_of_ngrams: number of filters in every convolution stage.
        n_gram_size: kernel size of every convolution stage.

    Returns:
        An uncompiled `Model` taking one string per sample and producing
        4 softmax outputs.
    """
    entrada = Input(shape=(1,), dtype=tf.string)
    x = vectorize_layer(entrada)
    x = Embedding(vocab_size, 2, name='projecao')(x)

    # First stage consumes the 2-d embedding of the 256-token sequence.
    x = convolve_and_downsample(256, 2, number_of_ngrams, n_gram_size, name='ngramas')(x)
    # Each further stage is declared with half the previous input length: 128 -> ... -> 2.
    for comprimento in (128, 64, 32, 16, 8, 4, 2):
        x = convolve_and_downsample(comprimento, number_of_ngrams, number_of_ngrams, n_gram_size)(x)

    x = Reshape( (-1,))(x)
    x = Dense(4, name='classificador')(x)
    x = Activation('softmax')(x)
    return Model(entrada, x)

clf = deep_cnn_embedding_softmax_model(vectorize_layer)
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" to the cell output.
clf.summary()
# No optimizer is passed, so Keras uses its default one.
clf.compile(loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the CNN for 30 epochs and measure the wall-clock time of the whole fit call.
start_time = time.time()

# dataset_test is evaluated after every epoch, so that evaluation is part of the measured time.
history = clf.fit(dataset_train, epochs=30, verbose=1, validation_data=dataset_test)

end_time = time.time()

elapsed_time = end_time - start_time

print("Time taken by the pipeline: {:.2f} seconds".format(elapsed_time))

In [None]:
# def convolve_and_downsample(input_n_samples, input_embedding_size, n_filters, kernel_size=3, **kwargs):
#     input_layer = Input(shape=(input_n_samples,input_embedding_size))
#     x = input_layer
#     x = Conv1D( filters=n_filters,
#                 kernel_size=kernel_size,
#                 padding='same',
#                 use_bias=False,
#                 )(x)
#     x = AveragePooling1D(pool_size=2)(x)
#     x = Activation('elu')(x)
#     return Model(input_layer, x, **kwargs)

# def deep_cnn_embedding_softmax_model(vectorize_layer, vocab_size=vocab_size, number_of_ngrams=16, n_gram_size=3):
#     input_layer = Input(shape=(1,), dtype=tf.string)
#     x = input_layer
#     x = vectorize_layer(x)
#     x = Embedding(vocab_size, 2, name='projecao')(x)
#     x = convolve_and_downsample(256, 2, number_of_ngrams, n_gram_size, name='ngramas')(x)
#     x = convolve_and_downsample(128, number_of_ngrams, number_of_ngrams, n_gram_size)(x)
#     x = convolve_and_downsample(64, number_of_ngrams, number_of_ngrams, n_gram_size)(x)
#     x = convolve_and_downsample(32, number_of_ngrams, number_of_ngrams, n_gram_size)(x)
#     x = convolve_and_downsample(16, number_of_ngrams, number_of_ngrams, n_gram_size)(x)
#     x = convolve_and_downsample(8, number_of_ngrams, number_of_ngrams, n_gram_size)(x)
#     x = convolve_and_downsample(4, number_of_ngrams, number_of_ngrams, n_gram_size)(x)
#     x = convolve_and_downsample(2, number_of_ngrams, number_of_ngrams, n_gram_size)(x)
#     x = Reshape( (-1,))(x)
#     x = Dense(4, name='classificador')(x)
#     x = Activation('softmax')(x)
#     return Model(input_layer, x)

# for i in range(0,10):
#     clf = deep_cnn_embedding_softmax_model(vectorize_layer)
#     print(clf.summary())
#     clf.compile(loss='categorical_crossentropy', metrics=['accuracy'])
    
#     start_time = time.time()

#     history = clf.fit(dataset_train, epochs=30, verbose=1, validation_data=dataset_test)

#     end_time = time.time()

#     elapsed_time = end_time - start_time

#     print("Time taken by the pipeline: {:.2f} seconds".format(elapsed_time))

In [None]:
# clf.evaluate(dataset_test)
# clf.save('NLP_AF_B')
    
from tensorflow import keras
# Replace the in-memory model with the one stored under 'NLP_AF_B'.
# NOTE(review): the matching clf.save('NLP_AF_B') is commented out in this cell, so
# this presumes the artefact already exists on disk from an earlier session.
clf = keras.models.load_model('NLP_AF_B')
# clf.evaluate(dataset_test)

In [None]:
# One figure per metric, each showing the training curve next to its validation curve.
# NOTE(review): `history` comes from the clf.fit cell, not from the reloaded model.
for metrica in ('loss', 'accuracy'):
    chave_validacao = 'val_' + metrica
    plt.figure(figsize=(14,1))
    plt.plot(history.history[metrica], label=metrica)
    plt.plot(history.history[chave_validacao], label=chave_validacao)
    plt.legend()
    plt.show()

#### 2C - Abordagem com Deep Learning utilizando rede pré-treinada

In [None]:
# Item 2C
from tqdm import tqdm
from transformers import BertTokenizer, TFBertModel

# Tokenizer and encoder are both loaded from the same pretrained checkpoint.
checkpoint_bert = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_bert)
model = TFBertModel.from_pretrained(checkpoint_bert)

# max_len = 1000

# def embed_text(text):
#     response = tokenizer(text, truncation = True, padding = True, return_tensors='tf')
#     return model(response)[0][:,0,:]

# # Apply the embedding function to the 'text_column' in your DataFrame
# list_embed = [0]*df.shape[0]
# for index, row in tqdm(df[74682:].iterrows()):
#     text = str(row['text'])
#     embeded_text = embed_text(text).numpy()[0]
#     list_embed[index] = embeded_text  

In [None]:
# np.save('bert_array_74682_end', list_embed)

In [None]:
# classificador = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=10000)

# classificador.fit(list_embed[0:int(max_len*0.6)],y_train[0:int(max_len*0.6)])
# y_pred = classificador.predict(list_embed[int(max_len*0.6):max_len])
# acc = accuracy_score(y_pred,y_train[int(max_len*0.6):max_len])
# acc

In [None]:
import numpy as np
# Reassemble the cached BERT embeddings from three .npy files into one list.
# NOTE(review): allow_pickle=True deserialises arbitrary Python objects; only load
# files produced by this notebook.
# Each file is sliced to a single index range. Presumably every file stores a
# full-length list in which only that range was filled (the commented-out embedding
# loop preallocates `[0]*df.shape[0]`) — TODO confirm against how the files were saved.
bert_array_1 = np.load('bert_array_24003.npy', allow_pickle=True)[0:24004]
bert_array_2 = np.load('bert_array_24004_74682.npy', allow_pickle=True)[24004:74682]
bert_array_3 = np.load('bert_array_74682_end.npy', allow_pickle=True)[:1000]
# Concatenate in row order, then convert to a plain list with one entry per row.
bert_array = list(np.concatenate((bert_array_1, bert_array_2,bert_array_3)))

In [None]:
# Logistic regression on top of the cached BERT embeddings.
# Labels are taken in row order from `df`; this assumes bert_array has one entry per
# row of `df`, in the same order.
y = np.array(df['sentiment'])

classificador = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=10000)

# random_state pins the 70/30 split so the reported accuracy is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(bert_array, y, train_size=0.70, random_state=42)

# Time only the fit, as the baseline section does, so the training times are comparable.
start_time = time.time()

classificador.fit(X_train,y_train)

end_time = time.time()

elapsed_time = end_time - start_time

y_pred = classificador.predict(X_test)
# scikit-learn's signature is accuracy_score(y_true, y_pred).
acc = accuracy_score(y_test, y_pred)
print(acc)

print("Time taken by the pipeline: {:.2f} seconds".format(elapsed_time))

In [None]:
# Inspect the first held-out sample (expected to be one embedding vector when the
# cells above were run in order).
X_test[0]

In [None]:
# Time a single-sample prediction of the classifier on a precomputed embedding.
# The embedding step itself is not part of this measurement.
start_time = time.time()

y_pred = classificador.predict([X_test[0]])

end_time = time.time()

elapsed_time = end_time - start_time

print("Time taken by the pipeline: {:.2f} seconds".format(elapsed_time))

#### 2D - Abordagem com rede pré-treinada com mínimo de pós-processamento

In [None]:
# 2D - Rede pré treinada com mínimo de pós processamento
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links with the placeholders '@user' and 'http'.

    The text is split on single spaces; a token starting with '@' and longer
    than one character becomes '@user', a token starting with 'http' becomes
    'http', and everything else is kept. Tokens are re-joined with single
    spaces, so the original spacing is preserved.
    """
    def _mascarar(token):
        # The two checks run in sequence, exactly like two consecutive reassignments.
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_mascarar(pedaco) for pedaco in text.split(" "))

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Download the label mapping: one "<index>\t<label>" row per class.
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
# The response is fully read into `html`, so parsing can happen after the connection is closed.
csvreader = csv.reader(html, delimiter='\t')
# Rows with fewer than two fields (e.g. a trailing empty line) are skipped.
labels = [row[1] for row in csvreader if len(row) > 1]

# TF
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained(MODEL) writes into a local folder named like the hub id. On the next
# run from_pretrained(MODEL) resolves to that folder, so the tokenizer files must be
# stored there as well or AutoTokenizer.from_pretrained(MODEL) fails to load.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)


In [None]:
def roberta_twitter_sentiment(text):
    """Return the most probable sentiment label for `text`.

    The text is first normalised with `preprocess` (mentions -> '@user',
    links -> 'http'), then tokenized, scored by the module-level `model`, and
    the label with the highest softmax score is looked up in `labels`.

    Args:
        text: the raw tweet text.

    Returns:
        The winning entry of `labels`, formatted as a string.
    """
    # The notebook defines preprocess() for exactly this purpose but never applied it.
    text = preprocess(text)
    # Truncate so an unusually long text cannot exceed the encoder's input length.
    encoded_input = tokenizer(text, return_tensors='tf', truncation=True, max_length=512)
    output = model(encoded_input)
    # output[0] holds the logits; [0] selects the single sample in the batch.
    scores = softmax(output[0][0].numpy())
    # argmax returns the first maximum, like list.index(max(...)).
    max_index = int(np.argmax(scores))
    return "{0}".format(labels[max_index])

# Smoke test on a short example; the returned label is shown as the cell output.
roberta_twitter_sentiment('Enjoy!')

In [None]:
max_len = 500  # NOTE(review): not used by this cell
# One predicted label per row of `df`, in row order. Each text is passed through
# str() so non-string values are still accepted.
roberta_list = [
    roberta_twitter_sentiment(str(row['text']))
    for index, row in tqdm(df.iterrows())
]

In [None]:
# Cache the predictions on disk (np.save appends '.npy' to the name) so the slow
# inference loop does not have to be repeated.
np.save('roberta_list', roberta_list)

In [None]:
def change_values(array):
    """Rewrite sentiment labels in place to the vocabulary Positive/Negative/Neutral.

    "Irrelevant" and "neutral" become "Neutral", "positive" becomes "Positive"
    and "negative" becomes "Negative"; any other value is left untouched.

    Args:
        array: mutable, indexable sequence of labels (list or array).

    Returns:
        The same object that was passed in, after modification.
    """
    equivalencias = {
        "Irrelevant": "Neutral",
        "positive": "Positive",
        "negative": "Negative",
        "neutral": "Neutral",
    }
    for posicao, rotulo in enumerate(array):
        # Only matching entries are written back.
        if rotulo in equivalencias:
            array[posicao] = equivalencias[rotulo]
    return array

# Normalise the ground truth itself. Previously only y_test (a copy made by
# train_test_split) was normalised, so `y` still held "Irrelevant" labels that can
# never equal a RoBERTa prediction, deflating any accuracy computed against `y`.
# np.array(...) copies the column, so `df` is not modified.
y = change_values(np.array(df['sentiment']))

# No model is trained in this section; the split is kept so the names it defines stay available.
X_train, X_test, y_train, y_test= train_test_split(roberta_list, y, train_size=0.70)

# y_test is already normalised at this point; change_values is idempotent, so this is a no-op.
y_test_roberta = change_values(y_test)
# Map the model's lowercase labels onto the dataset's capitalised ones.
roberta_list = change_values(roberta_list)

In [None]:
# Accuracy of the RoBERTa predictions over every row (no train/test split is involved).
# NOTE(review): only meaningful if `y` uses the same label vocabulary as roberta_list
# (Positive/Negative/Neutral) — confirm `y` was normalised before this cell.
acc = accuracy_score(roberta_list, y)
acc

### API 