# Tweet Emotions Analysis (12 emotions) <a class="anchor" id="tea"></a>

<a href="https://www.linkedin.com/in/ouassim-adnane/">Ouassim Adnane</a> 08 June 2020

<img src="https://www.feelingfacescards.com/images/feeling_faces_chart_poster.jpg" />

In [None]:
!pip install tweet-preprocessor 2>/dev/null 1>/dev/null

In [None]:
!pip install pyyaml h5py  2>/dev/null 1>/dev/null

In [None]:
import preprocessor as p
import numpy as np 
import pandas as pd 
import emoji
import keras
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
import plotly.graph_objects as go
import plotly.express as px
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from tqdm import tqdm

# Data preparation  <a class="anchor" id="dp"></a>
<a href="#toc"><img src= "https://upload.wikimedia.org/wikipedia/commons/thumb/2/20/Circle-icons-arrow-up.svg/1200px-Circle-icons-arrow-up.svg.png" style="width:20px;height:20px;float:left" >Back to the table of contents</a>

### Misspelled data <a class="anchor" id="dp-md"></a>
<a href="#toc"><img src= "https://upload.wikimedia.org/wikipedia/commons/thumb/2/20/Circle-icons-arrow-up.svg/1200px-Circle-icons-arrow-up.svg.png" style="width:20px;height:20px;float:left" >Back to the table of contents</a>

In [None]:
from itertools import islice

# Load the aspell word list. Each row is parsed as
# "<correction>:<space-separated misspellings>".
misspell_data = pd.read_csv(
    "/kaggle/input/spelling/aspell.txt",
    sep=":",
    names=["correction", "misspell"],
)
misspell_data.misspell = misspell_data.misspell.str.strip()
misspell_data.misspell = misspell_data.misspell.str.split(" ")
# One row per misspelling, so each misspelling can become a dictionary key.
misspell_data = misspell_data.explode("misspell").reset_index(drop=True)
# A misspelling listed under several corrections keeps the first one seen.
misspell_data.drop_duplicates("misspell", inplace=True)
# misspelling -> correction lookup used by misspelled_correction().
miss_corr = dict(zip(misspell_data.misspell, misspell_data.correction))

# Sample of the dict: the first 20 entries (or all of them if there are fewer).
dict(islice(miss_corr.items(), 20))

In [None]:
def misspelled_correction(val):
    """Replace known misspellings in *val* with their corrections.

    Only whole whitespace-delimited tokens are looked up in ``miss_corr``.
    The previous ``str.replace`` approach rewrote every occurrence of a
    misspelling, including inside longer, correctly spelled words.
    The original whitespace between tokens is preserved.

    Args:
        val: Text to correct.

    Returns:
        The text with every token found in ``miss_corr`` swapped for its
        correction; all other tokens are left as they are.
    """
    import re  # local import so the cell does not depend on an extra file-level import

    return re.sub(
        r"\S+",
        lambda match: miss_corr.get(match.group(0), match.group(0)),
        val,
    )



In [None]:
# Quick check: the misspelling 'abouy' on the left should come back as 'about'.
print(misspelled_correction('abouy => about'))

### Contractions <a class="anchor" id="dp-c"></a>
<a href="#toc"><img src= "https://upload.wikimedia.org/wikipedia/commons/thumb/2/20/Circle-icons-arrow-up.svg/1200px-Circle-icons-arrow-up.svg.png" style="width:20px;height:20px;float:left" >Back to the table of contents</a>

In [None]:
# Contraction table; the "Contraction" and "Meaning" columns are paired up
# into a contraction -> expanded-form lookup used by cont_to_meaning().
contractions = pd.read_csv("/kaggle/input/contractions/contractions.csv")
cont_dic = dict(
    zip(contractions["Contraction"], contractions["Meaning"])
)

In [None]:
# Preview the first rows of the contraction table.
print(contractions.head())

In [None]:
def cont_to_meaning(val):
    """Expand known contractions in *val* to their full form.

    Only whole whitespace-delimited tokens are looked up in ``cont_dic``.
    The previous ``str.replace`` approach rewrote every occurrence of a
    contraction, including inside longer tokens that merely contain it.
    The original whitespace between tokens is preserved.

    Args:
        val: Text to expand.

    Returns:
        The text with every token found in ``cont_dic`` replaced by its
        meaning; all other tokens are left as they are.
    """
    import re  # local import so the cell does not depend on an extra file-level import

    return re.sub(
        r"\S+",
        lambda match: cont_dic.get(match.group(0), match.group(0)),
        val,
    )


In [None]:
# Quick check: run the sample through cont_to_meaning so the expansion is
# actually exercised (the cell used to print the literal string unchanged).
print(cont_to_meaning("I'm => I am"))

### Remove URLs and mentions <a class="anchor" id="dp-r"></a>
<a href="#toc"><img src= "https://upload.wikimedia.org/wikipedia/commons/thumb/2/20/Circle-icons-arrow-up.svg/1200px-Circle-icons-arrow-up.svg.png" style="width:20px;height:20px;float:left" >Back to the table of contents</a>

In [None]:
# Configure tweet-preprocessor with only the MENTION and URL options,
# then try p.clean on a sample tweet that also has a hashtag, an emoji and a number.
p.set_options(p.OPT.MENTION, p.OPT.URL)
p.clean("hello guys @alx #sport🔥 1245 https://github.com/s/preprocessor")

### Punctuations and emojis <a class="anchor" id="dp-p"></a>
<a href="#toc"><img src= "https://upload.wikimedia.org/wikipedia/commons/thumb/2/20/Circle-icons-arrow-up.svg/1200px-Circle-icons-arrow-up.svg.png" style="width:20px;height:20px;float:left" >Back to the table of contents</a>

In [None]:
# Characters blanked out by punctuation(). Note that '!' and '?' are not in the set.
_PUNCTUATION_CHARS = "()-[]{};:'\"\\,<>./@#$%^&_~"
# Translation table mapping each of those characters to a single space.
_PUNCTUATION_TABLE = str.maketrans(dict.fromkeys(_PUNCTUATION_CHARS, " "))


def punctuation(val):
    """Replace every listed punctuation character in *val* with a space.

    The character set is the same as before; it is now written with explicit
    escapes (the old literal relied on the invalid escape sequence ``\\,``)
    and applied in one ``str.translate`` pass instead of one ``str.replace``
    call per character.

    Args:
        val: Text to process.

    Returns:
        A string of the same length with each listed punctuation character
        replaced by ``" "``. Other characters, including ``!`` and ``?``,
        are unchanged.
    """
    return val.translate(_PUNCTUATION_TABLE)


In [None]:
# Quick check: '@' and '#' are blanked, while '?' and '!' are not in the
# punctuation set and stay in place.
punctuation("test @ #ldfldlf??? !! ")

In [None]:
def clean_text(val):
    """Run the full tweet-cleaning pipeline on *val* and return the result.

    Steps, in order: spelling correction, contraction expansion,
    tweet-preprocessor cleaning (``p.clean``), emoji-to-text conversion,
    punctuation blanking, and collapsing of whitespace runs to single spaces.
    """
    corrected = misspelled_correction(val)
    expanded = cont_to_meaning(corrected)
    stripped = p.clean(expanded)
    # Emojis are converted to text before the punctuation pass runs over them.
    spaced = punctuation(emoji.demojize(stripped))
    # split()/join drops leading/trailing blanks and squeezes inner runs.
    return ' '.join(spaced.split())

In [None]:
# End-to-end check of the cleaning pipeline on a sample containing a
# contraction, an emoji, a mention and stray punctuation.
clean_text("isn't 💡 adultry @ttt good bad ... ! ? ")

# Modeling  <a class="anchor" id="m"></a>
<a href="#toc"><img src= "https://upload.wikimedia.org/wikipedia/commons/thumb/2/20/Circle-icons-arrow-up.svg/1200px-Circle-icons-arrow-up.svg.png" style="width:20px;height:20px;float:left" >Back to the table of contents</a>

### Encoding the data and train test split <a class="anchor" id="m-ed"></a>
<a href="#toc"><img src= "https://upload.wikimedia.org/wikipedia/commons/thumb/2/20/Circle-icons-arrow-up.svg/1200px-Circle-icons-arrow-up.svg.png" style="width:20px;height:20px;float:left" >Back to the table of contents</a>

In [None]:
# Walk through the two encoding steps on a toy label list.
test_list = [1,7,10,7]

print("original list", test_list)

# Step 1: map each distinct label to an integer code.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(test_list)

print("after label encoder fit_transform", integer_encoded)

# Step 2: one-hot encode the integer codes. The encoder's default sparse
# output is densified with .toarray() instead of passing `sparse=False`,
# a keyword that newer scikit-learn releases no longer accept.
onehot_encoder = OneHotEncoder()
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
Y = onehot_encoder.fit_transform(integer_encoded).toarray()

print("After onehot encoder fit_transform\n", Y)


In [None]:
def get_Y(data):
    """One-hot encode a 1-D collection of class labels.

    Produces the same matrix as a label encoding followed by a dense one-hot
    encoding: the distinct labels are sorted, and row ``i`` has a 1.0 in the
    column of ``data[i]``'s label and 0.0 elsewhere. It is implemented with
    NumPy so it does not depend on the ``sparse=False`` keyword of
    ``OneHotEncoder``, which newer scikit-learn releases no longer accept.

    The columns are derived from the labels present in *data*, so two calls
    only produce comparable columns when both inputs contain the same set of
    labels.

    Args:
        data: 1-D array-like of labels (list, NumPy array, pandas Series, ...).

    Returns:
        A float64 array of shape ``(len(data), number_of_distinct_labels)``.
    """
    labels = np.asarray(data).ravel()
    # `codes` holds, for every input label, its position among the sorted classes.
    classes, codes = np.unique(labels, return_inverse=True)
    # Row k of the identity matrix is the one-hot vector for class k.
    return np.eye(len(classes))[codes]

In [None]:
# Load the training split: texts and labels live in separate files that are
# read with a newline separator into single-column frames.
# NOTE(review): recent pandas releases may reject '\n' as a separator - confirm
# against the pandas version this notebook is run with.
train_text = pd.read_csv('/kaggle/input/tweet-emotion/emotion/train_text.txt', sep='\n', names=['text'])
train_label = pd.read_csv('/kaggle/input/tweet-emotion/emotion/train_labels.txt', sep='\n', names=['label'])
# Texts and labels are joined by row index.
# NOTE(review): this assumes both files parse to the same number of rows - confirm.
train = pd.DataFrame()
train['text'] = train_text.text
train['label'] = train_label.label

# Same procedure for the test split.
test_text = pd.read_csv('/kaggle/input/tweet-emotion/emotion/test_text.txt', sep='\n', names=['text'])
test_label = pd.read_csv('/kaggle/input/tweet-emotion/emotion/test_labels.txt', sep='\n', names=['label'])
test = pd.DataFrame()
test['text'] = test_text.text
test['label'] = test_label.label

# Run every tweet through the cleaning pipeline.
train.text = train.text.apply(clean_text)
test.text = test.text.apply(clean_text)

# Drop the rows labelled 2 (the "optimism" class, per the original note).
train = train[train.label != 2]
test = test[test.label != 2]


# Model inputs are the cleaned texts; targets are one-hot label matrices.
# get_Y is applied to each split separately, so the columns only line up if
# both splits contain the same set of labels.
X_train = train.text
X_test = test.text
y_train = get_Y(train.label)
y_test = get_Y(test.label)



# Settings used further down: tokenized sequence length, number of training
# epochs, and batch size.
max_len = 252
Epoch = 15
batch_size = 32

In [None]:
# Dump the cleaned texts and the one-hot label matrices for a visual sanity check.
print(X_train, X_test, y_train, y_test)

### Roberta Base Model <a class="anchor" id="m-rb"></a>
<a href="#toc"><img src= "https://upload.wikimedia.org/wikipedia/commons/thumb/2/20/Circle-icons-arrow-up.svg/1200px-Circle-icons-arrow-up.svg.png" style="width:20px;height:20px;float:left" >Back to the table of contents</a>

In [None]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenize *texts* and return their input ids as a NumPy array.

    Args:
        texts: Batch of strings handed to ``tokenizer.batch_encode_plus``.
        tokenizer: Object exposing ``batch_encode_plus`` (here a Hugging Face
            tokenizer).
        maxlen: Value passed as ``max_length``; padding to that length is
            requested as well.

    Returns:
        ``np.array`` built from the ``'input_ids'`` entry of the encoding.
    """
    # Only the input ids are needed, so the optional outputs are switched off.
    encode_options = {
        "return_attention_masks": False,
        "return_token_type_ids": False,
        "pad_to_max_length": True,
        "max_length": maxlen,
    }
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    input_ids = encoded['input_ids']
    return np.array(input_ids)

def build_model(transformer, max_len=160, n_classes=3):
    """Build and compile a softmax classifier on top of a transformer encoder.

    Args:
        transformer: Callable Keras transformer whose first output is the
            per-token sequence output.
        max_len: Length of the input token-id sequences.
        n_classes: Number of output classes (width of the softmax layer).
            Defaults to 3, the value that used to be hard-coded.

    Returns:
        A compiled ``Model`` mapping ``(batch, max_len)`` int32 token ids to
        ``(batch, n_classes)`` class probabilities, trained with categorical
        cross-entropy and reporting accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # The vector at position 0 is used as the summary of the whole sequence.
    cls_token = sequence_output[:, 0, :]
    out = Dense(n_classes, activation='softmax')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(
        Adam(learning_rate=1e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
# Passed to Dataset.prefetch() below so tf.data chooses the buffer size itself.
AUTO = tf.data.experimental.AUTOTUNE
# Pretrained checkpoint name used for both the tokenizer and the transformer.
MODEL = 'roberta-base'

In [None]:
# Load the pretrained tokenizer for the MODEL checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
# Convert the cleaned tweets of both splits to token-id arrays, using max_len
# as the target sequence length.
X_train_t = regular_encode(X_train, tokenizer, maxlen=max_len)
X_test_t = regular_encode(X_test, tokenizer, maxlen=max_len)

In [None]:
# Training pipeline. Shuffling happens BEFORE repeat() and uses a buffer that
# covers the whole training set, so every epoch is a full, freshly shuffled
# pass. The previous repeat()-then-shuffle(1995) order let samples from
# adjacent epochs mix, with a hard-coded buffer not tied to the dataset size.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_train_t, y_train))
    .shuffle(len(X_train_t))
    .repeat()
    .batch(batch_size)
    .prefetch(AUTO)
)

# Validation pipeline: fixed order, batched once and cached for reuse across epochs.
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_test_t, y_test))
    .batch(batch_size)
    .cache()
    .prefetch(AUTO)
)


In [None]:
# Load the pretrained transformer and wrap it in the classification head.
transformer_layer = TFAutoModel.from_pretrained(MODEL)
model_roberta_base = build_model(transformer_layer, max_len=max_len)
# summary() prints the architecture and returns None, so its result is not
# stored; `history` is assigned by the fit() call further down.
model_roberta_base.summary()

In [None]:
import os

checkpoint_path = "training/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
# Create the checkpoint directory if needed; exist_ok avoids the separate
# existence check before creating it.
os.makedirs(checkpoint_dir, exist_ok=True)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)
# Show what is currently stored in the checkpoint directory.
os.listdir(checkpoint_dir)

In [None]:
# The training dataset repeats indefinitely, so the number of batches that
# make up one epoch is stated explicitly.
n_steps = len(X_train) // batch_size
history = model_roberta_base.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=Epoch,
    callbacks=[cp_callback],
)

In [None]:
# Metrics recorded by fit(), keyed by metric name.
print(history.history)

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch.
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.title('Loss')
plt.xlabel('Epoch')  # axis label previously misspelled as 'Epoce'
plt.ylabel('loss')

In [None]:
# Training vs. validation accuracy per epoch.
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.legend()
plt.title('Accuracy')
plt.xlabel('Epoch')  # axis label previously misspelled as 'Epoce'
plt.ylabel('acc')

### Test Roberta Model Results <a class="anchor" id="m-rbr"></a>
<a href="#toc"><img src= "https://upload.wikimedia.org/wikipedia/commons/thumb/2/20/Circle-icons-arrow-up.svg/1200px-Circle-icons-arrow-up.svg.png" style="width:20px;height:20px;float:left" >Back to the table of contents</a>

In [None]:
# Emotion name -> output column of the classifier.
sent_to_id = {'anger': 0, 'joy': 1, 'sadness': 2}


def get_sentiment2(model, text):
    """Predict the emotion distribution of a single piece of text.

    The text is cleaned with ``clean_text``, tokenized with the module-level
    ``tokenizer`` at ``max_len``, and passed through *model*.

    Args:
        model: Model whose ``predict`` returns one row of class probabilities
            per input, with columns matching the ids in ``sent_to_id``.
        text: Raw text to classify.

    Returns:
        A DataFrame with columns ``"sentiment"`` and ``"percentage"`` (the
        probability times 100, rounded to a whole number). Emotions whose
        rounded percentage is 0 are left out.
    """
    cleaned = clean_text(text)
    token_ids = regular_encode([cleaned], tokenizer, maxlen=max_len)
    single_batch = tf.data.Dataset.from_tensor_slices(token_ids).batch(1)
    probabilities = model.predict(single_batch, verbose=0)[0]
    percentages = np.round(np.asarray(probabilities, dtype=float) * 100, 0)

    # Order the names by class id so entry i lines up with output column i,
    # instead of relying on the insertion order of the dict.
    labels = sorted(sent_to_id, key=sent_to_id.get)
    result = pd.DataFrame({"sentiment": labels, "percentage": percentages})
    return result[result.percentage != 0]

def plot_result(result):
    """Print *result* to standard output (despite the name, nothing is plotted)."""
    print(result)

In [None]:
# Try the trained model on three hand-written samples and print each
# prediction table.
# NOTE(review): the 'ðŸ˜' sequence in the first sample looks like a mis-decoded
# emoji - confirm and restore the intended character.
result =get_sentiment2(model_roberta_base,"Had an absolutely brilliant day ðŸ˜ loved seeing an old friend and reminiscing")
plot_result(result)
result =get_sentiment2(model_roberta_base,"The pain my heart feels is just too much for it to bear. Nothing eases this pain. I can’t hold myself back. I really miss you")
plot_result(result)
result =get_sentiment2(model_roberta_base,"I hate this game so much,It make me angry all the time ")
plot_result(result)