In [None]:
# nltk imports
from nltk.tokenize import word_tokenize  # tokenize the text == the text is splitted into words in list
from nltk.corpus import stopwords  # this contain common stop words that has no effect in analysis
from nltk.stem import WordNetLemmatizer  # Lemmatization is the process of grouping together the different inflected forms of a word so they can be analyzed as a single item

# sklearn imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  # bags of words and TF IDF
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix, make_scorer  # classification Metrics
from sklearn.naive_bayes import MultinomialNB  # Multiclassification
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm
from sklearn.model_selection import StratifiedKFold  # For stratified splitting (helpful in imbalanced data)
from sklearn.preprocessing import LabelBinarizer  # for Categorical features
from sklearn.model_selection import GridSearchCV  # for tuning parameters
from sklearn.model_selection import train_test_split  # splitting dataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn import pipeline
from sklearn import linear_model

# gensim imports
from gensim.models import KeyedVectors  # to save and load vectors
import string
import re
import pandas as pd
import numpy as np
import xgboost as xgb
import catboost as cbt

# tensorflow and keras
import keras
from keras import backend as K
from tensorflow.keras.layers import Embedding
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence, text
from keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.utils import np_utils
from keras.callbacks import EarlyStopping , ReduceLROnPlateau
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

# Hugging Face Transformers
from transformers import (pipeline , BertTokenizer,
                          TFBertForSequenceClassification,
                          InputExample, InputFeatures , 
                         AutoTokenizer, TFAutoModelForSequenceClassification,
                         TFRobertaModel, TFGPT2Model, RobertaTokenizer, GPT2Tokenizer)


In [None]:
# Read the tweet/emotion CSV from the Kaggle input directory and preview the first rows.
DATA_PATH = '/kaggle/input/tweets-new-data/tweet_and_emotion.csv'
df = pd.read_csv(DATA_PATH)

df.head()

In [None]:
# NOTE(review): an earlier revision dropped a 'tweet_id' column before this rename;
# presumably the CSV now has exactly two columns (text, label) - confirm, because the
# positional assignment below raises if the column count is not 2.
# Rename the columns positionally to the names used by the rest of the notebook.
df.columns = [ 'Sentence','Sentiment']

In [None]:
# Print the frame summary (column dtypes, non-null counts, memory usage) to spot missing values.
df.info()

In [None]:
# Bar chart of how many rows carry each Sentiment label (class balance check).
sentiment_counts = df['Sentiment'].value_counts()
sentiment_counts.plot(kind='bar')

In [None]:
# Keep only the five target emotions, encode them as integer class ids,
# and strip mentions / links from the tweet text.

dicto = {'anger': 0,'fear': 1, 'joy':2, 'sadness':3, 'neutral':4}

# Take an explicit copy of the filtered rows: assigning columns on a boolean-indexed
# slice triggers SettingWithCopyWarning and may not write through reliably.
df = df[df['Sentiment'].isin(dicto.keys())].copy()

# Bracket assignment is used instead of attribute assignment so the column write is unambiguous.
df['Sentiment'] = df['Sentiment'].map(dicto)

# Remove @mentions, http(s) links and www. links; the pattern is compiled once and reused per row.
_noise_pattern = re.compile(r'(@\S+)|(http\S+)|(www\.\S+)')
df['Sentence'] = df['Sentence'].apply(lambda x: _noise_pattern.sub('', x))

In [None]:
# Load the pretrained GoogleNews word2vec vectors (binary format) and expose them
# as a plain {word: vector} dictionary for fast lookups.
W2V_PATH = "../input/googles-trained-word2vec-model-in-python/GoogleNews-vectors-negative300.bin"
word2vec_pretrained = KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)
word2vec_pretrained_dict = {
    w2v_word: w2v_vector
    for w2v_word, w2v_vector in zip(word2vec_pretrained.key_to_index,
                                    word2vec_pretrained.vectors)
}

In [None]:
# Peek at a single vector to confirm the embedding dimensionality.
# next(iter(...)) reads the first value only, instead of copying every vector
# reference into a temporary list just to index element 0.
next(iter(word2vec_pretrained_dict.values())).shape

In [None]:

# NOTE(review): `process_text` is not defined in any visible cell - confirm it is
# defined before this cell, otherwise a fresh Restart & Run All fails here.
df['Sentence'] = df['Sentence'].apply(process_text)

# Stratified, shuffled 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df.Sentence,
    df.Sentiment,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=df.Sentiment,
)

# One-hot encode the integer labels into 5 columns for categorical cross-entropy.
y_train_enc, y_test_enc = (
    np_utils.to_categorical(labels, 5) for labels in (y_train, y_test)
)

In [None]:
# Word-level tokenizer; num_words=None keeps the full vocabulary.
token = tf.keras.preprocessing.text.Tokenizer(num_words=None)

# NOTE(review): the vocabulary is fitted on train + test text. No labels are involved,
# but confirm this is intended.
token.fit_on_texts(list(X_train) + list(X_test))
xtrain_seq = token.texts_to_sequences(X_train)  # each sentence -> list of word ids
xtest_seq = token.texts_to_sequences(X_test)

# Pad both splits to ONE shared length. Without an explicit maxlen each call pads to the
# longest sequence of its own split, so train and test arrays end up with different
# widths and different amounts of trailing zero padding.
max_len = max(len(seq) for seq in xtrain_seq + xtest_seq)
xtrain_pad = pad_sequences(xtrain_seq, maxlen=max_len, padding='post')
xtest_pad = pad_sequences(xtest_seq, maxlen=max_len, padding='post')

word_index = token.word_index  # {word: integer id}; id 0 is reserved for padding


In [None]:
# Build the embedding lookup table for the dataset vocabulary: row i holds the
# pretrained vector of the word whose tokenizer id is i. Rows for words missing
# from word2vec (and row 0) stay all-zero.

vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, 300))
for word, i in word_index.items():
    embedding_vector = word2vec_pretrained_dict.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector
        

In [None]:
# Build Custom Metrics (F1-Score)


def recall_m(y_true, y_pred):
    """Recall over everything passed in: true positives / actual positives.

    Both inputs are clipped to [0, 1] and rounded, so a prediction only counts
    as positive once it rounds to 1. Counts are summed over all elements
    (no per-class axis), and K.epsilon() guards against division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over everything passed in: true positives / predicted positives.

    Both inputs are clipped to [0, 1] and rounded, so a prediction only counts
    as positive once it rounds to 1. Counts are summed over all elements
    (no per-class axis), and K.epsilon() guards against division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m_():
    """Return a metric function named ``f1_m`` computing F1 from precision_m and recall_m."""
    def f1_m(y_true, y_pred):
        # Harmonic mean of precision and recall; epsilon avoids 0/0 when both are zero.
        p = precision_m(y_true, y_pred)
        r = recall_m(y_true, y_pred)
        ratio = (p * r) / (p + r + K.epsilon())
        return 2 * ratio
    return f1_m


In [None]:
# Bidirectional LSTM classifier on top of frozen pretrained word2vec embeddings.
model_bi = Sequential()
# Embedding rows come from embedding_matrix and are not updated during training.
model_bi.add(Embedding(len(word_index)+1, 300, weights=[embedding_matrix], trainable = False))

model_bi.add(SpatialDropout1D(0.3))
model_bi.add(Bidirectional(LSTM(300, dropout = 0.3, recurrent_dropout = 0.3)))

model_bi.add(Dense(1024, activation = 'relu'))
model_bi.add(Dropout(0.8))

model_bi.add(Dense(1024, activation = 'relu'))
model_bi.add(Dropout(0.8))

# 5 output units, one per emotion class, turned into probabilities by softmax.
model_bi.add(Dense(5))
model_bi.add(Activation('softmax'))
model_bi.summary()

# `metrics` is passed as a list, the documented form for compile().
model_bi.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = [f1_m_()])

# restore_best_weights=True rolls the model back to the epoch with the lowest val_loss;
# otherwise the final weights are the ones from `patience` epochs after the best epoch.
earlystop = EarlyStopping(monitor = 'val_loss', min_delta = 0, patience = 15, verbose = 1,
                          mode = 'auto', restore_best_weights = True)

# Early stopping monitors a slice held out from the TRAINING data (validation_split),
# not the test set: the test set is used for the final report, so selecting the stopping
# epoch on it would leak into the reported scores.
history = model_bi.fit(xtrain_pad, y=y_train_enc, batch_size = 128, epochs = 100, verbose=1,
                       validation_split = 0.1, callbacks = [earlystop])





In [None]:
# Predict class probabilities for the test set, then compare the arg-max class ids
# against the one-hot ground truth in a per-class precision/recall/F1 report.
y_pred = model_bi.predict(xtest_pad)
true_class_ids = np.argmax(y_test_enc, axis=1)
pred_class_ids = np.argmax(y_pred, axis=1)
print(classification_report(true_class_ids, pred_class_ids))