# **Installing and Importing Libraries**

## **Installing Necessary Packages**
This part may differ for each user, depending on their environment.

In [None]:
pip install gensim

In [None]:
!pip install tensorflow

In [None]:
!pip install beautifulsoup4 

In [None]:
!pip install emoji

In [None]:
!pip install transformers

In [None]:
!pip install scikit-multilearn

In [None]:
!pip install bert-tensorflow

## **Importing Libraries**
All the libraries that have been utilized in this work are imported in this block of code.

In [None]:
import os
import re
import string
import json
import emoji
import numpy as np
import pandas as pd
import nltk
import transformers
import torch
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
#import bert
import tensorflow as tf

from numpy import array
from numpy import asarray
from numpy import zeros

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from yellowbrick.text import FreqDistVisualizer

from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM, GlobalMaxPooling1D, Input
from keras.models import Model
from keras.layers.embeddings import Embedding
from keras.layers.merge import Concatenate
from keras.utils.vis_utils import plot_model

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, hamming_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from skmultilearn.problem_transform import LabelPowerset

from bs4 import BeautifulSoup

from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModel, AdamW

from wordcloud import WordCloud

from IPython.display import Image

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", None)

# **Data Preprocessing**

In [None]:
# The three splits share one layout: tab-separated, no header row, three columns.
_tsv_layout = dict(sep='\t', header=None, names=['Text', 'Class', 'ID'])
df_train = pd.read_csv("../input/d/sarfrazahmad307/emotions/train.tsv", **_tsv_layout)
df_dev = pd.read_csv("../input/d/sarfrazahmad307/emotions/dev.tsv", **_tsv_layout)
df_test = pd.read_csv("../input/d/sarfrazahmad307/emotions/test.tsv", **_tsv_layout)

In [None]:
df_train.head()

In [None]:
print(df_train.shape)
print(df_dev.shape)
print(df_test.shape)

In [None]:
# For every split: turn the comma-separated 'Class' string into a list of
# label ids and record how many labels each sample carries.
for split_df in (df_train, df_dev, df_test):
    split_df['List of classes'] = split_df['Class'].apply(lambda labels: labels.split(','))
    split_df['Len of classes'] = split_df['List of classes'].apply(len)

In [None]:
df_train.head()

In [None]:
with open('../input/d/sarfrazahmad307/emotions/ekman_mapping.json') as file:
    ekman_mapping = json.load(file)
    
ekman_mapping

In [None]:
# Read the emotion names, one per line. The position of a name in the list is
# used as its class index by idx2class. The context manager makes sure the
# file handle is closed once the content has been read.
with open("../input/d/sarfrazahmad307/emotions/emotions.txt", "r") as emotion_file:
    emotion_list = emotion_file.read().split("\n")
print(emotion_list)

In [None]:
def idx2class(idx_list):
    """Translate class indices into emotion names.

    Every entry of ``idx_list`` is converted with ``int`` and used as a
    position in the module-level ``emotion_list``.

    Returns a new list of names in the same order as ``idx_list``.
    """
    return [emotion_list[int(idx)] for idx in idx_list]

In [None]:
# Attach the emotion names that correspond to each sample's class indices.
for split_df in (df_train, df_dev, df_test):
    split_df['Emotions'] = split_df['List of classes'].apply(idx2class)

In [None]:
df_train.head()

In [None]:
def EmotionMapping(emotion_list):
    """Map fine-grained emotion names onto Ekman categories.

    For each name in ``emotion_list`` (in order), every Ekman category whose
    entry in the module-level ``ekman_mapping`` contains the name is appended
    to the result; the name ``'neutral'`` is passed through unchanged.
    Duplicates are kept, so two names from the same category yield that
    category twice.

    Returns the list of mapped category names.
    """
    ekman_groups = ('anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise')
    mapped = []

    for emotion in emotion_list:
        mapped.extend(group for group in ekman_groups if emotion in ekman_mapping[group])
        if emotion == 'neutral':
            mapped.append('neutral')

    return mapped

In [None]:
# Attach the Ekman-level categories derived from each sample's emotion names.
for split_df in (df_train, df_dev, df_test):
    split_df['Mapped Emotions'] = split_df['Emotions'].apply(EmotionMapping)

In [None]:
df_train.head()

In [None]:
# Create one zero-filled float column per Ekman category (plus 'neutral') on
# every split. A 1-D array of the frame's length is used: a single DataFrame
# column holds one value per row, so the former (n, 1) 2-D shape was unnecessary.
for split_df in (df_train, df_dev, df_test):
    for emotion_col in ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']:
        split_df[emotion_col] = np.zeros(len(split_df))

In [None]:
# Set each category column to 1 when the category appears in the sample's
# 'Mapped Emotions' list and to 0 otherwise.
for emotion_col in ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise','neutral']:
    for split_df in (df_train, df_dev, df_test):
        split_df[emotion_col] = split_df['Mapped Emotions'].apply(
            lambda mapped: int(emotion_col in mapped)
        )

In [None]:
df_train.head()

In [None]:
df_dev.head()

In [None]:
df_test.head()

In [None]:
print(df_train.shape)
print(df_dev.shape)
print(df_test.shape)

In [None]:
# Remove, in place, every sample flagged 'neutral' and then every remaining
# sample flagged 'disgust', from all three splits.
for unwanted in ('neutral', 'disgust'):
    for split_df in (df_train, df_dev, df_test):
        flagged_rows = split_df[split_df[unwanted] == 1].index
        split_df.drop(flagged_rows, inplace=True)

In [None]:
print(df_train.shape)
print(df_dev.shape)
print(df_test.shape)

In [None]:
# Intermediate label columns and the two discarded categories are no longer
# needed; drop them in place from every split.
helper_columns = ['Class', 'List of classes', 'disgust', 'neutral', 'Len of classes', 'Emotions', 'Mapped Emotions']
for split_df in (df_train, df_dev, df_test):
    split_df.drop(columns=helper_columns, inplace=True)

In [None]:
df_train.head()

In [None]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", 
                       "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", 
                       "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", 
                       "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am",
                       "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", 
                       "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
                       "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not",
                       "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", 
                       "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                       "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", 
                       "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
                       "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is",
                       "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would",
                       "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have",
                       "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
                       "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", 
                       "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did",
                       "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", 
                       "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", 
                       "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                       "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have",
                       "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'u.s':'america', 'e.g':'for example'}

punct = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-",
                 "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 
                 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', '!':' '}

# Misspelling / spelling-variant -> replacement table used by correct_spelling,
# which applies each entry with a plain, case-sensitive str.replace.
# NOTE(review): in text_preprocessing_pipeline this table is applied after
# clean_text has lowercased the text and removed every word containing a digit,
# so keys with upper-case letters ('Qoura', 'Whta', 'doI', 'theBest',
# 'Etherium') or digits ('2k17', '2k18') cannot match there.
mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater',
                'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ',
                'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can',
                'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 
                'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 
                'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 
                'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization',
                'demonetisation': 'demonetization'}

In [None]:
def clean_text(text):
    '''Normalise raw text before contraction and special-character handling.

    Steps, in order: coerce to ``str``; convert emoji to ``:name:`` tokens and
    drop those tokens; lowercase; remove text in square brackets; strip HTML;
    remove links; turn newlines into spaces; remove words containing digits;
    replace every run of characters other than ASCII letters and ``? . ! , ¿ '``
    with a single space.

    The punctuation marks listed above are deliberately kept; this function
    does not remove punctuation.

    Returns the cleaned string.
    '''
    text = emoji.demojize(str(text))
    # Only drop colon-delimited tokens without whitespace (the ':name:' form
    # produced by demojize). The previous pattern ':(.*?):' also deleted any
    # ordinary text that happened to sit between two colons.
    text = re.sub(r':[^\s:]+:', '', text)
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # The next 2 statements remove HTML markup and links.
    text = BeautifulSoup(text, 'lxml').get_text()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Replace newlines with a space so the words on either side are not glued
    # together; repeated spaces are collapsed by the final substitution.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Replace everything except (a-z, A-Z, ".", "?", "!", ",", "¿", "'") with a space.
    text = re.sub(r"[^a-zA-Z?.!,¿']+", " ", text)
    return text

def clean_contractions(text, mapping):
    '''Expand contractions found in ``mapping`` and delete ASCII punctuation.

    ``mapping`` maps a contraction to its expansion; entries are applied as
    plain substring replacements in the mapping's iteration order.

    Returns the processed string with runs of spaces collapsed.
    '''
    # Normalise typographic apostrophes/accents so they match the mapping keys.
    for quote_like in ("’", "‘", "´", "`"):
        text = text.replace(quote_like, "'")

    for contraction, expansion in mapping.items():
        text = text.replace(contraction, expansion)

    # Delete every character listed in string.punctuation.
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    text = re.sub(punctuation_class, '', text)

    # Surround sentence marks with spaces, e.g. "he is a boy." => "he is a boy ."
    # NOTE(review): "?", ".", "!" and "," were already deleted by the step
    # above, so only "¿" can still match here - confirm this is intended.
    text = re.sub(r"([?.!,¿])", r" \1 ", text)

    # Collapse repeated spaces into one.
    text = re.sub(r'[" "]+', " ", text)
    return text

def clean_special_chars(text, punct, mapping):
    '''Normalise special characters in ``text``.

    First every key of ``mapping`` is replaced by its value, then every
    character in ``punct`` is surrounded by single spaces, and finally a few
    invisible or leftover sequences are replaced.

    Returns the processed string.
    '''
    for symbol, replacement in mapping.items():
        text = text.replace(symbol, replacement)

    for mark in punct:
        text = text.replace(mark, ' ' + mark + ' ')

    leftovers = (('\u200b', ' '), ('…', ' ... '), ('\ufeff', ''), ('करना', ''), ('है', ''))
    for sequence, replacement in leftovers:
        text = text.replace(sequence, replacement)

    return text

def correct_spelling(x, dic):
    '''Replace every occurrence of each key of ``dic`` in ``x`` by its value.

    Replacements are plain, case-sensitive substring replacements applied in
    the dictionary's iteration order. Returns the corrected string.
    '''
    for wrong, right in dic.items():
        x = x.replace(wrong, right)
    return x

def remove_space(text):
    '''Trim the text and collapse every run of whitespace into one space.'''
    # split() without arguments already ignores leading/trailing whitespace.
    return " ".join(text.split())

def text_preprocessing_pipeline(text):
    '''Run the full text-cleaning chain on one string.

    Order: clean_text -> clean_contractions -> clean_special_chars ->
    correct_spelling -> remove_space, using the module-level lookup tables
    ``contraction_mapping``, ``punct``, ``punct_mapping`` and ``mispell_dict``.

    Returns the cleaned string.
    '''
    cleaned = clean_text(text)
    expanded = clean_contractions(cleaned, contraction_mapping)
    spaced = clean_special_chars(expanded, punct, punct_mapping)
    return remove_space(correct_spelling(spaced, mispell_dict))

In [None]:
df_train['Text'] = df_train['Text'].apply(text_preprocessing_pipeline)
df_dev['Text'] = df_dev['Text'].apply(text_preprocessing_pipeline)
df_test['Text'] = df_test['Text'].apply(text_preprocessing_pipeline)

In [None]:
df_train.reset_index(drop=True).to_csv("train.csv", index=False)
df_dev.reset_index(drop=True).to_csv("val.csv", index=False)
df_test.reset_index(drop=True).to_csv("test.csv", index=False)

In [None]:
df_train = df_train.reset_index(drop=True)
df_dev = df_dev.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
df_train.head()

In [None]:
print(df_train.shape)
print(df_dev.shape)
print(df_test.shape)

In [None]:
e = ['anger', 'fear', 'joy', 'sadness', 'surprise']
max(df_train[e].sum(axis=1))  # The highest number of labels for a sample

In [None]:
# Distribution of the number of labels carried by each training sample.
temp_df = df_train[e].sum(axis=1).value_counts().sort_index()

trace1 = go.Bar(
                # Take the categories from the data instead of hard-coding
                # '1'..'5': the series only contains the label counts that
                # actually occur, so x and y always have the same length.
                x = [str(int(n_labels)) for n_labels in temp_df.index],
                y = temp_df.tolist(),
                marker = dict(color = 'rgb(250,13,92)',
                              line=dict(color='rgb(0,0,0)',width=1.5)),
                text=temp_df.tolist(), textposition='outside',
                # One scalar width applies to every bar, however many there are.
                width=0.5)
layout = go.Layout(template= "plotly_dark",title = 'Number of classes' , xaxis = dict(title = 'Class Numbers'), yaxis = dict(title = 'Count'))
fig = go.Figure(data = [trace1], layout = layout)
fig.show()

In [None]:
temp_list = df_train.drop(['Text', 'ID'], axis=1).sum(axis=0).tolist()
trace1 = go.Bar(
                x = e,
                y = temp_list,
                marker = dict(color = 'rgb(127, 16, 238)',
                              line=dict(color='rgb(0,0,0)',width=1.5)),
                text=temp_list, textposition='outside')
layout = go.Layout(template= "plotly_dark",title = 'NUMBER OF EKMAN-EMOTIONS' , xaxis = dict(title = 'Emotion'), yaxis = dict(title = 'Count'))
fig = go.Figure(data = [trace1], layout = layout)
fig.show()

In [None]:
df_train.head()

In [None]:
len(df_train[df_train['surprise']==1])

# **BERT**

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

In [None]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 200
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 64
TEST_BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 2e-5
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [None]:
target_cols = [col for col in df_train.columns if col not in ['Text', 'ID']]
target_cols

In [None]:
class BERTDataset(Dataset):
    """Dataset that tokenizes one text per item and returns it with its labels.

    Parameters
    ----------
    df : DataFrame with a ``Text`` column and one column per label.
    tokenizer : object exposing ``encode_plus`` (as called in ``__getitem__``).
    max_len : length every sequence is truncated/padded to.
    label_cols : optional list of label column names; when omitted the
        module-level ``target_cols`` is used.
    """
    def __init__(self, df, tokenizer, max_len, label_cols=None):
        self.df = df
        self.max_len = max_len
        self.text = df.Text
        self.tokenizer = tokenizer
        cols = target_cols if label_cols is None else label_cols
        self.targets = df[cols].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Positional access: DataLoader indices run 0..len-1 and self.targets is
        # a positional array, so the text must be looked up by position too.
        # A label-based lookup breaks (or pairs the wrong text with the
        # targets) when the frame's index is not 0..n-1.
        text = self.text.iloc[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
train_dataset = BERTDataset(df_train, tokenizer, MAX_LEN)
valid_dataset = BERTDataset(df_dev, tokenizer, MAX_LEN)
test_dataset = BERTDataset(df_test, tokenizer, MAX_LEN)

In [None]:
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, 
                          num_workers=4, shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, 
                          num_workers=4, shuffle=False, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, 
                          num_workers=4, shuffle=False, pin_memory=True)

In [None]:
# Customized model: a single linear classification layer on top of the
# pretrained 'roberta-base' encoder produces the final output.

class BERTClass(torch.nn.Module):
    """RoBERTa encoder followed by one linear layer that emits a logit per label.

    Parameters
    ----------
    n_classes : number of output logits (defaults to 5).
    """
    def __init__(self, n_classes=5):
        super(BERTClass, self).__init__()
        self.roberta = AutoModel.from_pretrained('roberta-base')
        # Size the head from the encoder's own configuration instead of
        # hard-coding the hidden width.
        self.fc = torch.nn.Linear(self.roberta.config.hidden_size, n_classes)

    def forward(self, ids, mask, token_type_ids):
        """Return raw (un-activated) logits for a batch of token ids."""
        # With return_dict=False the encoder returns a tuple; the first element
        # is ignored and the second is fed to the classification layer.
        _, features = self.roberta(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output = self.fc(features)
        return output

model = BERTClass()
model.to(device);

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy computed on raw logits (``BCEWithLogitsLoss``)."""
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [None]:
optimizer = AdamW(params =  model.parameters(), lr=LEARNING_RATE, weight_decay=1e-6)

In [None]:
def train(epoch):
    """Run one pass over ``train_loader`` and update ``model`` with ``optimizer``.

    ``epoch`` is only used in the progress message, which is printed for every
    500th batch (starting with the first).
    """
    model.train()
    for step, batch in enumerate(train_loader):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        loss = loss_fn(model(ids, mask, token_type_ids), targets)
        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Back-propagate, apply the update, then clear gradients for the next batch.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [None]:
for epoch in range(EPOCHS):
    train(epoch)

In [None]:
def validation(loader=None):
    """Run ``model`` in evaluation mode over a data loader.

    Parameters
    ----------
    loader : DataLoader yielding the dict batches produced by ``BERTDataset``.
        Defaults to the module-level ``valid_loader``, so existing calls
        without arguments keep evaluating the validation split; pass another
        loader (e.g. the test loader) to evaluate it with the same code.

    Returns
    -------
    (fin_outputs, fin_targets) : two lists of per-sample lists; the outputs are
        sigmoid probabilities, the targets are the stored labels.
    """
    if loader is None:
        loader = valid_loader
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for data in loader:
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
outputs, targets = validation()
outputs = np.array(outputs) >= 0.5
accuracy = accuracy_score(targets, outputs)
f1_score_micro = f1_score(targets, outputs, average='micro')
f1_score_macro = f1_score(targets, outputs, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

# **Machine Learning-Based Multi-Label Classifiers**

In [None]:
stop = stopwords.words('english')
# Set copy of the stopword list: remove_stopwords tests membership once per
# token, and a set lookup does not scan the whole list each time.
_stop_set = frozenset(stop)

def remove_stopwords(text):
    """Lowercase ``text``, tokenize it and drop English stopwords.

    Newlines are treated as spaces. Returns the remaining tokens joined by
    single spaces.
    """
    text = text.replace("\n", " ")
    tokens = nltk.tokenize.word_tokenize(text.lower()) # split string into words (tokens)
    tokens = [t for t in tokens if t not in _stop_set] # remove stopwords
    cleanedText = " ".join(tokens)
    return cleanedText

def dataCleaning(df):
    """Return a copy of ``df`` with ``remove_stopwords`` applied via ``apply``.

    The object passed in is not modified.
    """
    return df.copy().apply(remove_stopwords)

In [None]:
y_train =  df_train.drop(['Text', 'ID'], axis=1)
y_val = df_dev.drop(['Text', 'ID'], axis=1)
y_test = df_test.drop(['Text', 'ID'], axis=1)
y_train.head()

In [None]:
X_train = df_train['Text']
X_val = df_dev['Text']
X_test = df_test['Text']
X_train.head()

In [None]:
X_train = dataCleaning(X_train)
X_val = dataCleaning(X_val)
X_test = dataCleaning(X_test)
X_train.head()

In [None]:
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# get_feature_names() no longer exists in recent scikit-learn releases; use
# get_feature_names_out() when available and fall back to the old name on
# older installations. list() keeps `features` a plain list, as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()
visualizer = FreqDistVisualizer(features=features, orient='v', size=(1080, 720))
visualizer.fit(X_train_vec)
visualizer.show()

In [None]:
# Result registries, keyed by model name and filled in by metricsReport.
Models_acc, Models_micro, Models_macro = {}, {}, {}

def metricsReport(modelName, test_labels, predictions):
    """Print a metrics summary for one model and record its headline scores.

    Computes accuracy, Hamming loss and macro/micro precision, recall and F1
    for ``predictions`` against ``test_labels``, prints them, and stores the
    accuracy, micro-F1 and macro-F1 under ``modelName`` in ``Models_acc``,
    ``Models_micro`` and ``Models_macro``.
    """
    accuracy = accuracy_score(test_labels, predictions)

    # Precision / recall / F1 for both averaging modes.
    averaged = {}
    for mode in ('macro', 'micro'):
        averaged[mode] = {
            'precision': precision_score(test_labels, predictions, average=mode),
            'recall': recall_score(test_labels, predictions, average=mode),
            'f1': f1_score(test_labels, predictions, average=mode),
        }
    macro, micro = averaged['macro'], averaged['micro']

    hamLoss = hamming_loss(test_labels, predictions)

    report_template = (
        "Accuracy: {:.4f}\nHamming Loss: {:.4f}\n"
        "Precision:\n  - Macro: {:.4f}\n  - Micro: {:.4f}\n"
        "Recall:\n  - Macro: {:.4f}\n  - Micro: {:.4f}\n"
        "F1-measure:\n  - Macro: {:.4f}\n  - Micro: {:.4f}"
    )
    print("------" + modelName + " Model Metrics-----")
    print(report_template.format(
        accuracy, hamLoss,
        macro['precision'], micro['precision'],
        macro['recall'], micro['recall'],
        macro['f1'], micro['f1'],
    ))

    Models_acc[modelName] = accuracy
    Models_micro[modelName] = micro['f1']
    Models_macro[modelName] = macro['f1']

## **KNN**

In [None]:
knn = KNeighborsClassifier()

knn.fit(X_train_vec, y_train)
knnPredictions = knn.predict(X_test_vec)
metricsReport("knn", y_test, knnPredictions)

## **Decision Tree**

In [None]:
dt = DecisionTreeClassifier()
dt.fit(X_train_vec, y_train)
dtPreds = dt.predict(X_test_vec)
metricsReport("Decision Tree", y_test, dtPreds)

## **Bagging**

In [None]:
bag = OneVsRestClassifier(BaggingClassifier(n_jobs=-1))
bag.fit(X_train_vec, y_train)
bagPreds = bag.predict(X_test_vec)
metricsReport("Bagging", y_test, bagPreds)

## **Random Forest**

In [None]:
rf = RandomForestClassifier(n_jobs=-1)
rf.fit(X_train_vec, y_train)
rfPreds = rf.predict(X_test_vec)
metricsReport("Random Forest", y_test, rfPreds)

## **Boosting**

In [None]:
boostClassifier = OneVsRestClassifier(GradientBoostingClassifier())
boostClassifier.fit(X_train_vec, y_train)
boostPreds = boostClassifier.predict(X_test_vec)
metricsReport("Boosting", y_test, boostPreds)

## **Multinomial Naive Bayes**

In [None]:
nb = OneVsRestClassifier(MultinomialNB())
nb.fit(X_train_vec, y_train)

nbPreds = nb.predict(X_test_vec)
metricsReport("Multinomial NB", y_test, nbPreds)

## **Linear SVC**

In [None]:
svm = OneVsRestClassifier(LinearSVC(), n_jobs=-1)
svm.fit(X_train_vec, y_train)

svmPreds = svm.predict(X_test_vec)
metricsReport("SVC Sq. Hinge Loss", y_test, svmPreds)

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, svmPreds))

## **Label Powerset**

In [None]:
powerSetSVC = LabelPowerset(LinearSVC())
powerSetSVC.fit(X_train_vec, y_train)

powerSetSVCPreds = powerSetSVC.predict(X_test_vec)
metricsReport("Power Set SVC", y_test, powerSetSVCPreds)

## **Comparison of ML-Based models**

In [None]:
print("  Model Name " + " "*10 + "| Macro-F1 Score")
print("-------------------------------------------")
for key, value in Models_macro.items():
    print("  " + key, " "*(20-len(key)) + "|", value)
    print("-------------------------------------------")

# **LSTM with glove embedding and single output layer**

In [None]:
y_train =  df_train.drop(['Text', 'ID'], axis=1)
y_val = df_dev.drop(['Text', 'ID'], axis=1)
y_test = df_test.drop(['Text', 'ID'], axis=1)
y_train.head()

In [None]:
X_train = df_train['Text']
X_val = df_dev['Text']
X_test = df_test['Text']
X_train.head()

In [None]:
X_train.shape

In [None]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

maxlen = 256

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_val = pad_sequences(X_val, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [None]:
print(X_train.shape)
X_train

In [None]:
embeddings_dictionary = dict()

# Each line holds a word followed by its vector components. The context
# manager closes the file even if a line fails to parse.
with open('../input/d/sarfrazahmad307/emotions/glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Skip blank lines instead of failing on records[0].
            continue
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# Row `index` of the matrix holds the vector of the word with that tokenizer
# index; words without a pretrained vector keep an all-zero row.
embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
deep_inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False)(deep_inputs)
LSTM_Layer_1 = LSTM(128)(embedding_layer)
dense_layer_1 = Dense(5, activation='sigmoid')(LSTM_Layer_1)
model = Model(inputs=deep_inputs, outputs=dense_layer_1)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
print(model.summary())

In [None]:
plot_model(model, to_file='model_plot4a.png', show_shapes=True, show_layer_names=True)

In [None]:
history = model.fit(X_train, y_train, batch_size=128, epochs=5, verbose=1, validation_data=(X_val, y_val))

In [None]:
score = model.evaluate(X_test, y_test, verbose=1)

print("Test Score:", score[0])
print("Test Accuracy:", score[1])

In [None]:
y_pred = model.predict(X_test, batch_size=128, verbose=True)

In [None]:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [None]:
y = np.array(y_test) >= 0.5
y_p = np.array(y_pred) >= 0.5

In [None]:
print(classification_report(y, y_p)) 

In [None]:
# Plot training and validation curves for accuracy and loss. The 'val_*'
# series come from the validation_data passed to model.fit, so the second
# curve is labelled 'validation' rather than 'test'.
for metric_key, metric_name in (('acc', 'accuracy'), ('loss', 'loss')):
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])

    plt.title('model ' + metric_name)
    plt.ylabel(metric_name)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

# **Word2Vec**

In [None]:
y_train =  df_train.drop(['Text', 'ID'], axis=1)
y_val = df_dev.drop(['Text', 'ID'], axis=1)
y_test = df_test.drop(['Text', 'ID'], axis=1)
y_train.head()

In [None]:
X_train = df_train['Text']
X_val = df_dev['Text']
X_test = df_test['Text']
X_train.head()

In [None]:
X_train.apply(lambda x:len(str(x).split())).max()

In [None]:
max_features = 5000
maxlen = 256

In [None]:
token=tf.keras.preprocessing.text.Tokenizer(num_words=max_features)
token.fit_on_texts(X_train)

In [None]:
X_train_seq=token.texts_to_sequences(X_train)
X_test_seq=token.texts_to_sequences(X_test)

In [None]:
#zero pad the sequences
from keras.preprocessing import sequence, text
X_train_pad = sequence.pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = sequence.pad_sequences(X_test_seq, maxlen=maxlen)

word_index = token.word_index

len(token.word_index)##251102

In [None]:
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
!gzip -d GoogleNews-vectors-negative300.bin.gz
!ls -l

In [None]:
from gensim.models import Word2Vec, KeyedVectors
# Load the pretrained GoogleNews vectors, stored in binary word2vec format
word2vec_model = KeyedVectors.load_word2vec_format("./GoogleNews-vectors-negative300.bin", binary=True)

In [None]:
# Embedding length of the selected model - 300 here, matching the
# "negative300" vectors loaded above (presumably 300-dimensional; verify).
embedding_vector_length = 300

In [None]:
#Initialize embedding matrix
embedding_matrix = np.zeros((max_features + 1, embedding_vector_length))
print(embedding_matrix.shape)

In [None]:
# Copy the pretrained word2vec vector of each word into the row given by its
# tokenizer index, visiting words in increasing index order.
for word, i in sorted(token.word_index.items(),key=lambda x:x[1]):
    # embedding_matrix has max_features + 1 rows, so the last valid row index
    # is max_features. The previous bound (max_features + 1) allowed one
    # out-of-range index whose IndexError was hidden by a bare except.
    if i > max_features:
        break
    try:
        embedding_matrix[i] = word2vec_model[word]
    except KeyError:
        # Word has no pretrained vector: leave its row at zero.
        continue

embedding_matrix

In [None]:
#Initialize model
import tensorflow as tf
tf.keras.backend.clear_session()
model = tf.keras.Sequential()

In [None]:
from keras.layers.recurrent import LSTM, GRU,SimpleRNN

# A SimpleRNN on top of frozen, pretrained word2vec embeddings, followed by one
# dense output layer. The model is assembled from tf.keras objects only, so the
# container and all layers come from the same Keras implementation as the
# Embedding layer (mixing standalone `keras` and `tf.keras` objects in one
# model can fail, depending on the installed versions).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(max_features + 1, #Vocabulary size
                                    embedding_vector_length, #Embedding size
                                    weights=[embedding_matrix], #Embeddings taken from pre-trained model
                                    trainable=False, #As embeddings are already available, we will not train this layer. It will act as lookup layer.
                                    input_length=maxlen) #Number of tokens in each padded sequence
         )
model.add(tf.keras.layers.SimpleRNN(100))
model.add(tf.keras.layers.Dense(5, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
# Validate on the dev split so the test split stays untouched until the final
# evaluation. The dev texts are tokenized and padded exactly like the
# training and test texts.
X_val_pad = sequence.pad_sequences(token.texts_to_sequences(X_val), maxlen=maxlen)

history = model.fit(X_train_pad,
                    y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(X_val_pad, y_val))

In [None]:
y_pred = model.predict(X_test_pad, batch_size=1000, verbose=True)

In [None]:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

y = np.array(y_test) >= 0.5
y_p = np.array(y_pred) >= 0.5

In [None]:
print(classification_report(y, y_p)) 

In [None]:
accuracy_score(y,y_p)