### How to run this notebook

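Note: The folders with the airlinetweets and MELD data files, as provided in the course, need to be placed in the same directory as this notebook. The MELD cells below open `train_sent_emo.csv`, `test_sent_emo.csv` and `run_sent_emo.csv` by bare file name, so these files must be reachable from the notebook's working directory.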

### Table of Contents

* [1. Install and import](#section1)
* [2. Inspect the datasets](#section2)
   * [2.2 MELD](#section2.2)
* [3. Define the functions](#section3)
   * [3.1 load the data](#section3.1)
   * [3.2 convert the data to numerical representations](#section3.2)
   * [3.3 train the classifier and get the predictions](#section3.3)
   * [3.4 print the report](#section3.4)
* [4. Run the functions](#section4)
   * [4.2 Emotion analysis with MELD](#section4.2)
* [5. Apply the trained classifiers](#section5)
   * [5.1 Sentiment analysis trained with airlinetweets data](#section5.1)
   * [5.2 Emotion analysis trained with MELD data](#section5.2)

## 1. Install and import <a class="anchor" id ="section1"></a> 

In [None]:
%pip install pandas

In [None]:
import pathlib
import sklearn
import numpy
import nltk
import pandas as pd
from nltk.corpus import stopwords
from collections import Counter
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## 2. Inspect the datasets <a class="anchor" id ="section2"></a> 

##### 2.1 airlinetweets <a class="anchor" id ="section2.1"></a> 

##### 2.2 MELD <a class="anchor" id ="section2.2"></a> 

In [None]:
# MELD
# The textual data of MELD is split over three structured csv files:
# train_sent_emo.csv, dev_sent_emo.csv and test_sent_emo.csv.
# For this kind of data format pandas is a convenient loader: https://pandas.pydata.org
# Path to the MELD data; to inspect another split, change 'train' to 'dev' or 'test'.
filepath_MELD = 'train_sent_emo.csv'

# Create the dataframe 'df_MELD' by reading the file.
df_MELD = pd.read_csv(filepath_MELD)

# Fix encoding problems and replace the 'Utterance' column with the cleaned strings.
# regex=True is required: the pattern is an alternation of single characters, and since
# pandas 2.0 str.replace treats the pattern as a literal string unless told otherwise,
# in which case nothing would be replaced.
df_MELD['Utterance'] = df_MELD['Utterance'].str.replace("\x92|\x97|\x91|\x93|\x94|\x85", "'", regex=True)

# Print the first cell of the 'Utterance' column to inspect it.
print(df_MELD['Utterance'][0])
print('----------------------------------------------------------------------------------------------')
# Print the first 5 rows of the dataframe (all columns).
print(df_MELD.head(5))
print('----------------------------------------------------------------------------------------------')
# Print the last 5 rows of the dataframe (all columns).
print(df_MELD.tail(5))
print('----------------------------------------------------------------------------------------------')
# Print how often each emotion label occurs.
print(df_MELD['Emotion'].value_counts())
print('----------------------------------------------------------------------------------------------')
# Print how often each sentiment label occurs.
print(df_MELD['Sentiment'].value_counts())
# Show the last 5 rows without print(): the notebook renders the dataframe as a table,
# which is much easier to read. This only works as the LAST expression of a cell.
df_MELD.tail(5)

## 3. Define the functions <a class="anchor" id ="section3"></a>

##### 3.1 load the data <a class="anchor" id ="section3.1"></a> 

In [None]:
# load MELD data
def load_MELD_data(train_path='train_sent_emo.csv', test_path='test_sent_emo.csv', max_train_instances=None):
    """Load the MELD training and test utterances together with their emotion labels.

    Both csv files are read with pandas; the 'Utterance' column is cleaned and the
    'Utterance' and 'Emotion' columns are returned as plain Python lists.

    Parameters
    ----------
    train_path : str
        Path to the training csv file (needs 'Utterance' and 'Emotion' columns).
    test_path : str
        Path to the test csv file (needs 'Utterance' and 'Emotion' columns).
    max_train_instances : int or None
        If given, only the first `max_train_instances` training rows are kept.
        Useful when the full data set is too heavy for your computer.

    Returns
    -------
    tuple
        (training_instances, test_instances, training_labels, test_labels, target_labels)
        where target_labels is the alphabetically sorted list of all distinct labels
        found in the training and test data.
    """
    # The data has some problematic characters caused by encoding problems;
    # each of them is replaced by an apostrophe.
    problematic_chars = "\x92|\x97|\x91|\x93|\x94|\x85"

    # regex=True is required: the pattern is an alternation, and since pandas 2.0
    # str.replace treats the pattern as a literal string by default, which would
    # silently leave every utterance untouched.
    dftrain = pd.read_csv(train_path)
    dftrain['Utterance'] = dftrain['Utterance'].str.replace(problematic_chars, "'", regex=True)

    dftest = pd.read_csv(test_path)
    dftest['Utterance'] = dftest['Utterance'].str.replace(problematic_chars, "'", regex=True)

    # To prepare the vectorization, collect the sentences and labels in lists.
    # Instances and labels are cut at the same position so that they stay aligned.
    training_instances = dftrain['Utterance'].tolist()[:max_train_instances]
    ### print the length of our list to see if all data are loaded
    print(len(training_instances))

    training_labels = dftrain['Emotion'].tolist()[:max_train_instances]
    ### Check if we have the same number of labels
    print(len(training_labels))

    test_instances = dftest['Utterance'].tolist()
    test_labels = dftest['Emotion'].tolist()

    # Sorted, so that position i in this list is the label a LabelEncoder fitted on the
    # same labels maps to class index i (a plain set has no dependable order).
    target_labels = sorted(set(test_labels + training_labels))

    return training_instances,test_instances,training_labels,test_labels,target_labels

##### 3.2 convert the data to numerical representations <a class="anchor" id ="section3.2"></a> 

In [None]:
# convert MELD data to numerical representation
def data_MELD_to_nrrepr(training_instances,test_instances,training_labels,test_labels):
    """Turn utterances into TF-IDF vectors and emotion labels into integer classes.

    Parameters
    ----------
    training_instances, test_instances : list of str
        The utterances of the training and the test data.
    training_labels, test_labels : list
        The emotion label of every training / test utterance.

    Returns
    -------
    tuple
        (training_tfidf_vectors, test_tfidf_vectors, training_classes, test_classes,
        utterance_vec), where utterance_vec is the CountVectorizer fitted on the
        training utterances.
    """
    # Turn the (utterance) training data into count vectors.
    frequency_threshold = 4
    utterance_vec = CountVectorizer(min_df=frequency_threshold, # tokens that occur in fewer documents than this are ignored
                                    tokenizer=nltk.word_tokenize # we use the nltk tokenizer
                                    ) # note: no stop_words argument is given, so stopwords are kept

    training_count_vectors = utterance_vec.fit_transform(training_instances)
    print(training_count_vectors.shape)

    # Convert raw frequency counts into TF-IDF values.
    # Why TF-IDF? The CountVectorizer treats all words equally (bag-of-words), so a frequent
    # word such as 'and' may get more weight than a much more informative word.
    # TF-IDF gives less weight to words that occur in many documents.
    # An extensive explanation: https://www.freecodecamp.org/news/how-to-process-textual-data-using-tf-idf-in-python-cd2bbc0a94a3/
    # The shape remains the same but the values are now scores between zero and one.
    tfidf_transformer = TfidfTransformer()
    training_tfidf_vectors = tfidf_transformer.fit_transform(training_count_vectors)

    # Turn the test data into vectors with the vectorizer and IDF weights learned from the
    # TRAINING data. Only transform() here: fit_transform() would recompute the IDF weights
    # from the test set, so the test features would no longer be on the scale the
    # classifier is trained on.
    test_count_vectors = utterance_vec.transform(test_instances)
    test_tfidf_vectors = tfidf_transformer.transform(test_count_vectors)

    # We feed this encoder with the complete list of labels from our data,
    # both the training and the test labels.
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(training_labels+test_labels)

    training_classes = label_encoder.transform(training_labels)
    test_classes = label_encoder.transform(test_labels)
    return training_tfidf_vectors,test_tfidf_vectors,training_classes,test_classes,utterance_vec

##### 3.3 train the classifier and get the predictions <a class="anchor" id ="section3.3"></a> 

In [None]:
# train classifier and make predictions
def train_n_pred_clf(docs_train, y_train,docs_test):
    """Fit a Multinomial Naive Bayes classifier and predict the classes of the test data.

    Why a multinomial classifier? We are dealing with more than two labels, so a
    multi-class model is needed. Another option would be sklearn's svm.LinearSVC:
    https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html

    Returns the predictions for `docs_test` and the trained classifier, in that order.
    """
    # Train the classifier on the training vectors and their classes.
    clf = MultinomialNB()
    clf.fit(docs_train, y_train)
    # Let the trained classifier label the test vectors.
    predictions = clf.predict(docs_test)
    return predictions, clf

##### 3.4 print the report <a class="anchor" id ="section3.4"></a>

In [None]:
# print the report
def show_report(y_test,y_pred,target_labels):
    """Print the classification report, the target labels and the confusion matrix.

    `y_test` holds the gold classes, `y_pred` the predicted classes; `target_labels`
    is only printed as a reminder of which labels exist.
    """
    divider = '------------------------------------------------------------'
    report = classification_report(y_test, y_pred, digits=7)

    print('Report')
    print(report)
    print(divider)

    # Reminder of the labels in the data.
    print('Target labels')
    print(target_labels)
    print(divider)

    # How to read a confusion matrix:
    # https://www.aboutdatablog.com/post/reading-a-confusion-matrix
    print('Confusion matrix')
    matrix = sklearn.metrics.confusion_matrix(y_test, y_pred)
    print(matrix)

## 4. Run the functions <a class="anchor" id ="section4"></a> 

##### 4.1 Sentiment analysis with airlinetweets <a class="anchor" id ="section4.1"></a> 

##### 4.2 Emotion analysis with MELD <a class="anchor" id ="section4.2"></a> 

In [None]:
# MELD: load the data, vectorize it, train the classifier and report on the test set.
(
    training_instances,
    test_instances,
    training_labels,
    test_labels,
    target_labels_MELD,
) = load_MELD_data()

(
    training_tfidf_vectors,
    test_tfidf_vectors,
    training_classes,
    test_classes,
    utterance_vec,
) = data_MELD_to_nrrepr(
    training_instances,
    test_instances,
    training_labels,
    test_labels,
)

y_pred, clf_MELD = train_n_pred_clf(
    training_tfidf_vectors,
    training_classes,
    test_tfidf_vectors,
)

show_report(test_classes, y_pred, target_labels_MELD)

## 5. Apply the trained classifiers  <a class="anchor" id ="section5"></a> 
##### 5.1 Sentiment analysis trained with airlinetweets data <a class="anchor" id ="section5.1"></a> 

##### 5.2 Emotion analysis trained with MELD data <a class="anchor" id ="section5.2"></a> 

In [None]:
# Hand-written example sentences, each paired with its gold emotion label.
labelled_examples = [
    ('Two thumbs up', 'joy'),
    ('I fell asleep halfway through', 'anger'),
    ("We can't wait for the sequel!!", 'joy'),
    ('I cannot recommend this highly enough', 'joy'),
    ('instant classic.', 'neutral'),
    ('Steven Seagal was amazing.', 'surprise'),
]
# Split the pairs into the two parallel lists used by the cells below.
txt_emo = [sentence for sentence, _ in labelled_examples]
gold_emo = [emotion for _, emotion in labelled_examples]

In [None]:
# We re-use utterance_vec so that the new sentences are mapped onto the same vocabulary
# as the training data. txt_emo_counts is a matrix in which each row is the count-vector
# representation of one sentence.
txt_emo_counts = utterance_vec.transform(txt_emo)
#print(txt_emo_counts.shape)

# The IDF weights have to come from the training data, not from the handful of sentences
# we want to classify: fitting the transformer on the new sentences would give features
# on a different scale than the ones clf_MELD was trained on.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(utterance_vec.transform(training_instances))
# we compute the tf-idf values with the training IDF weights
txt_emo_tfidf = tfidf_transformer.transform(txt_emo_counts)
# have the classifier make a prediction
y_pred_emo = clf_MELD.predict(txt_emo_tfidf)
print(y_pred_emo)

# Encode the gold labels as integer classes so they can be compared with the predictions.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(target_labels_MELD)
gold_emo_classes = label_encoder.transform(gold_emo)

show_report(gold_emo_classes,y_pred_emo,target_labels_MELD)

In [None]:
# Show every sentence next to the emotion label that was predicted for it.
for sentence, class_index in zip(txt_emo, y_pred_emo):
    predicted_emotion = label_encoder.classes_[class_index]
    print(f'{sentence} => {predicted_emotion}')

In [None]:
# Collect one "sentence<TAB>predicted label" entry per example sentence.
thing = [
    sentence + "\t" + label_encoder.classes_[class_index]
    for sentence, class_index in zip(txt_emo, y_pred_emo)
]

print(thing)
    

In [None]:
# Read the utterances to classify: every non-blank line of the file is one utterance.
# (nltk is already imported in the import cell at the top of the notebook.)
filename = 'run_sent_emo.csv'

with open(filename, "r", encoding = 'utf-8') as infile:
    # Strip the line terminator and skip blank lines, so that no empty utterance
    # is handed to the classifier and written to the results.
    csv_data = [row.rstrip("\n") for row in infile if row.strip()]

# Show how many utterances were read and a short preview instead of dumping everything.
print(len(csv_data))
print(csv_data[:5])

In [None]:
# Map the utterances from the file onto the vocabulary learned from the training data.
txt_emo_counts = utterance_vec.transform(csv_data)
#print(txt_emo_counts.shape)

# The IDF weights have to come from the training data: fitting the transformer on the
# utterances we want to classify would give features on a different scale than the ones
# clf_MELD was trained on.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(utterance_vec.transform(training_instances))
# we compute the tf-idf values with the training IDF weights
txt_emo_tfidf = tfidf_transformer.transform(txt_emo_counts)
# have the classifier make a prediction
y_pred_emo = clf_MELD.predict(txt_emo_tfidf)
print(y_pred_emo)

In [None]:
# Build the tsv text: a header line followed by one "utterance<TAB>label" line per prediction.
header_line = "Utterance\tEmotion\n"
thing = [header_line] + [
    utterance + "\t" + label_encoder.classes_[class_index] + "\n"
    for utterance, class_index in zip(csv_data, y_pred_emo)
]

fulltext_tsv = "".join(thing)
print(fulltext_tsv)

# Store the predictions next to the notebook.
save_filename = "results.tsv"
with open(save_filename, 'w', encoding = 'utf-8') as outfile:
    outfile.write(fulltext_tsv)