In [33]:
# Importing the required packages
import pandas as pd
import re
import numpy as np
import os
import gensim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neural_network import MLPClassifier
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 
import nltk
nltk.download('punkt')
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import collections
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from itertools import compress
import pandas as pd


# Setting the directory
os.chdir('C:/Users/Danish/JupyterNotebooks/IntrotoAI/Homework/HW4')

# Specifying columns as the data as extra columns in some rows
column_names = ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags']

# Importing the datasets
train = pd.read_csv('empatheticdialogues/train.csv', names = column_names)
valid = pd.read_csv('empatheticdialogues/valid.csv', names = column_names)
test = pd.read_csv('empatheticdialogues/test.csv', names = column_names)


# Excluding the first row as it has column names
train = train[1:]
valid = valid[1:]
test = test[1:]

# Appending validation dataset to train dataset in order to increase the number of training data points
data = train.append(valid)

### Data ETL

## 1. a.
# Filtering out all rows where the sentiment (‘context’) is not in the list of aforementioned list of sentiments 
# i.e. {'sad', 'jealous', 'joyful', 'terrified'}.
data_context_filt = data.loc[data['context'].isin(['sad', 'jealous', 'joyful', 'terrified'])]
test_data_context_filt = test.loc[test['context'].isin(['sad', 'jealous', 'joyful', 'terrified'])]


## 1. b.
# synthesize your training attributes and labels 
# i.e. ‘utterance’ as the attributes and ‘context’ as the label
train_data = data_context_filt[['utterance', 'context']] 
test_data = test_data_context_filt[['utterance', 'context']] 

# Creating list of utterances
train_data_list = list(train_data['utterance'])
test_data_list = list(test_data['utterance'])


# Cleaning the dataset
# Removing special characters and numbers
train_data_list_cleaned = [re.sub('[^a-zA-Z\n\.]', ' ', string) for string in train_data_list]
# Removing full stop
train_data_list_cleaned = [string.replace(".", "") for string in train_data_list_cleaned]
# Removing extra spaces from beginning and end
train_data_list_cleaned = [string.strip() for string in train_data_list_cleaned]
# Converting to lowercase
train_data_list_cleaned = [string.lower() for string in train_data_list_cleaned]

test_data_list_cleaned = [re.sub('[^a-zA-Z\n\.]', ' ', string) for string in test_data_list]
test_data_list_cleaned = [string.replace(".", "") for string in test_data_list_cleaned]
test_data_list_cleaned = [string.strip() for string in test_data_list_cleaned]
test_data_list_cleaned = [string.lower() for string in test_data_list_cleaned]

### 2
## Converting the utterances into a sparse bag-of-words 

train_count_vectorizer = CountVectorizer()
X = train_count_vectorizer.fit_transform(train_data_list_cleaned)
encoding = X.toarray()
# Converting counts to 1 and 0
for arr in encoding:
    arr[arr > 0] = 1

### 3 
## The shortcomings with the previous representation are as follows
#    1. There are some words which are not useful in the model because they do not encode any useful information, like propositions. These words need to be removed to reduce the dimensions
#    2. Some words repaeat across multiple training data points and are thus less useful in differentiating between classes. These words need to be given lesser weights  
        
# Getting the list of stopwords and appending additional words to it
stopwords_list = list(set(stopwords.words('english')))
stopwords_list.extend(['comma', ''])  

# Removing the stopwords
train_data_stop_removed = []
for row in train_data_list_cleaned:
    tokens_without_sw = [word for word in row.split(" ") if not word in stopwords_list]
    tokens_without_sw = " ".join(tokens_without_sw)
    train_data_stop_removed.append(tokens_without_sw)
    
    
test_data_stop_removed = []
for row in test_data_list_cleaned:
    tokens_without_sw = [word for word in row.split(" ") if not word in stopwords_list]
    tokens_without_sw = " ".join(tokens_without_sw)
    test_data_stop_removed.append(tokens_without_sw)


# Creating the bag of words encoding again  
train_count_vectorizer = CountVectorizer()
X_train = train_count_vectorizer.fit_transform(train_data_stop_removed)

train_one_hot_encoding = X_train.toarray()

for arr in train_one_hot_encoding:
    arr[arr > 0] = 1
    


# Getting the labels
train_labels_unique = list(train_data['context'].unique())
label_mapper = {}
num = 0
for label in train_labels_unique:
    label_mapper[label] = num
    num += 1


train_labels = list(train_data['context'])
train_labels_encoded = []
for label in train_labels:
    train_labels_encoded.append(label_mapper[label])


### 4. Normalization
# Normalizing the training data using tfidf transformer 
train_tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True)
train_embedding_tfidf_transformer = train_tfidf_transformer.fit_transform(train_one_hot_encoding)


### 5. Building an SGD Classifier

# The error analysis must include the test accuracy, confusion matrix and a few misclassified examples and your thoughts on why those utterances were misclassified by the example.
X_train = train_embedding_tfidf_transformer
y_train = np.array(train_labels_encoded)
clf = SGDClassifier(loss="modified_huber", penalty="l2", max_iter=1000)
clf.fit(X_train, y_train)
    
    
# Using training data vocabulary on test data so that the features are consistent    
test_count_vectorizer = CountVectorizer(vocabulary = train_count_vectorizer.get_feature_names())
X_test = test_count_vectorizer.fit_transform(test_data_stop_removed)

test_one_hot_encoding = X_test.toarray()

for arr in test_one_hot_encoding:
    arr[arr > 0] = 1

# Normalizing the test data  
test_tfidf_transformer = TfidfTransformer(smooth_idf=False,use_idf=True)
test_embedding_tfidf_transformer = test_tfidf_transformer.fit_transform(test_one_hot_encoding)

# Getting predictions on test data
test_predicted_labels = clf.predict(test_embedding_tfidf_transformer)

# Getting test labels
labels_test = list(test_data['context'])
labels_encoded_test = []
for label in labels_test:
    labels_encoded_test.append(label_mapper[label])
labels_encoded_test = np.array(labels_encoded_test)

print('Test accuracy using SGD classifier is :', np.mean(labels_encoded_test == test_predicted_labels))

f1_score_vector = f1_score(labels_encoded_test, test_predicted_labels, average=None)

print('Confusion matrix using SGD classifier is :', confusion_matrix(labels_encoded_test, test_predicted_labels))

print('f1 score using SGD classifier is :', np.mean(f1_score_vector))

# 
misclassified_cleaned = list(compress(test_data_list_cleaned, list(labels_encoded_test != test_predicted_labels)))
misclassified_stop_removed = list(compress(test_data_stop_removed, list(labels_encoded_test != test_predicted_labels)))
mapper_label = dict([(value, key) for key, value in label_mapper.items()])
labels_test = [mapper_label[x] for x in labels_encoded_test]
predicted_labels_test = [mapper_label[x] for x in test_predicted_labels]
misclassified_actual = list(compress(labels_test, list(labels_encoded_test != test_predicted_labels)))
misclassified_predicted = list(compress(predicted_labels_test, list(labels_encoded_test != test_predicted_labels)))
df = pd.DataFrame(list(zip(misclassified_cleaned, misclassified_stop_removed, misclassified_actual, misclassified_predicted)), columns = ['cleaned', 'stop removed', 'actual', 'predicted'])

# Thoughts on why some occurences are misclassified
'''
Example 1:
actual utterance: 'oh no   what happened to it'
utterance stop removed: 'oh happened'
actual label: 'sad'
predicted label: 'terrified'

Comment: Here we do not have any word that conveys an emotion apart from 'oh' which can be either sad or terrified.
         The original sentence without stop words removed would have been better to predict


Example 2:
actual utterance: 'thank you i appreciate that'
utterance stop removed: 'thank appreciate'
actual label: 'sad'
predicted label: 'joyful'

Comment: Here the predicted label is correct and it seems that the labelling is wrong
'''

### 6. Classifier using pretrained embeddings

# Tokenizing the data
train_tokens = [nltk.word_tokenize(sentences) for sentences in train_data_stop_removed]
train_y = np.array(train_labels_encoded)

test_tokens = [nltk.word_tokenize(sentences) for sentences in test_data_stop_removed]
test_y = np.array(labels_encoded_test)

# Loading the pretrained word2vec model from Google
# https://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/
model = gensim.models.KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True)

w2v = dict(zip(model.wv.index2word, model.wv.syn0))

# Reference for the class code below
# http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/

# Creating features from word embeddings
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = model.wv.vector_size

    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        max_idf = max(tfidf.idf_)
        self.word2weight = collections.defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

                
# Creating a pipeline for word2vec Vectorizer and MLP Classifier              
clf_tfidf_mlp = Pipeline([
    ("word2vec Vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("MLP Classifier", MLPClassifier(solver = 'adam', alpha = 1e-5, hidden_layer_sizes=(100,100,100), random_state=1, max_iter=500, learning_rate='adaptive', learning_rate_init=0.01))
    ])

# Fitting the model
X = train_tokens
y = train_labels_encoded
clf_tfidf_mlp.fit(X, y)


test_X = test_tokens

pred_y = clf_tfidf_mlp.predict(test_X)

print('Test accuracy using MLP classifier is :', np.mean(test_y == pred_y))

f1_score_vector = f1_score(test_y, pred_y, average=None)

print('f1-score for MLP classifier is :', np.mean(f1_score_vector))

print('Confusion matrix using MLP classifier is :', confusion_matrix(test_y, pred_y))

'''
Read the paper at https://arxiv.org/pdf/1811.00207.pdf and answer the following questions:
1) What does this paper mean by "fine-tuning" results? How might you use such fine-tuning in building an empathetic chatbot?
2) What properties of the transformer architecture make it well suited for this application?
3) Explain the metrics used to evaluate performance in Table 1 (P@1,100, AVG-BLEU, and PPL).
4) Which of the metrics do you think provides the best measure of performance of empathic systems and why?
5) Based on table 1 and 2, and your reading of the paper, what do you think would help the system get to human-level performance?
'''

### 1 
'''
Fine-tuning the results means training a pre-trained model on the data generated by the chatbot 
so that it can produce more contextual results. For example, BERT is a model pre-trained on English Wikipedia. 
In this paper, BERT is first trained on Reddit conversations and then fine-tuned on the actual dataset. 
The pre-training helps because a large amount of data is used for training deep learning models 
whereas the chatbot may not have produced a lot of data initially. 
Hence, the idea is to use a pre-trained model and then improve its parameters by training again on the chatbot data. 
'''

### 2
'''
1. Transformer models are attention based models, that is, they see the entire sentence as a whole unlike RNN 
where the sentence is processed one word at each time step. Transformer sees all words simultaneously and there is no 
backpropagation through time. They capture long term dependencies easily because they see all words at once.

2. Attention models solve both alignment and translation problem. Alignment is the problem in machine translation that 
identifies which parts of the input sequence are relevant to each word in the output. Rranslation is the process of 
using the relevant information to select the appropriate output.
'''

### 3
'''
P@1,100:
It is the accuracy of choosing the correct responses out of a hundred randomly selected examples in the test 
set. Here the actual response is included in the candidates.

AVG-BLEU: 
- The idea behind BLEU is the closer a machine translation is to a professional human translation, the better it is. BLEU 
  score measures the difference between human and machine translation output
- It looks at the presence or absence of particular words, as well as the ordering and the degree of distortion, that is, 
  how much they actually are separated in the output
- Its evaluation requires two inputs, a numerical translation closeness metric and a corpus of human reference translations.
  BLEU averages out various metrics using an n-gram method
- The result is typically measured on a 0 to 1 scale, with 1 representing perfect translation

PPL:
- Perplexity measures how well a model predicts a sample
- Perplexity of a random variable X may be defined as the perplexity of the distribution over its possible values x
- A low perplexity indicates the probability distribution is good at predicting the sample
'''

### 4
''' 
BLEU is useful because it is ubiquitous, which makes it easy to compare your model to benchmarks on the same task. 
However, BLEU doesn’t consider meaning and it doesn’t directly consider sentence structure.
Perplexity is often used as a quality measure for language models. Language model perplexity has been used for 
domain adaptation. 
We should use both these metrics to measure the performance of emphatic systems as they both have their own pros and cons.
'''

### 5
'''
- We need a much larger, relevant and correct set of training data to be able to reach human level performance. The dataset 
  that we have provides description of a situation, and the responses of subjects here may be different than how they would 
  have responded in the real world 
- We need more robust metrics for evaluating the performance of models that can consider meaning and sentence structure
'''

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Danish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Danish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  interactivity=interactivity, compiler=compiler, result=result)
  idf = np.log(n_samples / df) + 1


Test accuracy using SGD classifier is : 0.6063907044299202
Confusion matrix using SGD classifier is : [[212  24  37  25]
 [ 34 210  58  53]
 [ 56  48 234  36]
 [ 46  59  66 179]]
f1 score using SGD classifier is : 0.606285690789388


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Test accuracy using MLP classifier is : 0.5882352941176471
f1-score for MLP classifier is : 0.5884901228221464
Confusion matrix using MLP classifier is : [[213  30  32  23]
 [ 45 190  58  62]
 [ 50  50 226  48]
 [ 44  62  63 181]]


'\n- We need a much larger, relevant and correct set of training data to be able to reach human level performance. The dataset \n  that we have provides description of a situation, and the responses of subjects here may be different than how they would \n  have responded in the real world \n- We need more robust metrics for evaluating the performance of models that can consider meaning and sentence structure\n'