In [None]:
!pip install transformers



In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
from tensorflow import keras
from tensorflow.keras.layers import Dense,Dropout, Input
from tqdm import tqdm
import pickle
from sklearn.metrics import confusion_matrix,f1_score,classification_report
import matplotlib.pyplot as plt
import itertools
from sklearn.utils import shuffle
from tensorflow.keras import regularizers
from transformers import *
from transformers import BertTokenizer, TFBertModel, BertConfig,TFDistilBertModel,DistilBertTokenizer,DistilBertConfig

In [None]:
# Read the 'Primary' sheet of the gold-standard workbook into a dataframe.
df = pd.read_excel(
    '/content/drive/MyDrive/Colab Notebooks/IFT-6010-project/Senti4SD_GoldStandard_EmotionPolarity.xlsx',
    sheet_name='Primary',
)

In [None]:
# Preview the last rows of the loaded sheet as a quick sanity check.
df.tail()

Unnamed: 0,Text,Label,Comments
221,Is `PrefixFilter` also expanded into boolean c...,neutral,
222,Great! I'm glad you like it.,happiness,
223,"Awesome, that did it! Thanks David!",happiness,
224,Using a unique identifiers would allow you to ...,uncertainty,
225,Another very fast approach is the [seek method...,uncertainty,


In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(226, 3)

In [None]:
# How many examples carry each emotion label.
df['Label'].value_counts()

uncertainty    57
happiness      55
sadness        41
neutral        41
worry          32
Name: Label, dtype: int64

In [None]:
# Switch the working directory to the project folder so that relative paths
# used later (e.g. the 'dbert_model.h5' weights file) resolve there.
%cd /content/drive/MyDrive/Colab Notebooks/IFT-6010-project/

/content/drive/MyDrive/Colab Notebooks/IFT-6010-project


In [None]:
# Width of the softmax output layer built in create_model().
num_classes = 5
# Fixed sequence length: shape of both model inputs in create_model() and the
# tokenizer's max_length in predict_sentiment().
max_len = 128

In [None]:
# Load the 'distilbert-base-uncased' tokenizer (used in predict_sentiment) and the
# TensorFlow DistilBERT model (called as the encoder inside create_model).
dbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
dbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_transform', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [None]:
def create_model():
    """Build the DistilBERT-based classifier (uncompiled).

    The network takes two int64 inputs of length ``max_len`` (token ids and
    attention mask), runs them through the module-level ``dbert_model``, keeps
    the vector at sequence position 0 of its first output (presumably the
    [CLS] representation), and feeds it through an L2-regularised dense stack
    512 -> dropout(0.2) -> 256 -> 128 -> ``num_classes`` (softmax).

    Returns:
        tf.keras.Model with inputs ``[token ids, attention mask]`` and a
        softmax output of width ``num_classes``.
    """
    def regularized_dense(units, activation):
        # Each layer gets its own L2(0.01) kernel regulariser instance.
        return Dense(units, activation=activation, kernel_regularizer=regularizers.l2(0.01))

    token_ids = Input(shape=(max_len,), dtype='int64')
    attention = Input(shape=(max_len,), dtype='int64')

    encoder_output = dbert_model(token_ids, attention_mask=attention)[0]
    first_token_vector = encoder_output[:, 0, :]

    # Layers are created in the same order as before so that weights saved
    # for this architecture still line up when loaded.
    hidden = regularized_dense(512, 'relu')(first_token_vector)
    hidden = Dropout(0.2)(hidden)
    hidden = regularized_dense(256, 'relu')(hidden)
    hidden = regularized_dense(128, 'relu')(hidden)
    probabilities = regularized_dense(num_classes, 'softmax')(hidden)

    return tf.keras.Model(inputs=[token_ids, attention], outputs=probabilities)

In [None]:
# Rebuild the architecture and restore the previously saved weights.
trained_model = create_model()

# create_model() ends in a softmax layer, so the loss receives probabilities,
# not logits; from_logits must therefore be False. This only matters if the
# model is evaluated or trained further; predict() is unaffected.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

trained_model.compile(loss=loss,optimizer=optimizer, metrics=[metric])
# Relative path: resolved against the current working directory.
trained_model.load_weights('dbert_model.h5')



In [None]:
# Fetch the NLTK stop-word corpus read by clean_stopwords_shortwords()
# through stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Preprocessing and cleaning functions

def unicode_to_ascii(s):
    """Remove combining marks (accents) from ``s``.

    The string is decomposed with NFD normalisation and every character of
    Unicode category 'Mn' (non-spacing mark) is dropped. Characters without a
    decomposition are kept unchanged, so the result is not guaranteed to be
    pure ASCII despite the function name.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def clean_stopwords_shortwords(w):
    """Drop English stop words and words of one or two characters.

    Args:
        w: whitespace-separated text.

    Returns:
        The remaining words joined by single spaces (empty string if none remain).
    """
    # A set gives constant-time membership tests; the NLTK list would be
    # scanned linearly for every word.
    stopword_set = set(stopwords.words('english'))
    clean_words = [word for word in w.split() if word not in stopword_set and len(word) > 2]
    return " ".join(clean_words)

def preprocess_sentence(w):
    """Normalise raw text before it is tokenised.

    Steps, in order: lowercase and strip, remove accents, replace the
    punctuation ``? . ! , ¿`` with spaces, collapse runs of spaces/double
    quotes, replace every remaining run of non-letters with a space, then
    remove stop words and words shorter than three characters.

    Args:
        w: raw input string.

    Returns:
        The cleaned string (letters separated by single spaces).
    """
    text = unicode_to_ascii(w.lower().strip())

    # Order matters: each substitution works on the result of the previous one.
    ordered_substitutions = (
        (r"([?.!,¿])", r" "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in ordered_substitutions:
        text = re.sub(pattern, replacement, text)

    text = clean_stopwords_shortwords(text)

    # NOTE(review): at this point the text contains only letters and spaces,
    # so '@' cannot occur and this mention filter never matches; user names
    # from "@name" survive as ordinary words. Left as is to keep preprocessing
    # identical to what the saved weights were presumably trained with.
    return re.sub(r'@\w+', '', text)

In [None]:
# Label name for each model output index: predict_sentiment() reports
# target_names[i] for output position i.
# NOTE(review): presumably this order matches the label encoding used at training time — confirm.
target_names = ['happiness', 'sadness', 'worry', 'uncertainty', 'neutral']

In [None]:
def predict_sentiment(sent):
  """Classify the emotion of a single sentence with ``trained_model``.

  Args:
      sent: raw input text.

  Returns:
      dict with the keys
        'original sentence'  - ``sent`` unchanged,
        'processed sentence' - output of ``preprocess_sentence(sent)``,
        'predicted_label'    - entry of ``target_names`` with the highest score,
        'confidence_scores'  - list of ``(label, score)`` pairs in ``target_names`` order.
  """
  processed_sent = preprocess_sentence(sent)

  # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
  # together with `truncation=True` every sequence is exactly max_len long.
  encoded = dbert_tokenizer.encode_plus(
      processed_sent,
      add_special_tokens=True,
      max_length=max_len,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
  )

  # The model expects a batch dimension, so wrap the single example in a list.
  input_id = np.asarray([encoded['input_ids']])
  attention_mask = np.asarray([encoded['attention_mask']])

  example_pred = trained_model.predict([input_id, attention_mask], batch_size=1)
  predicted_index = example_pred.argmax(axis=1)[0]

  return {
      'original sentence': sent,
      'processed sentence': processed_sent,
      'predicted_label': target_names[predicted_index],
      'confidence_scores': list(zip(target_names, example_pred[0])),
  }

In [None]:
# Look at the first row and at its untruncated text.
df.iloc[0], df.iloc[0]['Text']

(Text        Excellent, happy to help! If you don't mind, c...
 Label                                               happiness
 Comments                                                  NaN
 Name: 0, dtype: object,
 "Excellent, happy to help! If you don't mind, can you accept my answer?")

In [None]:
# Try the predictor on one dataset row (positional index 35).
predict_sentiment(df.iloc[35]['Text'])



{'confidence_scores': [('happiness', 0.0030835222),
  ('sadness', 0.011090083),
  ('worry', 0.0077708825),
  ('uncertainty', 0.9749171),
  ('neutral', 0.0031383953)],
 'original sentence': "I have a big problem. I've created site, using jQuery, but it runs very slowly. On mobile phones it's terrible! And I don't know, what's wrong... Someone can help me? Links doesn't works yet, because I want use CMS on this layout, but before I want to little optimise these scripts. Here's test site: And here are the scripts:",
 'predicted_label': 'uncertainty',
 'processed sentence': 'big problem created site using jquery runs slowly mobile phones terrible know wrong someone help links works yet want use cms layout want little optimise scripts test site scripts'}

In [None]:
# Run the classifier on every text. Each cell of 'Predicted_label' holds the
# full result dict returned by predict_sentiment, not just the label string.
df['Predicted_label'] = df['Text'].map(predict_sentiment)



In [None]:
# Check the frame after adding the 'Predicted_label' column.
df.head()

Unnamed: 0,Text,Label,Comments,Predicted_label
0,"Excellent, happy to help! If you don't mind, c...",happiness,,"{'original sentence': 'Excellent, happy to hel..."
1,@DrabJay: excellent suggestion! Code changed. :-),happiness,,{'original sentence': '@DrabJay: excellent sug...
2,I didn't select an answer because even though ...,neutral,,{'original sentence': 'I didn't select an answ...
3,I have attached below,neutral,,"{'original sentence': 'I have attached below',..."
4,Excellent! Thank you for your perseverence :),happiness,,{'original sentence': 'Excellent! Thank you fo...


In [None]:
# Pull the winning label string out of each stored prediction dict.
df['Predicted_target'] = df['Predicted_label'].map(
    lambda result: result['predicted_label']
)

In [None]:
# Check the frame after adding the 'Predicted_target' column.
df.head()

Unnamed: 0,Text,Label,Comments,Predicted_label,Predicted_target
0,"Excellent, happy to help! If you don't mind, c...",happiness,,"{'original sentence': 'Excellent, happy to hel...",happiness
1,@DrabJay: excellent suggestion! Code changed. :-),happiness,,{'original sentence': '@DrabJay: excellent sug...,happiness
2,I didn't select an answer because even though ...,neutral,,{'original sentence': 'I didn't select an answ...,neutral
3,I have attached below,neutral,,"{'original sentence': 'I have attached below',...",neutral
4,Excellent! Thank you for your perseverence :),happiness,,{'original sentence': 'Excellent! Thank you fo...,happiness


In [None]:
# One-hot encode the predicted label in target_names order.
df['Predicted_target_vector'] = df['Predicted_target'].map(
    lambda predicted: [int(name == predicted) for name in target_names]
)

In [None]:
# One-hot encode the gold label in target_names order.
df['Label_vector'] = df['Label'].map(
    lambda gold: [int(name == gold) for name in target_names]
)

In [None]:
# Inspect rows at positions 140-149 with gold labels and predictions side by side.
df.iloc[140:150]

Unnamed: 0,Text,Label,Comments,Predicted_label,Predicted_target,Predicted_target_vector,Label_vector
140,As TraumaPony said. Simply load the main game ...,neutral,,{'original sentence': 'As TraumaPony said. Sim...,worry,"[0, 0, 1, 0, 0]","[0, 0, 0, 0, 1]"
141,Congratulations on the new job! Relax and keep...,happiness,,{'original sentence': 'Congratulations on the ...,happiness,"[1, 0, 0, 0, 0]","[1, 0, 0, 0, 0]"
142,Maybe it helps someone else: you can use www.b...,uncertainty,,{'original sentence': 'Maybe it helps someone ...,uncertainty,"[0, 0, 0, 1, 0]","[0, 0, 0, 1, 0]"
143,The solution is crying out for the MSMQ soluti...,sadness,,{'original sentence': 'The solution is crying ...,uncertainty,"[0, 0, 0, 1, 0]","[0, 1, 0, 0, 0]"
144,Sorting the hashes is also a possibility that ...,uncertainty,,{'original sentence': 'Sorting the hashes is a...,uncertainty,"[0, 0, 0, 1, 0]","[0, 0, 0, 1, 0]"
145,AT what stage are you populating the ListBox? ...,worry,,{'original sentence': 'AT what stage are you p...,worry,"[0, 0, 1, 0, 0]","[0, 0, 1, 0, 0]"
146,We ended up using a Java applet. It was a nigh...,sadness,,{'original sentence': 'We ended up using a Jav...,uncertainty,"[0, 0, 0, 1, 0]","[0, 1, 0, 0, 0]"
147,"As an aside, if you can use a pull parser and ...",worry,,"{'original sentence': 'As an aside, if you can...",uncertainty,"[0, 0, 0, 1, 0]","[0, 0, 1, 0, 0]"
148,"This is a solution to a local problem, not an ...",uncertainty,,{'original sentence': 'This is a solution to a...,uncertainty,"[0, 0, 0, 1, 0]","[0, 0, 0, 1, 0]"
149,I am planning to use community sever for one o...,worry,,{'original sentence': 'I am planning to use co...,uncertainty,"[0, 0, 0, 1, 0]","[0, 0, 1, 0, 0]"


In [None]:
# Full prediction dict (scores, processed text) stored for the row at position 19.
df.iloc[19]['Predicted_label']

{'confidence_scores': [('happiness', 0.0015935191),
  ('sadness', 0.0022074578),
  ('worry', 0.0013595837),
  ('uncertainty', 0.9929853),
  ('neutral', 0.0018540769)],
 'original sentence': 'Out of all that pseudocode, the only thing that really worries me is "extracts code samples from file". Reading files from a directory is trivial, saving a file is trivial. Regardless of the test framework I\'d spend most of my time focusing on the parsing bit. For direct testing, I\'d embed the snippets directly into the test case: Ah, I see another change I subtly made while writing the test: my ExamplesToCode.parse() returns an Array (or other iterable container), so that it can be tested apart from the iteration itself.',
 'predicted_label': 'uncertainty',
 'processed sentence': 'pseudocode thing really worries extracts code samples file reading files directory trivial saving file trivial regardless test framework spend time focusing parsing bit direct testing embed snippets directly test case 

In [None]:
# Per-class precision/recall/F1 of the predictions against the gold labels.
# `labels=` fixes both the row order and the row names. Passing the list only
# as `target_names=` would attach the names positionally to the alphabetically
# sorted labels and mislabel every row whose position differs.
print(classification_report(df['Label'], df['Predicted_target'], labels=target_names))

              precision    recall  f1-score   support

   happiness       0.97      0.71      0.82        55
     sadness       0.29      0.39      0.33        41
       worry       0.81      0.32      0.46        41
 uncertainty       0.56      0.77      0.65        57
     neutral       0.24      0.28      0.26        32

    accuracy                           0.54       226
   macro avg       0.58      0.49      0.50       226
weighted avg       0.61      0.54      0.54       226



In [None]:
# Prediction for a sentence passed in as a string literal.
predict_sentiment('Hey, I\'m new to this site. I think it is great! Okay, here\'s the deal. I just downloaded Smule Ocarina. I was wondering how they made it so you can upload a song to the cloud. I might have an app idea that might incorporate this. How would I do this? What would I need?')



{'confidence_scores': [('happiness', 0.008777087),
  ('sadness', 0.01174387),
  ('worry', 0.9754412),
  ('uncertainty', 0.0022863264),
  ('neutral', 0.0017515811)],
 'original sentence': "Hey, I'm new to this site. I think it is great! Okay, here's the deal. I just downloaded Smule Ocarina. I was wondering how they made it so you can upload a song to the cloud. I might have an app idea that might incorporate this. How would I do this? What would I need?",
 'predicted_label': 'worry',
 'processed sentence': 'hey new site think great okay deal downloaded smule ocarina wondering made upload song cloud might app idea might incorporate would would need'}

In [None]:
# Show the first ten rows where the prediction disagrees with the gold label.
# DataFrame.map is element-wise (and absent on older pandas), so indexing each
# cell with 'Predicted_label' cannot work; a boolean row filter is used instead.
# NOTE(review): the saved output looks like it came from a narrower filter
# (only 'uncertainty' predictions) — confirm the intended selection.
df[df['Label'] != df['Predicted_target']].head(10)

Unnamed: 0,Text,Label,Comments,Predicted_label,Predicted_target,Predicted_target_vector,Label_vector
19,"Out of all that pseudocode, the only thing tha...",worry,,{'original sentence': 'Out of all that pseudoc...,uncertainty,"[0, 0, 0, 1, 0]","[0, 0, 1, 0, 0]"
46,"Triggering quirks mode is a terrible, terrible...",sadness,,{'original sentence': 'Triggering quirks mode ...,uncertainty,"[0, 0, 0, 1, 0]","[0, 1, 0, 0, 0]"
50,That's the simplest query to return the result...,neutral,,{'original sentence': 'That's the simplest que...,uncertainty,"[0, 0, 0, 1, 0]","[0, 0, 0, 0, 1]"
59,"External sheet Link not applicable, private be...",neutral,,{'original sentence': 'External sheet Link not...,uncertainty,"[0, 0, 0, 1, 0]","[0, 0, 0, 0, 1]"
63,You need to implement the interface so that wi...,neutral,,{'original sentence': 'You need to implement t...,uncertainty,"[0, 0, 0, 1, 0]","[0, 0, 0, 0, 1]"
79,You need to make the following changes to your...,worry,,{'original sentence': 'You need to make the fo...,uncertainty,"[0, 0, 0, 1, 0]","[0, 0, 1, 0, 0]"
82,Now I think it is not possible in c++ (without...,sadness,,{'original sentence': 'Now I think it is not p...,uncertainty,"[0, 0, 0, 1, 0]","[0, 1, 0, 0, 0]"
110,I'm not good at programming and my trying to o...,worry,,{'original sentence': 'I'm not good at program...,uncertainty,"[0, 0, 0, 1, 0]","[0, 0, 1, 0, 0]"
111,"Would use double.TryParse, it has performance ...",neutral,,{'original sentence': 'Would use double.TryPar...,uncertainty,"[0, 0, 0, 1, 0]","[0, 0, 0, 0, 1]"
114,FlexeLint is a commercial product which has no...,neutral,,{'original sentence': 'FlexeLint is a commerci...,uncertainty,"[0, 0, 0, 1, 0]","[0, 0, 0, 0, 1]"
