In [1]:
! pip install transformers==4.10.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.10.1
  Downloading transformers-4.10.1-py3-none-any.whl (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m33.9 MB/s[0m eta

# **Data**

# Emo Event dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

emoevent_train_csv_path = '/content/drive/MyDrive/Colab Notebooks/datasets/emoevent_train.csv'
emoevent_dev_csv_path = '/content/drive/MyDrive/Colab Notebooks/datasets/emoevent_test.csv'

emoevent_train = pd.read_csv(emoevent_train_csv_path)
emoevent_dev = pd.read_csv(emoevent_dev_csv_path)

In [4]:
print('emoevent_train shape:', emoevent_train.shape)
emoevent_train['tweet'].head(10)

emoevent_train shape: (5112, 2)


0    What is one thing that you can not live withou...
1    Hahahhaha bells will be ringing to show solida...
2    The Compassionate Civilization Collaborative (...
3    They say much was spared, but the images of HA...
4    "I may be small. I may be a girl, but I won’t ...
5    HASHTAG Today we have begun the definitive pha...
6    "The highest education is that which does not ...
7    “Hi Friends! Lots of people are making comment...
8    HASHTAG election: socialist party HASHTAG decl...
9    THAT EPISODE WAS FUCKING EVERYTHING... 🤯🤯🤯🤯🤯 H...
Name: tweet, dtype: object

In [5]:
emoevent_train['emotion'].head(10)

0    4
1    0
2    2
3    5
4    4
5    3
6    4
7    4
8    4
9    3
Name: emotion, dtype: int64

In [6]:
X_train = emoevent_train['tweet']
y_train = emoevent_train['emotion']

X_test = emoevent_dev['tweet']
y_test = emoevent_dev['emotion']

In [7]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5112,)
(5112,)
(2191,)
(2191,)


In [8]:
# label: 'sadness': 0, 'neutral': 6, 'fear': 2, 'anger': 3, 'disgust': 4, 'surprise': 5, 'joy': 6
y_train.value_counts()

4    2313
3    1427
1     536
5     291
0     274
6     165
2     106
Name: emotion, dtype: int64

In [9]:
y_test.value_counts()

4    992
3    612
1    229
5    125
0    118
6     70
2     45
Name: emotion, dtype: int64

# **Model**


In [10]:
# Fit and transform X_train using Tfidf Vectorizer with default parameters
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
print('tfidf train shape:', X_train_tfidf.shape)
print('tfidf train type:', X_train_tfidf.dtype)

X_test_tfidf = vectorizer.transform(X_test).toarray()
print('tfidf test:', X_test_tfidf.shape)


tfidf train shape: (5112, 9912)
tfidf train type: float64
tfidf test: (2191, 9912)


In [11]:
from sklearn.svm import LinearSVC
clf = LinearSVC().fit(X_train_tfidf, y_train)

from sklearn import metrics
predicted = clf.predict(X_test_tfidf)

acc = metrics.accuracy_score(y_test, predicted)
print('accuracy is: ', acc*100)

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix,classification_report
print(confusion_matrix(y_test,predicted))
print('\n')
print(classification_report(y_test,predicted))

accuracy is:  53.40027384755819
[[  5  21   0   6  83   2   1]
 [ 14  48   2  21 138   4   2]
 [  1   3   1   3  36   1   0]
 [  0   7   0 354 239   8   4]
 [ 19  47   3 194 712  15   2]
 [  4   8   0  12  53  47   1]
 [  1   3   0  15  47   1   3]]


              precision    recall  f1-score   support

           0       0.11      0.04      0.06       118
           1       0.35      0.21      0.26       229
           2       0.17      0.02      0.04        45
           3       0.59      0.58      0.58       612
           4       0.54      0.72      0.62       992
           5       0.60      0.38      0.46       125
           6       0.23      0.04      0.07        70

    accuracy                           0.53      2191
   macro avg       0.37      0.28      0.30      2191
weighted avg       0.50      0.53      0.50      2191



# Augmentation using Bertarg

In [12]:
! pip install nlpaug==1.1.7

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nlpaug==1.1.7
  Downloading nlpaug-1.1.7-py3-none-any.whl (405 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m405.1/405.1 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.7


In [13]:
# apply augmentation to the train data and save the results into a file
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf

from nlpaug.util import Action
import nlpaug.flow as naf

aug_bert = naf.Sequential([naf.Sometimes([
    naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert", device ='cuda')]),
    naf.Sometimes([naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute", device ='cuda')
])])

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [15]:
from tqdm.notebook import tqdm
tqdm.pandas()

data = {'text': X_train, 'label': y_train}
base_train = pd.DataFrame.from_dict(data)
base_train.shape

(5112, 2)

In [16]:
rep = 5 # how many repetitions of the record


bert_train = base_train.copy()
bert_train['paraphrase'] = bert_train['text'].progress_apply(lambda x:aug_bert.augment(x, rep))
bert_train = bert_train.explode('paraphrase').reset_index(drop=True)


  0%|          | 0/5112 [00:00<?, ?it/s]

In [17]:
bert_train.head(20)

Unnamed: 0,text,label,paraphrase
0,What is one thing that you can not live withou...,4,his help will one thing gone that children can...
1,What is one thing that you can not live withou...,4,now! one thing that you can not part without? ...
2,What is one thing that you can not live withou...,4,what is which one thing. that nowadays you can...
3,What is one thing that you can not live withou...,4,what be h truly 1 rare thing that you can not ...
4,What is one thing that you can not live withou...,4,what is essentially one thing that you can rat...
5,Hahahhaha bells will be ringing to show solida...,0,& hahahhaha bells will absolutely be down ring...
6,Hahahhaha bells will be ringing to show solida...,0,hahahhaha bells would say ringing to show soli...
7,Hahahhaha bells will be ringing to show solida...,0,what hahahhaha bells will truly be ringing to ...
8,Hahahhaha bells will be ringing to show solida...,0,hahahhaha done up? will be ringing to show sol...
9,Hahahhaha bells will be ringing to show solida...,0,more bells will be ringing to show solidarity ...


# Model with augmented train data (paraphrased data)

In [18]:
# Fit and transform X_train using Tfidf Vectorizer with default parameters
from sklearn.feature_extraction.text import TfidfVectorizer

# !! we should concatenate original train with paraphrased train
new_data = X_train.append(bert_train['paraphrase'])

vectorizer = TfidfVectorizer() #max_features=3000)
X_train_tfidf = vectorizer.fit_transform(new_data).toarray() #bert_train['paraphrase']).toarray()
print('tfidf train shape:', X_train_tfidf.shape)
print('tfidf train type:', X_train_tfidf.dtype)

X_test_tfidf = vectorizer.transform(X_test).toarray()
print('tfidf test:', X_test_tfidf.shape)


  new_data = X_train.append(bert_train['paraphrase'])


tfidf train shape: (30672, 16490)
tfidf train type: float64
tfidf test: (2191, 16490)


In [19]:
from sklearn.svm import LinearSVC

# !! we should concatenate original labels with paraphrased text labels
new_labels = y_train.append(bert_train['label'])

clf = LinearSVC().fit(X_train_tfidf, new_labels) # bert_train['label'])

from sklearn import metrics
predicted = clf.predict(X_test_tfidf)

acc = metrics.accuracy_score(y_test, predicted)
print('accuracy is: ', acc*100)

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix,classification_report
print(confusion_matrix(y_test,predicted))
print('\n')
print(classification_report(y_test,predicted))

  new_labels = y_train.append(bert_train['label'])


accuracy is:  50.52487448653583
[[  9  31   2   9  63   3   1]
 [ 17  43   4  20 136   8   1]
 [  0   2   3   1  37   2   0]
 [  4   7   0 350 238   8   5]
 [ 25  62   8 218 642  23  14]
 [  4   9   0  10  47  54   1]
 [  5   3   0  18  36   2   6]]


              precision    recall  f1-score   support

           0       0.14      0.08      0.10       118
           1       0.27      0.19      0.22       229
           2       0.18      0.07      0.10        45
           3       0.56      0.57      0.57       612
           4       0.54      0.65      0.59       992
           5       0.54      0.43      0.48       125
           6       0.21      0.09      0.12        70

    accuracy                           0.51      2191
   macro avg       0.35      0.30      0.31      2191
weighted avg       0.48      0.51      0.49      2191



# Create CSV file from augmented dataset

In [22]:
!cd '/content/drive/My Drive/Colab Notebooks/NLP Augmentation/Datasets/'
bert_train.to_csv('/content/drive/My Drive/Colab Notebooks/datasets/emoevent_BertAug_5.csv', encoding='utf-8', index=False, sep=',')

/bin/bash: line 0: cd: /content/drive/My Drive/Colab Notebooks/NLP Augmentation/Datasets/: No such file or directory
