In [None]:
# Mount Google Drive; every data/model path used below lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install ktrain

Collecting ktrain
[?25l  Downloading https://files.pythonhosted.org/packages/99/67/31cab9d7c0e23333aebc28b082659c1528f9ab7e22d00e7237efe4fc14f6/ktrain-0.26.2.tar.gz (25.3MB)
[K     |████████████████████████████████| 25.3MB 128kB/s 
[?25hCollecting scikit-learn==0.23.2
[?25l  Downloading https://files.pythonhosted.org/packages/f4/cb/64623369f348e9bfb29ff898a57ac7c91ed4921f228e9726546614d63ccb/scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8MB)
[K     |████████████████████████████████| 6.8MB 23.5MB/s 
Collecting langdetect
[?25l  Downloading https://files.pythonhosted.org/packages/56/a3/8407c1e62d5980188b4acc45ef3d94b933d14a2ebc9ef3505f22cf772570/langdetect-1.0.8.tar.gz (981kB)
[K     |████████████████████████████████| 983kB 58.4MB/s 
Collecting cchardet
[?25l  Downloading https://files.pythonhosted.org/packages/80/72/a4fba7559978de00cf44081c548c5d294bf00ac7dcda2db405d2baa8c67a/cchardet-2.1.7-cp37-cp37m-manylinux2010_x86_64.whl (263kB)
[K     |██████████████████████████

In [None]:
import pandas as pd
import numpy as np

import ktrain
from ktrain import text

## 1. Import Data

In [None]:
# Load the pre-split emotion dataset; both CSVs provide a `Text` and an `Emotion` column.
data_train = pd.read_csv('/content/drive/MyDrive/nlp-text-emotion/data/data_train.csv', encoding='utf-8')
data_test = pd.read_csv('/content/drive/MyDrive/nlp-text-emotion/data/data_test.csv', encoding='utf-8')

# Raw texts and string labels as plain Python lists.
X_train = data_train.Text.tolist()
X_test = data_test.Text.tolist()

y_train = data_train.Emotion.tolist()
y_test = data_test.Emotion.tolist()

# Combined frame, used only for the class-balance summary and preview below.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
data = pd.concat([data_train, data_test], ignore_index=True)

# The order of this list matches the integer codes assigned in `encoding`.
class_names = ['joy', 'sadness', 'fear', 'anger', 'neutral']

print('size of training set: %s' % (len(data_train['Text'])))
print('size of validation set: %s' % (len(data_test['Text'])))
print(data.Emotion.value_counts())

data.head(10)

size of training set: 7934
size of validation set: 3393
joy        2326
sadness    2317
anger      2259
neutral    2254
fear       2171
Name: Emotion, dtype: int64


Unnamed: 0,Emotion,Text
0,neutral,There are tons of other paintings that I thin...
1,sadness,"Yet the dog had grown old and less capable , a..."
2,fear,When I get into the tube or the train without ...
3,fear,This last may be a source of considerable disq...
4,anger,She disliked the intimacy he showed towards so...
5,sadness,When my family heard that my Mother's cousin w...
6,joy,Finding out I am chosen to collect norms for C...
7,anger,A spokesperson said : ` Glen is furious that t...
8,neutral,Yes .
9,sadness,"When I see people with burns I feel sad, actua..."


In [None]:
# Integer code for each emotion label; the codes follow the order of `class_names`.
encoding = {
    'joy': 0,
    'sadness': 1,
    'fear': 2,
    'anger': 3,
    'neutral': 4
}

# Integer values for each class.
# Encode from the source frames instead of rebinding y_train / y_test from themselves:
# the cell then yields the same result however many times it is run, whereas the
# self-referential form fails on a second run because the values are no longer labels.
y_train = [encoding[label] for label in data_train.Emotion]
y_test = [encoding[label] for label in data_test.Emotion]

## 2. Data preprocessing

* The text must be preprocessed in a specific way for use with BERT. This is accomplished by setting `preprocess_mode` to `'bert'`. The BERT model and vocabulary will be downloaded automatically.

* BERT can handle a maximum sequence length of 512 tokens, but we use a shorter `maxlen` (350) to reduce memory usage and improve speed.

In [None]:
# Run ktrain's BERT preprocessing over the raw texts and integer labels.
# The call returns the processed train split, the processed test split and the
# preprocessor object that is reused when building the model and the predictor.
train_split, test_split, preproc = text.texts_from_array(
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    class_names=class_names,
    preprocess_mode='bert',
    maxlen=350,
    max_features=35000,
)
x_train, y_train = train_split
x_test, y_test = test_split

downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


task: text classification


## 3. Training and validation


Load the pretrained BERT model for text classification.

In [None]:
# Build the BERT classifier sized for the preprocessed training data.
model = text.text_classifier(
    'bert',
    train_data=(x_train, y_train),
    preproc=preproc,
)

Is Multi-Label? False
maxlen is 350
done.


Wrap it in a Learner object

In [None]:
# Pair the model with its training and validation data in a ktrain Learner.
learner = ktrain.get_learner(
    model,
    train_data=(x_train, y_train),
    val_data=(x_test, y_test),
    batch_size=6,
)

In [None]:
# Train with the one-cycle policy: maximum learning rate 2e-5 for 3 epochs.
max_lr, n_epochs = 2e-5, 3
learner.fit_onecycle(max_lr, n_epochs)



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f95c524dbd0>

Validation

In [None]:
# Per-class precision/recall/F1 report and confusion matrix on the held-out split.
learner.validate(
    val_data=(x_test, y_test),
    class_names=class_names,
)

              precision    recall  f1-score   support

         joy       0.86      0.84      0.85       707
     sadness       0.80      0.80      0.80       676
        fear       0.85      0.84      0.84       679
       anger       0.79      0.80      0.79       693
     neutral       0.80      0.83      0.82       638

    accuracy                           0.82      3393
   macro avg       0.82      0.82      0.82      3393
weighted avg       0.82      0.82      0.82      3393



array([[592,  15,  21,  15,  64],
       [ 16, 540,  35,  60,  25],
       [ 20,  37, 568,  41,  13],
       [ 19,  62,  30, 554,  28],
       [ 39,  22,  12,  34, 531]])

#### Testing with other inputs

In [None]:
# Build the predictor from the model fine-tuned above together with its preprocessor.
# Loading a predictor from Drive at this point would discard the training done in this
# notebook and would fail on a fresh run where no model has been saved yet; the save
# cell further down persists this predictor for later sessions.
# To reuse an already saved model instead of retraining, call ktrain.load_predictor()
# on the saved model directory.
predictor = ktrain.get_predictor(learner.model, preproc)
predictor.get_classes()

['joy', 'sadness', 'fear', 'anger', 'neutral']

In [None]:
import time

# Classify a single ad-hoc message and report the label with the elapsed seconds.
message = 'I am good'

# perf_counter() is a monotonic, high-resolution clock intended for measuring short
# intervals; time.time() follows the wall clock and can jump if the clock is adjusted.
start_time = time.perf_counter()
prediction = predictor.predict(message)
elapsed_seconds = time.perf_counter() - start_time

print('predicted: {} ({:.2f})'.format(prediction, elapsed_seconds))

predicted: neutral (0.09)


## 4. Saving the BERT model


In [None]:
# Persist the predictor to Drive so a later session can reload it.
model_dir = "/content/drive/MyDrive/nlp-text-emotion/models/bert_model"
predictor.save(model_dir)

In [None]:
# Load the tweets to be labelled, using the `id` column as the row index.
# NOTE(review): the ids display as floats (e.g. 1.374311e+18) in the preview, so the
# exact tweet ids may not survive parsing -- confirm whether they are needed downstream.
df = pd.read_csv(
    '/content/drive/MyDrive/nlp-text-emotion/data/tweetsCopy.csv',
    index_col='id',
)

In [None]:
# Preview the first five tweets.
df.head(n=5)

Unnamed: 0_level_0,Author,Text,User Location,Country_Code,Tweet Country,created_at,Unnamed: 7
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.374311e+18,johncorden47,: If you care for a loved one and you're elig...,coventry,,,2021-03-23 10:45:31,
1.374311e+18,asiatimesonline,"Rapid spread of new Covid-19 variants, snail-p...",Asia,,,2021-03-23 10:45:32,
1.374311e+18,Imperial_Stats,A year in review: Imperial's COVID-19 Response...,"London, GBR",GBR,,2021-03-23 10:45:32,
1.374311e+18,PakinUSA,Rep. Tom Souzzi lauded the contributions of th...,"Washington, D.C.",USA,,2021-03-23 10:45:32,
1.374311e+18,KeaMotlokwa,": Truth be told, COVID-19 took so much away f...",South Africa,,,2021-03-23 10:45:33,


In [None]:
# Remove the stray 'Unnamed: 7' column picked up from the CSV.
# A non-mutating drop with errors='ignore' keeps this cell safe to re-run: the in-place
# form raises KeyError the second time because the column is already gone.
df = df.drop(columns='Unnamed: 7', errors='ignore')

In [None]:
# Label every tweet with its predicted emotion.
# Columns are addressed by name rather than by position, so the result does not depend
# on whether the 'Unnamed: 7' column has been dropped beforehand. All texts go to the
# predictor in a single call instead of one predict() call per row.
# str() mirrors the per-row conversion used before (missing text becomes the string 'nan').
tweet_texts = [str(tweet) for tweet in df['Text']]

# Skip the predictor entirely for an empty frame; the column is still created.
df['Emotion'] = predictor.predict(tweet_texts) if tweet_texts else []

In [None]:
# Write the tweets with their string emotion labels.
# index=False leaves the `id` index out of the written file.
labelled_csv_path = "/content/drive/MyDrive/nlp-text-emotion/data/tweets_withEmotions.csv"
df.to_csv(labelled_csv_path, index=False)

In [None]:
# Replace each string emotion label with its integer code from `encoding`.
# The column is addressed by name (not by position) and converted in one vectorised
# pass instead of a per-row scalar assignment.
emotion_labels = df['Emotion']
emotion_codes = emotion_labels.map(encoding)

# map() yields NaN for values missing from `encoding`; fail with KeyError, as a direct
# dictionary lookup would, rather than writing NaN codes (this also triggers when the
# cell is re-run on a column that is already encoded).
unknown_labels = set(emotion_labels[emotion_codes.isna().to_numpy()])
if unknown_labels:
    raise KeyError('no integer code for emotion label(s): {}'.format(unknown_labels))

# Assign by position so a non-unique `id` index cannot interfere with alignment.
df['Emotion'] = emotion_codes.astype(int).to_numpy()

In [None]:
# Write the tweets with integer emotion codes.
# index=False leaves the `id` index out of the written file.
categorized_csv_path = "/content/drive/MyDrive/nlp-text-emotion/data/tweets_Categorized.csv"
df.to_csv(categorized_csv_path, index=False)

In [None]:
# Show the final frame; the Emotion column now holds the integer codes from `encoding`.
df

Unnamed: 0_level_0,Author,Text,User Location,Country_Code,Tweet Country,created_at,Emotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.374311e+18,johncorden47,: If you care for a loved one and you're elig...,coventry,,,2021-03-23 10:45:31,0
1.374311e+18,asiatimesonline,"Rapid spread of new Covid-19 variants, snail-p...",Asia,,,2021-03-23 10:45:32,1
1.374311e+18,Imperial_Stats,A year in review: Imperial's COVID-19 Response...,"London, GBR",GBR,,2021-03-23 10:45:32,2
1.374311e+18,PakinUSA,Rep. Tom Souzzi lauded the contributions of th...,"Washington, D.C.",USA,,2021-03-23 10:45:32,0
1.374311e+18,KeaMotlokwa,": Truth be told, COVID-19 took so much away f...",South Africa,,,2021-03-23 10:45:33,1
...,...,...,...,...,...,...,...
1.374324e+18,KasieeMarie,: #BREAKING: Minnesota reports zero new COVID...,"St Paul, MN",,,2021-03-23 11:34:42,2
1.374324e+18,praxpriya,: This is how Aghori Babas and some ancient B...,,PAK,,2021-03-23 11:34:42,2
1.374324e+18,YahyaKh08409321,: US President congratulated President &amp,Pakistani Nation on #PakistanResolutionDay. H...,PAK,,,2
1.374324e+18,MyNamesMiller,: Always heartwarming to watch this governmen...,"England, United Kingdom",PAK,,2021-03-23 11:34:43,0
