## 1. Data Preparation

### 1.1 Load data

In [1]:
import os

# select your GPU. Note that this should be set before you load tensorflow or pytorch.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# To use multiple GPUs, combine all GPU ID with commas
# e.g. >>> os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,3'

import torch
# Check if any GPU is used
torch.cuda.is_available()

True

In [70]:
# import library
import re
import math
import string
import numpy as np 
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
from gensim.models import KeyedVectors
import data_mining_helpers as dmh

# sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, classification_report

# nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

# keras
# preprocess
import keras
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

print("gensim: " + gensim.__version__)
print("tensorflow: " + tf.__version__)
print("keras: " + keras.__version__)

gensim: 4.1.2
tensorflow: 2.7.0
keras: 2.7.0


In [2]:
import pandas as pd

### training data
data_identification = pd.read_csv("data_identification.csv") #1867535 rows
emotion = pd.read_csv("emotion.csv") #1455563 rows
sampleSubmission = pd.read_csv("sampleSubmission.csv") #411972 rows

In [3]:
import json
f = open('tweets_DM.json','r', encoding='utf')
data = []
for line in f.readlines():
    dic = json.loads(line)
    data.append(dic)
print(data[0])

{'_score': 391, '_index': 'hashtag_tweets', '_source': {'tweet': {'hashtags': ['Snapchat'], 'tweet_id': '0x376b20', 'text': 'People who post "add me on #Snapchat" must be dehydrated. Cuz man.... that\'s <LH>'}}, '_crawldate': '2015-05-23 11:42:47', '_type': 'tweets'}


In [4]:
dic = {
    'tweet_id':[],
    'text':[]
}
for d in data:
    for key in dic.keys():
        dic[key].append(d['_source']['tweet'][key])

tweet_df = pd.DataFrame(dic)
tweet_df.head() #1867535 rows × 2 columns

Unnamed: 0,tweet_id,text
0,0x376b20,"People who post ""add me on #Snapchat"" must be ..."
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #..."
2,0x28b412,"Confident of your obedience, I write to you, k..."
3,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>
4,0x2de201,"""Trust is not the same as faith. A friend is s..."


In [35]:
## split into train, test dataset

df = pd.merge(tweet_df, data_identification)
train = df[df['identification']=='train']
test = df[df['identification']=='test']
train = pd.merge(train, emotion)

In [6]:
## check emotion distribution
train['emotion'].value_counts()/len(train)

joy             0.354514
anticipation    0.171023
trust           0.141167
sadness         0.132895
disgust         0.095565
fear            0.043969
surprise        0.033478
anger           0.027389
Name: emotion, dtype: float64

### 1.2 Save data
The pickle module implements binary protocols for serializing and de-serializing a Python object structure.

In [None]:
## save to pickle file
train.to_pickle("train.pkl") 
test.to_pickle("test.pkl")

In [73]:
import pandas as pd

## load a pickle file
train = pd.read_pickle("train.pkl")
test = pd.read_pickle("test.pkl")

## 2. Feature engineering

### 2.1 Text preprocessing

nltk.TweetTokenizer keeps hashtags intact while nltk.word_tokenize doesn't.

In [17]:
# import custom emoji to adjectives about emotions mappings
emoji_dict = dmh.emoji
emojis = emoji_dict.keys()

# import custom frequent name to adjectives about emotions mappings
frequent_name_dict = dmh.frequent_name_dict
frequent_names = frequent_name_dict.keys()

In [18]:
import re
import nltk
import string
from nltk.tokenize import TweetTokenizer

tweet_tokenizer = TweetTokenizer(reduce_len=True)

def lemma_text(text):
    text = text.lower()
    grams = nltk.word_tokenize(text)
    grams = ["" if w == "useruser" else w for w in grams]
    grams = ["" if w == "hashtaghashtag" else w for w in grams]
    text = ' '.join(grams)
    
    return text

def text_preprocess(df):
    df['text'] = df['text'].apply(lambda s : s.replace('',''))
    df['tmp'] = df['text'].apply(lambda x: tweet_tokenizer.tokenize(x))
    
    # replace frequent name  with adjectives
    df['tmp'] = df['tmp'].apply(lambda u : [frequent_name_dict[w] if w in frequent_names else w for w in u])
    
    # replace @username with 
    p = re.compile('@*')
    df['tmp'] = df['tmp'].apply(lambda u : ["useruser" if p.match(w).span() != (0, 0) else w for w in u])
    
    # replace #hashtag with 
    p = re.compile('#*')
    df['tmp'] = df['tmp'].apply(lambda u : ["hashtaghashtag " + w[1:] if p.match(w).span() != (0, 0) else w for w in u])

    # replace common emojis with adjectives
    df['tmp'] = df['tmp'].apply(lambda u : [emoji_dict[w] if w in emojis else w for w in u])
        
    df['clean_text'] = df['tmp'].apply(lambda x : ' '.join(x)) 
    df.drop(['tmp'], axis=1, inplace=True)  
    df['clean_text'] = df['clean_text'].apply(lemma_text)
    
    # remove digits
    df['clean_text'] = df['clean_text'].str.replace('\d+', '') 

In [None]:
text_preprocess(train)
train.head()

In [14]:
text_preprocess(test)
test.head()

Unnamed: 0,tweet_id,text,identification,clean_text
2,0x28b412,"Confident of your obedience, I write to you, k...",test,"confident of your obedience , i write to you ,..."
4,0x2de201,"""Trust is not the same as faith. A friend is s...",test,`` trust is not the same as faith . a friend i...
9,0x218443,When do you have enough ? When are you satisfi...,test,when do you have enough ? when are you satisfi...
30,0x2939d5,"God woke you up, now chase the day #GodsPlan #...",test,"god woke you up , now chase the day godsplan ..."
33,0x26289a,"In these tough times, who do YOU turn to as yo...",test,"in these tough times , who do you turn to as y..."


In [15]:
## save to pickle file
train.to_pickle("new_preprocess_train.pkl") 
test.to_pickle("new_preprocess_train.pkl")

In [3]:
## load a pickle file
import pandas as pd
train = pd.read_pickle("new_preprocess_train.pkl")
train.head()

Unnamed: 0,tweet_id,text,identification,emotion,clean_text
0,0x376b20,"People who post ""add me on #Snapchat"" must be ...",train,anticipation,people who post `` add me on snapchat `` must...
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",train,sadness,"as we see , trump is dangerous to freepress ..."
2,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,train,fear,now issa is stalking tasha < lolface > < lolfa...
3,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,train,joy,thx for the best time tonight . what stories...
4,0x2c91a8,Still waiting on those supplies Liscus. <LH>,train,anticipation,still waiting on those supplies liscus . < lh >


In [4]:
## load a pickle file
import pandas as pd
test = pd.read_pickle("new_preprocess_test.pkl")
test.head()

Unnamed: 0,tweet_id,text,identification,clean_text
2,0x28b412,"Confident of your obedience, I write to you, k...",test,"confident of your obedience , i write to you ,..."
4,0x2de201,"""Trust is not the same as faith. A friend is s...",test,`` trust is not the same as faith . a friend i...
9,0x218443,When do you have enough ? When are you satisfi...,test,when do you have enough ? when are you satisfi...
30,0x2939d5,"God woke you up, now chase the day #GodsPlan #...",test,"god woke you up , now chase the day godsplan ..."
33,0x26289a,"In these tough times, who do YOU turn to as yo...",test,"in these tough times , who do you turn to as y..."


In [5]:
## IMPOTANT!!!! Make sure the test index is correct
test = pd.merge(sampleSubmission, test, left_on='id', right_on='tweet_id')
test = test[['id', 'text', 'clean_text']]
test.head()

Unnamed: 0,id,text,clean_text
0,0x2c7743,When your friends offer to bring you food 😭💘 #...,when your friends offer to bring you food bawl...
1,0x2c1eed,I've never let any money problems stop me.. I ...,i 've never let any money problems stop me .. ...
2,0x2826ea,@KurtSchlichter Being a Hobby Historian Chelse...,"being a hobby historian chelsea , let me say ..."
3,0x356d9a,#Cannabis offers a natural alternative treatme...,cannabis offers a natural alternative treatme...
4,0x20fd95,Last Friday off before school starts. I'm read...,last friday off before school starts . i 'm re...


## 3. Model

### 3.1.1 Load pre-train word embeddings model

In [13]:
! gunzip glove_twitter_27B_200d_txt.gz

In [91]:
import time
from gensim.models import KeyedVectors

# GloVe twitter
start = time.time()
#model_path = "data/glove.twitter.27B/word2vec.twitter.27B.200d.txt"
w2v_twitter_200_model = KeyedVectors.load_word2vec_format('glove_twitter_27B_200d_txt', binary=False)

print('load ok')
print(f'time : {time.time() - start} sec')

load ok
time : 91.3127691745758 sec


In [19]:
vector = w2v_twitter_200_model['language']  # Get numpy vector of a word
print(vector)

[-1.1019e-02 -8.4930e-02 -3.3063e-01  1.1144e-01 -7.9505e-01  1.8700e-01
  4.3807e-01  3.5574e-01 -1.6642e-01 -7.2243e-02 -3.4030e-01 -9.5610e-02
 -9.8300e-01  5.4471e-01 -7.0435e-02 -4.9007e-02  4.2244e-02  5.0077e-02
  1.7844e-01 -1.5049e+00  4.7136e-01  1.2922e-02  1.5759e-01  3.7176e-01
  2.5396e-01  1.1585e+00 -4.7953e-01  3.4519e-02 -7.7085e-02  1.6277e-01
  4.4839e-01  1.3274e-01 -3.8485e-02  6.3778e-02  3.3766e-01 -8.9009e-02
  2.9327e-02  5.4011e-01  1.1109e+00  1.3682e-01  4.4503e-01  2.4157e-01
  1.2933e-01 -1.2803e-01  4.7938e-01  3.8640e-02  2.9838e-01  6.6712e-01
  1.9765e-01 -7.8086e-01 -1.2398e-01  4.5755e-01 -9.3772e-02 -5.4343e-01
 -2.6566e-01 -2.9321e-01 -1.5474e-01  8.1505e-02 -3.5291e-01 -4.4827e-01
 -1.0661e-01 -2.2966e-01  1.6354e-01  5.0833e-02  1.7096e-02 -3.1755e-02
 -3.9022e-01  9.8955e-01  1.5476e-03  5.3068e-01 -3.2936e-01  1.1219e-01
 -3.1221e-01 -3.1554e-01  3.1378e-01 -3.1787e-01  5.6004e-01  7.0350e-01
  3.4807e-01  3.9264e-01 -2.2543e-01  1.6285e-01  9

In [20]:
#### print top 5 most similar words to "school"
w2v_twitter_200_model.most_similar('school', topn=5)

[('college', 0.796126127243042),
 ('class', 0.761738657951355),
 ('tomorrow', 0.7286202311515808),
 ('high', 0.7116101384162903),
 ('kids', 0.7063326239585876)]

### 3.1.2 Data preparation

In [6]:
MAX = 50000
MAX_SEQUENCE_LENGTH = 30
EMBEDDING_DIM = 200

In [7]:
## Give each word a unique number.

from tensorflow.keras.preprocessing.text import Tokenizer

tok = Tokenizer(num_words=MAX)
tok.fit_on_texts(pd.concat([train,test],ignore_index=True)['clean_text'])
#tok.fit_on_texts(train['clean_text'].values) 
print(len(tok.word_index))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  tok.fit_on_texts(pd.concat([train,test],ignore_index=True)['clean_text'])


508123


In [8]:
## Convert phrases into numbers.

train_encoded_phrase = tok.texts_to_sequences(train['clean_text'])
test_encoded_phrase = tok.texts_to_sequences(test['clean_text'])
len(max(train_encoded_phrase, key=len)), len(max(test_encoded_phrase, key=len))

(109, 95)

In [9]:
## Padding

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train = tf.keras.preprocessing.sequence.pad_sequences(train_encoded_phrase, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
X_test = tf.keras.preprocessing.sequence.pad_sequences(test_encoded_phrase, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

print(X_train.shape, X_test.shape) #( ,MAX_SEQUENCE_LENGTH)

(1455563, 30) (411972, 30)


In [12]:
## One hot encode the labels
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

label_encoder = LabelEncoder()
enc = label_encoder.fit_transform(train['emotion'])
y_train = to_categorical(enc)

print(y_train.shape, y_train[:5])

(1455563, 8) [[0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]]


In [None]:
## Creating the embedding matrix
import numpy as np
from time import sleep
from tqdm import tqdm

word_model = w2v_twitter_200_model
vocab_size = len(tok.word_index) + 1

embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in tok.word_index.items():
    if word in word_model.index_to_key:  
        # The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
        # Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
        embedding_matrix[i] = word_model[word]
    #else continue

In [13]:
## Split training data into train and validation

from sklearn.model_selection import train_test_split
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2,random_state=42)
print(X_train.shape, y_train.shape) #( ,30), ( ,8)
print(X_val.shape, y_val.shape)     #( ,30), ( ,8)

(1164450, 30) (1164450, 8)
(291113, 30) (291113, 8)


In [29]:
np.save('new_embedding_matrix.npy', embedding_matrix)

In [15]:
import numpy as np
loaded_embedding_matrix = np.load('new_embedding_matrix.npy')

In [17]:
loaded_embedding_matrix.shape #(len(word_index) 508123+1, EMBEDDING_DIM 200) 

(508124, 200)

### 3.1.3 Classification

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense , Embedding, LSTM, ReLU, Dropout, Bidirectional ,Flatten #,  Convolution2D
from tensorflow.keras.initializers import Constant
from tensorflow.keras.optimizers import RMSprop, Adam

vocab_size = len(tok.word_index) + 1
embedding_layer = Embedding(vocab_size, 
                            EMBEDDING_DIM, 
                            weights=[loaded_embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH, 
                            trainable=False)


model=Sequential()
model.add(embedding_layer)
#model.add(Embedding(input_dim=vocab_size,
#                    output_dim=EMBEDDING_DIM,
#                    input_length=MAX_SEQUENCE_LENGTH,
#                    embeddings_initializer=Constant(loaded_embedding_matrix))) 
hidden_nodes = 192
#model.add(CuDNNLSTM(hidden_nodes))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))
#model.add(LSTM(64,return_sequences=False))
model.add(Flatten())
model.add(Dense(8,activation='sigmoid')) 
#model.add(Dense(8,activation='softmax')) 
#model.compile(optimizer=RMSprop(learning_rate=1e-3),loss='categorical_crossentropy',metrics=['accuracy'])
model.compile(optimizer=Adam(learning_rate=0.01),loss='categorical_crossentropy', metrics=['accuracy'])

In [21]:
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 30, 200)           101624800 
                                                                 
 bidirectional_4 (Bidirectio  (None, 30, 128)          135680    
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 flatten_2 (Flatten)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 8)                 1032      
                                                                 
Total params: 101,860,328
Trainable params: 235,528
No

In [22]:
## train

epochs = 8
batch_size = 32

model.fit(X_train,y_train,validation_data=(X_val,y_val), epochs=epochs, batch_size=batch_size)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f9c94374e20>

### 3.1.4 Evaluation

In [23]:
## predict on validation set

pred = model.predict(X_val, batch_size=32)
print(pred)

[[2.51324065e-02 2.06384599e-01 9.88324136e-02 ... 1.99031815e-01
  1.29811347e-01 3.09721053e-01]
 [1.25599012e-01 4.47985113e-01 3.14357132e-01 ... 3.67616117e-01
  2.46505499e-01 4.41577822e-01]
 [1.41671836e-01 4.95341837e-01 2.84560323e-01 ... 4.43956703e-01
  2.25281477e-01 3.70145708e-01]
 ...
 [2.12457046e-01 1.14287704e-01 3.62575322e-01 ... 5.45460522e-01
  1.10623114e-01 1.72504291e-01]
 [5.19969035e-04 1.88773144e-02 3.09448666e-03 ... 4.93540149e-03
  2.29004095e-03 1.78789068e-02]
 [2.03558281e-01 3.92129421e-01 5.46513438e-01 ... 6.72288358e-01
  3.34591180e-01 2.94853836e-01]]


In [24]:
# decode prediction results into labels
def label_decode(le, one_hot_label):
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)

y_pred = label_decode(label_encoder, pred)

In [28]:
y_val = label_decode(label_encoder, y_val)
print(y_val)

['surprise' 'joy' 'joy' ... 'sadness' 'joy' 'disgust']


In [29]:
# calculate accuracy

correct_count = 0
for idx, emotion in enumerate(y_pred):
    if emotion == y_val[idx]:
        correct_count += 1
print(len(y_val), len(y_pred))
print(correct_count/len(y_pred))

291113 291113
0.5586353065648048


### 3.1.5 Prediction on public test data

In [30]:
## predict

pred = model.predict(X_test, batch_size=32)
print(pred)

[[0.01991662 0.23562595 0.04275774 ... 0.14225954 0.05078214 0.57360375]
 [0.02553182 0.4316085  0.08516142 ... 0.18353505 0.09098322 0.41320962]
 [0.09122821 0.06337459 0.545122   ... 0.55441225 0.13822158 0.09522866]
 ...
 [0.2049925  0.5078567  0.51867694 ... 0.5815341  0.28922695 0.2856254 ]
 [0.0320158  0.58817166 0.10578525 ... 0.16094169 0.10285322 0.59728754]
 [0.17390262 0.24875572 0.28943646 ... 0.39268878 0.13297367 0.20841748]]


In [31]:
# decode prediction results into labels
y_pred = label_decode(label_encoder, pred)

In [32]:
# transform into dataframe to submit to kaggle

sampleSubmission['emotion'] = y_pred
sampleSubmission.head()

Unnamed: 0,id,emotion
0,0x2c7743,joy
1,0x2c1eed,joy
2,0x2826ea,sadness
3,0x356d9a,joy
4,0x20fd95,joy


In [33]:
sampleSubmission.to_csv('submission9.csv', index=False)

### 3.2 Classification with pre-trained model

### 3.2.1 Preprocessing

In [8]:
import pandas as pd
import datasets
from datasets import Dataset, DatasetDict

# Convert dataframe to Dataset
train = Dataset.from_pandas(train[['text', 'emotion']])
train = train.remove_columns("__index_level_0__")



In [9]:
print(train)
print(train[:5])

Dataset({
    features: ['text', 'emotion'],
    num_rows: 1455563
})
{'text': ['People who post "add me on #Snapchat" must be dehydrated. Cuz man.... that\'s <LH>', '@brianklaas As we see, Trump is dangerous to #freepress around the world. What a <LH> <LH> #TrumpLegacy.  #CNN', 'Now ISSA is stalking Tasha 😂😂😂 <LH>', '@RISKshow @TheKevinAllison Thx for the BEST TIME tonight. What stories! Heartbreakingly <LH> #authentic #LaughOutLoud good!!', 'Still waiting on those supplies Liscus. <LH>'], 'emotion': ['anticipation', 'sadness', 'fear', 'joy', 'anticipation']}


#### Sentence processing

In [10]:
# the model you want to use. Available models can be found here: https://huggingface.co/models
MODEL_NAME = "cardiffnlp/twitter-roberta-base-emotion"

In [11]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

#### Label processing

In [12]:
# declare a new encoder and let it learn from the dataset

from sklearn.preprocessing import OneHotEncoder
import numpy as np
encoder = OneHotEncoder(handle_unknown='ignore')
labels = encoder.fit(np.array(train['emotion']).reshape(-1,1))

In [13]:
print(np.array(train['emotion']).reshape(-1,1))

[['anticipation']
 ['sadness']
 ['fear']
 ...
 ['joy']
 ['joy']
 ['joy']]


In [14]:
# check if still have 6 labels
LABEL_COUNT = len(encoder.categories_[0])
print(LABEL_COUNT)

8


In [16]:
encoder.transform(np.array(train['emotion']).reshape(-1,1)).toarray()
#encoder.inverse_transform

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
def preprocess(dataslice):
    
    out = tokenizer(dataslice['text'])
    out['label'] = encoder.transform(np.array(dataslice['emotion']).reshape(-1,1)).toarray()
    return out

train = train.map(preprocess, batched = True)

  0%|          | 0/1456 [00:00<?, ?ba/s]

In [16]:
print(train)
print(train[0])

Dataset({
    features: ['text', 'emotion', 'input_ids', 'attention_mask', 'label'],
    num_rows: 1455563
})
{'text': 'People who post "add me on #Snapchat" must be dehydrated. Cuz man.... that\'s <LH>', 'emotion': 'anticipation', 'input_ids': [0, 4763, 54, 618, 22, 4917, 162, 15, 849, 41513, 29465, 113, 531, 28, 36410, 8358, 4, 230, 4987, 313, 17220, 14, 18, 28696, 574, 725, 15698, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label': [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}


In [17]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### 3.2.2 Training

In [18]:
from transformers import RobertaForSequenceClassification
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=8, ignore_mismatched_sizes=True)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
import pickle

with open('Roberta_dataset.pickle', 'rb') as data:
    train = pickle.load(data)

In [19]:
from datasets import load_dataset

train_val_dataset = train.train_test_split(train_size=0.8)

In [21]:
from transformers import TrainingArguments, Trainer

OUTPUT_DIR = '/home/nlplab/rola/DM/Roberta3'
LEARNING_RATE = 2e-5
BATCH_SIZE = 8
EPOCH = 1

training_args = TrainingArguments(
    output_dir = OUTPUT_DIR,
    learning_rate = LEARNING_RATE,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,
    num_train_epochs = EPOCH,
    save_steps=100000,
    # you can set more parameters here if you want
)

# now give all the information to a trainer
trainer = Trainer(
    # set your parameters here
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = train_val_dataset['train'],
    eval_dataset = train_val_dataset['test'],
    tokenizer = tokenizer
)

In [22]:
! nvidia-smi

Thu Nov 24 18:20:09 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.29.05    Driver Version: 495.29.05    CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
| 47%   36C    P2    97W / 350W |   2888MiB / 12053MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [23]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: emotion, text.
***** Running training *****
  Num examples = 1164450
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 145557


Step,Training Loss
500,0.3117
1000,0.2878
1500,0.2705
2000,0.2624
2500,0.258
3000,0.254
3500,0.2437
4000,0.2363
4500,0.2376
5000,0.2288


Saving model checkpoint to /home/nlplab/rola/DM/Roberta3/checkpoint-100000
Configuration saved in /home/nlplab/rola/DM/Roberta3/checkpoint-100000/config.json
Model weights saved in /home/nlplab/rola/DM/Roberta3/checkpoint-100000/pytorch_model.bin
tokenizer config file saved in /home/nlplab/rola/DM/Roberta3/checkpoint-100000/tokenizer_config.json
Special tokens file saved in /home/nlplab/rola/DM/Roberta3/checkpoint-100000/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=145557, training_loss=0.19575007472526418, metrics={'train_runtime': 4846.3519, 'train_samples_per_second': 240.274, 'train_steps_per_second': 30.034, 'total_flos': 2.896544710976688e+16, 'train_loss': 0.19575007472526418, 'epoch': 1.0})

In [24]:
# Saving model for future use

model.save_pretrained(save_directory=OUTPUT_DIR)

Configuration saved in /home/nlplab/rola/DM/Roberta3/config.json
Model weights saved in /home/nlplab/rola/DM/Roberta3/pytorch_model.bin


### 3.3.3 Evaluation

In [68]:
from torch import nn

#predictions, labels, metrics = trainer.predict(train_val_dataset["test"])

predictions = torch.from_numpy(predictions)
predictions_after_softmax = nn.functional.softmax(predictions, dim = -1)


result = torch.argmax(predictions_after_softmax,dim=1).to("cuda:0")
pred_labels = torch.nn.functional.one_hot(result)
pred_emotion = encoder.inverse_transform(pred_labels.cpu()).flatten()
# get predictions
print("pred_level",pred_emotion[:10])
print("label",encoder.inverse_transform(labels)[:10])

pred_level ['sadness' 'joy' 'joy' 'joy' 'disgust' 'trust' 'disgust' 'sadness' 'fear'
 'joy']
label [['sadness']
 ['joy']
 ['joy']
 ['anticipation']
 ['disgust']
 ['trust']
 ['disgust']
 ['sadness']
 ['anticipation']
 ['joy']]


In [69]:
# calculate accuracy

true_emotion = encoder.inverse_transform(train_val_dataset["test"]["label"])

correct_count = 0
for idx, emotion in enumerate(pred_emotion):
    if emotion == true_emotion[idx]:
        correct_count += 1
print(len(pred_emotion))
print(len(true_emotion))
print(correct_count/len(pred_emotion))

291113
291113
0.6843081552524277


### 3.3.4 Prediction

In [4]:
# load the model that saved
from transformers import AutoModelForSequenceClassification  #DistilBertConfig, DistilBertModel
#OUTPUT_DIR = '/home/nlplab/rola/DM/model'
configuration = OUTPUT_DIR
mymodel = AutoModelForSequenceClassification.from_pretrained(configuration)

In [59]:
import pandas as pd
import datasets
from datasets import Dataset, DatasetDict

test = Dataset.from_pandas(test[['text']])
test = test.remove_columns("__index_level_0__")

In [61]:
print(test[0])

{'text': 'When your friends offer to bring you food 😭💘 #loyal #real <LH>'}


In [62]:
def preprocess_test(dataslice):
    
    out = tokenizer(dataslice['text'])
    return out

test = test.map(preprocess_test, batched = True)
print(test)

  0%|          | 0/412 [00:00<?, ?ba/s]

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 411972
})


In [63]:
from torch import nn

predictions, labels, metrics = trainer.predict(test)
predictions = torch.from_numpy(predictions)
predictions_after_softmax = nn.functional.softmax(predictions, dim = -1)


result = torch.argmax(predictions_after_softmax,dim=1)
pred_labels = torch.nn.functional.one_hot(result)
pred_emotion = encoder.inverse_transform(pred_labels.cpu()).flatten()

print("pred_level",pred_emotion[:10])

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 411972
  Batch size = 8


pred_level ['joy' 'joy' 'sadness' 'joy' 'joy' 'sadness' 'joy' 'anticipation' 'joy'
 'trust']


In [64]:
import pandas as pd

# transform into dataframe to submit to kaggle

sampleSubmission = pd.read_csv("sampleSubmission.csv")
sampleSubmission['emotion'] = pred_emotion

In [66]:
sampleSubmission.head()
sampleSubmission.to_csv('submission8.csv', index=False)

Unnamed: 0,id,emotion
0,0x2c7743,joy
1,0x2c1eed,joy
2,0x2826ea,sadness
3,0x356d9a,joy
4,0x20fd95,joy
