In [200]:
import pandas as pd
import string
import ast
import numpy as np
from sklearn.model_selection import train_test_split

In [201]:
# Load the prepared dataset. The path is relative to the notebook's working
# directory and uses forward slashes, which resolve on Windows, Linux and macOS
# alike (the previous backslash-separated form only resolved on Windows).
df = pd.read_csv("dataset/data/readyds/newds.csv")

In [202]:
# Count missing values per column to see which fields need cleanup.
df.isna().sum()

text                0
class               0
id                  0
class list          0
class length        0
emotion             0
clean text        106
emotion vector      0
dtype: int64

In [203]:
# Inspect column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54262 entries, 0 to 54261
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            54262 non-null  object
 1   class           54262 non-null  object
 2   id              54262 non-null  object
 3   class list      54262 non-null  object
 4   class length    54262 non-null  int64 
 5   emotion         54262 non-null  object
 6   clean text      54156 non-null  object
 7   emotion vector  54262 non-null  object
dtypes: int64(1), object(7)
memory usage: 3.3+ MB


In [204]:
# Discard rows with no cleaned text; the null summary above reports 106 of them.
df = df.dropna(subset=['clean text'])

In [205]:
# Re-check non-null counts after dropping the rows with missing 'clean text'.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54156 entries, 0 to 54261
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            54156 non-null  object
 1   class           54156 non-null  object
 2   id              54156 non-null  object
 3   class list      54156 non-null  object
 4   class length    54156 non-null  int64 
 5   emotion         54156 non-null  object
 6   clean text      54156 non-null  object
 7   emotion vector  54156 non-null  object
dtypes: int64(1), object(7)
memory usage: 3.7+ MB


## Type Conversion

In [206]:
# Convert the free-text and identifier columns from generic `object` to the
# dedicated pandas `string` dtype in a single pass.
df = df.astype({
    'text': 'string',
    'id': 'string',
    'clean text': 'string',
})

In [207]:
def fix_val(x):
    """Turn a stringified Python literal back into the object it describes.

    Args:
        x: Any value. Only ``str`` instances are parsed.

    Returns:
        The result of ``ast.literal_eval(x)`` when ``x`` is a string;
        otherwise ``x`` itself, untouched.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            string does not contain a valid Python literal.
    """
    return ast.literal_eval(x) if isinstance(x, str) else x

In [208]:
# These columns are parsed with fix_val, which evaluates string cells as Python
# literals and leaves non-string cells unchanged.
for literal_column in ('class list', 'emotion', 'emotion vector'):
    df[literal_column] = df[literal_column].apply(fix_val)

In [209]:
def _to_float32_array(values):
    """Return `values` as a NumPy array with dtype float32."""
    return np.array(values, dtype=np.float32)


# Keep a float32 NumPy copy of every label vector alongside the list form.
df['emotion vector array'] = df['emotion vector'].apply(_to_float32_array)

In [210]:
# Verify the converted dtypes and the newly added 'emotion vector array' column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54156 entries, 0 to 54261
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   text                  54156 non-null  string
 1   class                 54156 non-null  object
 2   id                    54156 non-null  string
 3   class list            54156 non-null  object
 4   class length          54156 non-null  int64 
 5   emotion               54156 non-null  object
 6   clean text            54156 non-null  string
 7   emotion vector        54156 non-null  object
 8   emotion vector array  54156 non-null  object
dtypes: int64(1), object(5), string(3)
memory usage: 4.1+ MB


## Stratified Sampling for Train/Valid/Test

In [211]:
# Two-stage split: hold out 20% of the rows, then cut that hold-out in half to
# obtain the validation and test sets. Both stages stratify on the
# 'class length' column and share one seed so the split is reproducible.
SPLIT_SEED = 42

train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['class length'],
    random_state=SPLIT_SEED,
)
valid_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['class length'],
    random_state=SPLIT_SEED,
)

In [212]:
# (rows, columns) of the training split.
train_df.shape

(43324, 9)

In [213]:
# (rows, columns) of the validation split.
valid_df.shape

(5416, 9)

In [214]:
# (rows, columns) of the test split.
test_df.shape

(5416, 9)

In [215]:
# Preview the first 15 rows of the training split.
train_df.head(15)

Unnamed: 0,text,class,id,class list,class length,emotion,clean text,emotion vector,emotion vector array
49714,"> no money at what normal, un-secterian people...",27,edkt3gv,[27],1,[neutral],money normal unsecterian people want surely se...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
12625,Interesting to see Dashy and Crimsix change sp...,27,edlho9e,[27],1,[neutral],interesting see dashy crimsix change specialis...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
24630,Trouser maidens 😂 omg I love it.,18,eeuoy8g,[18],1,[love],trouser maiden 😂 omg love,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
10968,"Well shit, then it's just like [NAME] wouldn't...",4,edzd6xv,[4],1,[approval],well shit like name wouldnt let complete math ...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
27457,Was there for a game v the lions in in 2016- W...,27,edm22m6,"[0, 27]",2,"[admiration, neutral]",game v lion 2016 well worth,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
42452,[NAME] is happy,0,eel6xov,[0],1,[admiration],name happy,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4190,Doubt that’s what is making or breaking the si...,27,edlxtaa,[27],1,[neutral],doubt that’s making breaking situation either ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
27962,Tomorrow at 5-7pm Utc it says for me,27,efdwsgh,[27],1,[neutral],tomorrow 57pm utc say,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
50429,... I don't feel like your comment is from an ...,1015,edqdi7q,"[10, 15]",2,"[disapproval, gratitude]",dont feel like comment objective point view th...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
20913,What a tool!,0,ediok8u,[0],1,[admiration],tool,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [216]:
# Confirm the training split kept the expected dtypes and has no nulls.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43324 entries, 49714 to 45807
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   text                  43324 non-null  string
 1   class                 43324 non-null  object
 2   id                    43324 non-null  string
 3   class list            43324 non-null  object
 4   class length          43324 non-null  int64 
 5   emotion               43324 non-null  object
 6   clean text            43324 non-null  string
 7   emotion vector        43324 non-null  object
 8   emotion vector array  43324 non-null  object
dtypes: int64(1), object(5), string(3)
memory usage: 3.3+ MB


In [217]:
# Read the emotion label names, one per line. len(emo_list) later sets the
# width of the model's output layer, so blank lines are skipped to keep a stray
# empty line in the file from becoming an extra class.
# Forward slashes keep the relative path portable across operating systems, and
# the explicit encoding avoids depending on the platform's default codec.
with open('dataset/data/emotions.txt', 'r', encoding='utf-8') as f:
    emo_list = [label for label in f.read().splitlines() if label.strip()]

## DistilBERT Fine-Tuning

In [218]:
from transformers import DistilBertTokenizer, TFDistilBertModel
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

In [219]:
# Fixed sequence length used for tokenization and for the model's input shape.
MAX_LEN = 128

# Tokenizer loaded from the same checkpoint name as the DistilBERT backbone.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [220]:
def tokenize_data(texts):
    """Tokenize an iterable of texts into fixed-length TensorFlow tensors.

    Args:
        texts: Iterable of strings; it is materialized into a list before
            being passed to the module-level ``tokenizer``.

    Returns:
        Whatever ``tokenizer`` returns when called with padding to
        ``MAX_LEN``, truncation enabled and ``return_tensors="tf"``.
    """
    encode_options = {
        "max_length": MAX_LEN,
        "padding": 'max_length',
        "truncation": True,
        "return_tensors": "tf",
    }
    return tokenizer(list(texts), **encode_options)

In [221]:
# Tokenize the cleaned text of the training and validation splits (in that order).
train_tokens, val_tokens = (
    tokenize_data(split_df['clean text']) for split_df in (train_df, valid_df)
)

In [222]:
def _token_input(name):
    """Create an int32 Keras input of length MAX_LEN with the given name."""
    return Input(shape=(MAX_LEN,), dtype=tf.int32, name=name)


# Pretrained DistilBERT backbone plus the two symbolic inputs it is called with.
bert_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
input_ids = _token_input("input_ids")
attention_mask = _token_input("attention_mask")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [223]:
# Per-token hidden states: the first element of the DistilBERT model output.
hidden_states = bert_model(input_ids, attention_mask=attention_mask)[0]

# Every sequence is padded to MAX_LEN, so an unmasked mean would average in the
# padding positions and dilute short texts. Passing the attention mask makes the
# pooling layer average over real tokens only.
x = GlobalAveragePooling1D()(hidden_states, mask=tf.cast(attention_mask, tf.bool))

# Small dense head; one sigmoid unit per emotion label (multi-label output).
x = Dense(128, activation='relu')(x)
output = Dense(len(emo_list), activation='sigmoid')(x)

In [224]:
# Assemble the end-to-end network from the two token inputs to the sigmoid head.
model = Model(inputs=[input_ids, attention_mask], outputs=output)

# NOTE(review): with a binary cross-entropy loss, Keras is expected to resolve
# 'accuracy' to element-wise binary accuracy — confirm this is the metric wanted
# for a multi-label target.
model.compile(
    optimizer=Adam(learning_rate=3e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [225]:
def _as_model_inputs(tokens):
    """Pick the two tensors the model consumes, keyed by input-layer name."""
    return {"input_ids": tokens['input_ids'], "attention_mask": tokens['attention_mask']}


def _label_tensor(split_df):
    """Stack a split's 'emotion vector' lists into a single tensor."""
    return tf.convert_to_tensor(split_df['emotion vector'].tolist())


# Fine-tune on the training split and evaluate on the validation split after
# each epoch.
model.fit(
    x=_as_model_inputs(train_tokens),
    y=_label_tensor(train_df),
    validation_data=(_as_model_inputs(val_tokens), _label_tensor(valid_df)),
    epochs=5,
    batch_size=32,
)

Epoch 1/5
Epoch 2/5

KeyboardInterrupt: 