## Sentiment Analysis

In [2]:
import os
import pandas as pd

train_df = pd.read_csv("./twitter_training.csv", names=["tweet_id", "entity", "sentiment", "content"])
train_df.head(5)

Unnamed: 0,tweet_id,entity,sentiment,content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [3]:
print(train_df['entity'].value_counts())

entity
TomClancysRainbowSix                 2400
MaddenNFL                            2400
Microsoft                            2400
LeagueOfLegends                      2394
CallOfDuty                           2394
Verizon                              2382
CallOfDutyBlackopsColdWar            2376
ApexLegends                          2376
Facebook                             2370
WorldOfCraft                         2364
Dota2                                2364
NBA2K                                2352
TomClancysGhostRecon                 2346
Battlefield                          2346
FIFA                                 2340
Xbox(Xseries)                        2334
Overwatch                            2334
johnson&johnson                      2328
Amazon                               2316
PlayStation5(PS5)                    2310
HomeDepot                            2310
Cyberpunk2077                        2304
CS-GO                                2304
GrandTheftAuto(GTA)        

In [4]:
print(train_df['sentiment'].value_counts() / train_df.shape[0])

sentiment
Negative      0.301840
Positive      0.278943
Neutral       0.245280
Irrelevant    0.173937
Name: count, dtype: float64


In [5]:
print(train_df.isna().sum(axis=0))
train_df = train_df.dropna()

tweet_id       0
entity         0
sentiment      0
content      686
dtype: int64


In [6]:
train_df['class'], classes = pd.factorize(train_df['sentiment'])
classes

Index(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype='object')

In [7]:
train_df.head(4)

Unnamed: 0,tweet_id,entity,sentiment,content,class
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,0
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,0
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,0
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,0


In [11]:
train_df.head(5)

Unnamed: 0,tweet_id,entity,sentiment,content,class
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,0
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,0
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,0
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,0
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,0


In [79]:
import jax
import jax.numpy as jnp
import flax.linen as nn 
from jax.nn.initializers import lecun_normal
from jax import value_and_grad
import optax

from transformers import BertTokenizerFast, FlaxBertModel

In [80]:
embedding_model = FlaxBertModel.from_pretrained('bert-base-cased')
tokeniser = BertTokenizerFast.from_pretrained('bert-base-cased')

def sample(df : pd.DataFrame, batch_size : int = 128):
    frequencies = 1.0 / df['class'].value_counts()
    weights = df['class'].map(frequencies)
    sample = df.sample(batch_size, replace = True, weights = frequencies)
    return process_sample(sample)

def process_sample(sample_df : pd.DataFrame):
    inputs = tokeniser.batch_encode_plus(sample_df['content'].tolist(), add_special_tokens=True, truncation=True, padding=True, return_tensors='jax')
    outputs = jax.nn.one_hot(sample_df['class'].values, num_classes=4)
    return embedding_model(**inputs).last_hidden_state, outputs

Some weights of FlaxBertModel were not initialized from the model checkpoint at bert-base-cased and are newly initialized: {('pooler', 'dense', 'bias'), ('pooler', 'dense', 'kernel')}
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [81]:
example_input, example_output = sample(train_df, batch_size = 128)
example_input.shape

(128, 15, 768)

In [82]:
print(example_input.dtype)

float32


In [83]:
class Model(nn.Module):

    @nn.compact
    def __call__(self, embedding):
        x = nn.Dense(features = 128, kernel_init = lecun_normal())(embedding)
        x = nn.relu(x)
        x = jnp.mean(x, axis=-1)
        x = nn.Dense(features = 128, kernel_init = lecun_normal())(x)
        x = nn.relu(x)
        x = nn.Dense(features = 4, kernel_init = lecun_normal())(x)
        return nn.softmax(x)

In [86]:
model = Model()

rng, init_rng = jax.random.split(jax.random.PRNGKey(42), 2)
params = model.init(init_rng, example_input)
model.apply(params, example_input).shape

(128, 4)

In [88]:
optimiser = optax.chain(optax.clip(1.0), optax.adam(learning_rate=1e-4))
optimiser_state = optimiser.init(params)

In [90]:
def cross_entropy_loss(params, batch_inputs, batch_outputs):
    model_outputs = model.apply(params, batch_inputs)
    return -jnp.mean(jnp.sum(batch_outputs * jnp.log(model_outputs), axis = 1))

In [92]:
num_epochs = 1
batch_size = 128

for epoch in range(num_epochs):
    batch_losses = []
    for batch in range(train_df.shape[0] // batch_size):
        batch_inputs, batch_outputs = sample(train_df, batch_size=128)
        loss, grads = jax.value_and_grad(cross_entropy_loss)(params, batch_inputs, batch_outputs)
        updates, optimiser_state = optimiser.update(grads, optimiser_state)
        params = optax.apply_updates(params, updates)
        batch_losses.append(loss)
    print(jnp.mean(jnp.array(batch_losses)))

KeyboardInterrupt: 