## Sentiment Analysis

In [6]:
import os
import pandas as pd

train_df = pd.read_csv("./twitter_training.csv", names=["tweet_id", "entity", "sentiment", "content"])
train_df.head(5)

Unnamed: 0,tweet_id,entity,sentiment,content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [8]:
print(train_df['entity'].value_counts())

entity
TomClancysRainbowSix                 2400
MaddenNFL                            2400
Microsoft                            2400
LeagueOfLegends                      2394
CallOfDuty                           2394
Verizon                              2382
CallOfDutyBlackopsColdWar            2376
ApexLegends                          2376
Facebook                             2370
WorldOfCraft                         2364
Dota2                                2364
NBA2K                                2352
TomClancysGhostRecon                 2346
Battlefield                          2346
FIFA                                 2340
Xbox(Xseries)                        2334
Overwatch                            2334
johnson&johnson                      2328
Amazon                               2316
PlayStation5(PS5)                    2310
HomeDepot                            2310
Cyberpunk2077                        2304
CS-GO                                2304
GrandTheftAuto(GTA)        

In [10]:
print(train_df['sentiment'].value_counts() / train_df.shape[0])

sentiment
Negative      0.301840
Positive      0.278943
Neutral       0.245280
Irrelevant    0.173937
Name: count, dtype: float64


In [13]:
print(train_df.isna().sum(axis=0))
train_df = train_df.dropna()

tweet_id       0
entity         0
sentiment      0
content      686
dtype: int64


In [15]:
train_df['class'], classes = pd.factorize(train_df['sentiment'])
classes

Index(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype='object')

In [16]:
train_df.head(4)

Unnamed: 0,tweet_id,entity,sentiment,content,class
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,0
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,0
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,0
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,0


In [45]:
from transformers import BertTokenizerFast

tokeniser = BertTokenizerFast.from_pretrained('bert-base-uncased')
train_df['tokenised'] = train_df['content'].apply(lambda text : tokeniser(text, add_special_tokens=True)['input_ids'])

SyntaxError: incomplete input (3995604646.py, line 4)

In [46]:
tokeniser("Hello world, my name is peter paton", add_special_tokens=True)

{'input_ids': [101, 7592, 2088, 1010, 2026, 2171, 2003, 2848, 6986, 2239, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [38]:
train_df.head(5)

Unnamed: 0,tweet_id,entity,sentiment,content,class,tokenised
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,0,"[101, 10047, 2893, 2006, 3675, 8653, 1998, 104..."
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,0,"[101, 1045, 2572, 2746, 2000, 1996, 6645, 1998..."
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,0,"[101, 10047, 2893, 2006, 3675, 8653, 1998, 104..."
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,0,"[101, 10047, 2746, 2006, 3675, 8653, 1998, 104..."
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,0,"[101, 10047, 2893, 2006, 3675, 8653, 1016, 199..."


In [40]:
import jax

In [43]:
def sample(df : pd.DataFrame, batch_size : int):
    frequencies = 1.0 / df['class'].value_counts()
    weights = df['class'].map(frequencies)
    sample = df.sample(batch_size, replace = True, weights = frequencies)
    return process_sample(sample)

def process_sample(df : pd.DataFrame):
    inputs = df['tokenised']
    outputs = jax.nn.one_hot(df['class'].values, num_classes=4)
    return inputs, outputs

In [44]:
sample(train_df, 128)

(3    [101, 10047, 2746, 2006, 3675, 8653, 1998, 104...
 3    [101, 10047, 2746, 2006, 3675, 8653, 1998, 104...
 1    [101, 1045, 2572, 2746, 2000, 1996, 6645, 1998...
 1    [101, 1045, 2572, 2746, 2000, 1996, 6645, 1998...
 1    [101, 1045, 2572, 2746, 2000, 1996, 6645, 1998...
                            ...                        
 2    [101, 10047, 2893, 2006, 3675, 8653, 1998, 104...
 0    [101, 10047, 2893, 2006, 3675, 8653, 1998, 104...
 1    [101, 1045, 2572, 2746, 2000, 1996, 6645, 1998...
 0    [101, 10047, 2893, 2006, 3675, 8653, 1998, 104...
 1    [101, 1045, 2572, 2746, 2000, 1996, 6645, 1998...
 Name: tokenised, Length: 128, dtype: object,
 Array([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
