In [1]:
import numpy as np 
import pandas as pd 

In [4]:
# Read the tweet dataset, then keep only the rows whose Language column is "en".
df = pd.read_csv('dataset.csv')
df = df.query('Language == "en"')
df.head()

Unnamed: 0,Text,Language,Label
0,@Charlie_Corley @Kristine1G @amyklobuchar @Sty...,en,litigious
2,https://t.co/YJNiO0p1JV Flagstar Bank disclose...,en,litigious
3,Rwanda is set to host the headquarters of Unit...,en,positive
4,OOPS. I typed her name incorrectly (today’s br...,en,litigious
5,It sucks for me since I'm focused on the natur...,en,negative


In [5]:
# Non-null cell count per column; equal counts mean no column has missing values.
df.count()

Text        871310
Language    871310
Label       871310
dtype: int64

In [7]:
# Number of rows per sentiment label before any filtering.
df.groupby('Label').size()

Label
litigious      180062
negative       244146
positive       248516
uncertainty    198586
dtype: int64

In [9]:
import re

In [13]:
# Compiled once at definition time instead of on every call.
# Every range below lies inside the span the previous pattern covered
# (U+24C2..U+1F251 plus the explicit emoji blocks); what is dropped from that
# span are the writing systems it swallowed by accident (CJK, kana, Hangul, ...).
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital M
    "\U00002600-\U000026FF"  # miscellaneous symbols (sun, umbrella, ...)
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (star, ...)
    "\U0000FE0F"             # emoji variation selector
    "\U0001F000-\U0001F251"  # tiles, playing cards, enclosed alphanumerics/ideographs
    "]+"
)


def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols deleted.

    Matched characters are removed outright (not replaced by a space).
    Ordinary text in any script is left untouched.

    Args:
        string: the text to strip.

    Returns:
        A new ``str`` with every run of matched symbols removed.
    """
    return _EMOJI_PATTERN.sub('', string)


def clean_tweet(tweet):
    """Normalise one tweet to lower-case ASCII letters, digits and spaces.

    Steps, in order: lower-case; delete ASCII apostrophes; delete @mentions;
    delete '#'; strip emoji via ``remove_emoji``; delete URLs; replace
    ``()!?``, ``[...]`` spans and every remaining character outside
    ``[a-z0-9]`` with a single space each. Runs of spaces are not collapsed.

    Args:
        tweet: raw tweet text. Any non-string value (for example the float
            NaN pandas stores for a missing cell) is treated as empty.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    # Missing text read by pandas is a plain Python float NaN, which is not of
    # type np.float64, so check for "not a string" instead of one float type.
    if not isinstance(tweet, str):
        return ""
    temp = tweet.lower()
    # Delete apostrophes so contractions stay one token ("i'm" -> "im").
    # NOTE(review): only the ASCII apostrophe is handled; a typographic one
    # falls through to the final substitution and becomes a space.
    temp = re.sub("'", "", temp)
    temp = re.sub("@[A-Za-z0-9_]+", "", temp)
    temp = re.sub("#", "", temp)
    temp = remove_emoji(temp)
    temp = re.sub(r'http\S+', '', temp)
    temp = re.sub('[()!?]', ' ', temp)
    # Raw string: '\[' in a normal string literal is an invalid escape sequence.
    temp = re.sub(r'\[.*?\]', ' ', temp)
    temp = re.sub("[^a-z0-9]", " ", temp)

    return temp


In [14]:
# Clean every tweet; the function is passed directly, no lambda wrapper needed.
df['Text'] = df['Text'].map(clean_tweet)

In [19]:
# Preview the Text column after cleaning.
df.head()

Unnamed: 0,Text,Language,Label
0,testimony is not evidence in a court of la...,en,litigious
2,flagstar bank discloses a data breach that im...,en,litigious
3,rwanda is set to host the headquarters of unit...,en,positive
4,oops i typed her name incorrectly today s br...,en,litigious
5,it sucks for me since im focused on the nature...,en,negative


In [24]:
# Non-mutating view of the data without the 'litigious' class.
# NOTE(review): filtered_df is not referenced again in the cells visible here.
filtered_df = df.loc[df['Label'] != 'litigious']

In [25]:
# Keep a real backup: plain assignment only creates a second name for the same
# DataFrame, so a later in-place change to df would alter temp_df as well.
temp_df = df.copy()

In [26]:
# Remove every 'litigious' row from df in place, addressing rows by index label.
df.drop(index=df.index[df['Label'] == 'litigious'], inplace=True)

In [28]:
# Preview df after the 'litigious' rows were dropped.
df.head()

Unnamed: 0,Text,Language,Label
3,rwanda is set to host the headquarters of unit...,en,positive
5,it sucks for me since im focused on the nature...,en,negative
7,you can also relate this to art too a lot...,en,uncertainty
8,social security constant political crises dis...,en,negative
9,a broken rib can puncture a lung or lead to a...,en,negative


In [31]:
# Rows per remaining label, to see how unbalanced the classes are.
df.groupby('Label').size()

Label
negative       244146
positive       248516
uncertainty    198586
dtype: int64

In [34]:
# Distinct label values, in order of first appearance in df.
data_types = pd.unique(df['Label'])
data_types

array(['positive', 'negative', 'uncertainty'], dtype=object)

In [36]:
# Row count of the rarest label; used as the per-class sample size.
min_samples = min(df['Label'].value_counts().tolist())
min_samples

198586

In [37]:
# One frame per label, each randomly cut down to min_samples rows (fixed seed)
# so every class contributes the same number of rows.
undersampled_dfs = [
    df.loc[df['Label'] == data_type].sample(min_samples, random_state=42)
    for data_type in data_types
]

In [38]:
# Stack the per-label samples row-wise; the original row index is kept.
undersampled_df = pd.concat(undersampled_dfs, axis=0, ignore_index=False)
undersampled_df.head()

Unnamed: 0,Text,Language,Label
430988,i wouldnt dream of it just dont tell me stop,en,positive
492831,question for sec fans if ohio state pl...,en,positive
76161,would be a dream come true to join,en,positive
744328,you know whats even easier admitting y...,en,positive
673077,people asking where is sapnap dude he s figh...,en,positive


In [39]:
from transformers import BertTokenizer

In [40]:
# NOTE(review): clean_tweet lower-cases all text, so the case distinctions of
# the cased vocabulary go unused. Confirm 'bert-base-cased' is intended rather
# than an uncased checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Integer class id for each sentiment label.
labels = dict(positive=0, negative=1, uncertainty=2)

In [43]:
# Persist the balanced sample. The row index is written too (to_csv default),
# so it comes back as an unnamed first column when the file is read again.
undersampled_df.to_csv("clean_df.csv")

In [41]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

In [42]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized texts with integer class ids.

    Every text is tokenized eagerly in ``__init__`` and padded/truncated to
    ``max_length`` tokens, so memory use grows with
    ``len(undersampled_df) * max_length``.
    """

    def __init__(self, undersampled_df, text_tokenizer=None, label_map=None,
                 max_length=512):
        """Tokenize all texts and encode all labels up front.

        Args:
            undersampled_df: DataFrame with a 'Text' and a 'Label' column.
            text_tokenizer: callable invoked as ``text_tokenizer(text,
                padding='max_length', max_length=max_length, truncation=True,
                return_tensors="pt")``. Defaults to the notebook-level
                ``tokenizer``.
            label_map: mapping from label value to integer class id.
                Defaults to the notebook-level ``labels``.
            max_length: length every encoded text is padded/truncated to.

        Raises:
            KeyError: if a value in 'Label' is missing from the label map.
        """
        # Fall back to the notebook globals so existing Dataset(frame) calls
        # keep working unchanged.
        if text_tokenizer is None:
            text_tokenizer = tokenizer
        if label_map is None:
            label_map = labels

        self.labels = [label_map[label] for label in undersampled_df['Label']]
        self.texts = [
            text_tokenizer(text,
                           padding='max_length', max_length=max_length,
                           truncation=True, return_tensors="pt")
            for text in undersampled_df['Text']
        ]

    def classes(self):
        """Return the list of integer class ids, one per row."""
        return self.labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) at ``idx`` (int or slice) as a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output(s) stored at ``idx`` (int or slice)."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(tokenized_text, label)`` for position ``idx``."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [None]:
# Shuffle once with a fixed seed, then cut 80% / 10% / 10% into
# train / validation / test.
# The class-balanced undersampled_df is split here, not the unbalanced df;
# otherwise the undersampling done above would never reach the model.
shuffled_df = undersampled_df.sample(frac=1, random_state=42)
train_end = int(.8 * len(shuffled_df))
val_end = int(.9 * len(shuffled_df))

# Positional slicing instead of np.split(DataFrame, ...): np.split goes through
# DataFrame.swapaxes, which recent pandas versions deprecate.
df_train = shuffled_df.iloc[:train_end]
df_val = shuffled_df.iloc[train_end:val_end]
df_test = shuffled_df.iloc[val_end:]

print(len(df_train), len(df_val), len(df_test))