In [13]:
import pandas as pd 

import matplotlib.pyplot as plt
%matplotlib inline

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
df=pd.read_csv("dataset.csv",encoding="ISO-8859-1",names=["target", "ids", "date", "flag", "user", "text"])
#iso encoding used when data includes accented characters 


Dataset size: 1600000


In [5]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Salma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
stopwords= stopwords.words("english")
lm= WordNetLemmatizer()

## Preprocessing 

In [16]:
def preprocess(text):
    #remove links,mentions of user and special characters with regex
    text= re.sub("@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+",' ', str(text).lower()).strip()
    tokens=[]
    for token in text.split():
        if token not in stopwords:
            tokens.append(lm.lemmatize(token))
    return " ".join(tokens)

  text= re.sub("@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+",' ', str(text).lower()).strip()


In [17]:
df.text= df.text.apply(lambda x: preprocess(x))

In [None]:
label_map={0:0 ,2:1, 4:2} #some pytorch classification models expect labels to go from 0 to .. consecutively
df['target']=df['target'].map(label_map)

## Tokenizing

In [None]:
from transformers import AutoTokenizer
#load bertweet tokenizer
tokenizer=AutoTokenizer.from_pretrained("vinai/bertweet-base",use_fast=True)



config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [20]:
encodings =tokenizer(list(df['text']),truncation=True,padding=True,max_length=128)

## Convert Pytorch Dataset
pytorch models require data in the form of tensors not raw text or lists 

In [22]:
import torch

class TweetDataset(torch.utils.data.Dataset):
    def __init__(self,encodings,labels):
        self.encodings= encodings
        self.labels= labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self,idx):
        item= {key: torch.tensor(val[idx]) for key,val in self.encodings.items()}
        item['labels']= torch.tensor(self.labels[idx])
        return item

dataset= TweetDataset(encodings, df['target'].tolist())

## Train & Validation split

In [26]:
from torch.utils.data import DataLoader,random_split

train_size= int(0.8*len(dataset))
val_size= len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset,[train_size,val_size])

#dataloader converts dataset into batches there is an option to shuffle (makes training more robust)
train_loader= DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader= DataLoader(val_dataset, batch_size=16)

## Load Bertweet

In [27]:
from transformers import AutoModelForSequenceClassification

model= AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=3)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training

In [33]:
from torch.optim import AdamW
from tqdm import tqdm #for progress bars

device = torch.device('cuda')
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(4):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        #forward pass 
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        #back propagation
        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

AssertionError: Torch not compiled with CUDA enabled

## Evaluation 

In [None]:
from sklearn.metrics import accuracy_score, classification_report

model.eval() #switching model to evaluation mode 
preds, true_labels = [], []

with torch.no_grad(): #no gradients are computed during evaluation
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        #forward pass to get predictions 
        outputs = model(input_ids, attention_mask=attention_mask)
        preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy()) #picks the class with the highest prob for each sample
        true_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, preds)
print("Validation Accuracy:", accuracy)
print(classification_report(true_labels, preds, target_names=['Negative','Neutral','Positive']))
