In [1]:
import pickle
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import re
import string
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import words
from nltk.corpus import stopwords
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [2]:
# Use the first CUDA GPU when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [3]:
# Read the labelled training tweets (path is relative to the working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="tweet-sentiment-extraction/train.csv")
df.head(5)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [4]:
def clean_text(text):
    """Strip URLs and ASCII punctuation from every entry of a pandas Series.

    Each element is converted with ``str`` first (so non-string values are
    cleaned in their string form), then ``http://`` and ``https://`` links
    are removed, then every character in ``string.punctuation`` is deleted.
    No lower-casing or stop-word removal is performed.

    Args:
        text: pandas Series whose elements are the texts to clean.

    Returns:
        A new Series of cleaned strings with the same index as ``text``.
    """
    # Match both schemes: a pattern that only matches "https" leaves plain
    # "http://" links in place, and the punctuation strip below would then
    # turn them into junk tokens such as "httptcoabc".
    url_pattern = re.compile(r"https?://\S+")
    # Build the deletion table once instead of once per row.
    punctuation_table = str.maketrans('', '', string.punctuation)

    def _clean_one(value):
        without_urls = url_pattern.sub("", str(value))
        return without_urls.translate(punctuation_table)

    return text.apply(_clean_one)

In [5]:
# Overwrite the tweet text column with the output of clean_text.
df["text"] = clean_text(df["text"])

In [6]:
# Encode the sentiment strings as integer class ids:
# "negative" -> 0, "neutral" -> 1, and every other value -> 2.
label_by_sentiment = {"negative": 0, "neutral": 1}
sentiment_list = df["sentiment"].tolist()
label = [label_by_sentiment.get(senti, 2) for senti in sentiment_list]
df["labels"] = label

In [7]:
# Checkpoint name used for both the tokenizer and the BERT encoder in this notebook.
PRE_TRAINED_MODEL_NAME = "bert-base-uncased"

In [8]:
# Load the tokenizer for the checkpoint named by PRE_TRAINED_MODEL_NAME.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [9]:
class TweetDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    The constructor stores its arguments unchanged: ``reviews`` (indexable
    collection of texts), ``targets`` (indexable collection of labels),
    ``tokenizer`` (object exposing ``encode_plus``) and ``max_len``
    (forwarded to the tokenizer as ``max_length``).
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews, self.targets = reviews, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Tokenize the text at ``item`` and bundle it with its label.

        Returns:
            A dict with the text as a string (``review_text``), the flattened
            ``input_ids`` and ``attention_mask`` tensors produced by the
            tokenizer, and the label as a ``torch.long`` tensor (``targets``).
        """
        text = str(self.reviews[item])
        label = self.targets[item]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

        sample = {'review_text': text}
        for field in ('input_ids', 'attention_mask'):
            # Flatten each encoded tensor to one dimension.
            sample[field] = encoded[field].flatten()
        sample['targets'] = torch.tensor(label, dtype=torch.long)
        return sample

In [10]:
# Hold out 30% of the labelled tweets for validation; the fixed seed makes the split repeatable.
df_train, df_val = train_test_split(
    df,
    random_state=42,
    test_size=0.3,
)

In [11]:
# Display the training split for a visual check.
df_train

Unnamed: 0,textID,text,selected_text,sentiment,labels
23738,20c2ecb64f,Had a nice visit last night from a boy with vo...,Had a nice visit,positive,2
26930,2625f6b91b,never mind its closed sorry Miley,sorry,negative,0
9119,6f56e42577,I wont leave you alone until you accept my ap...,I won`t leave you alone until you accept my ap...,positive,2
25447,0c61206a16,not gonna ask what a boom boom is morning,not gonna ask what a boom boom is! morning.,neutral,1
25158,203a5f037e,Btw that song is youll always find ur way back...,Btw that song is you`ll always find ur way bac...,neutral,1
...,...,...,...,...,...
21575,442a264f77,STAR TREK WAS PURE AWESOME LOVE IT 3333 It wa...,STAR TREK WAS PURE AWESOME! LOVE IT!!! <3333 ...,positive,2
5390,c4994fd1f4,Will be going to Indiana Baptist Sunday Pray f...,"Will be going to Indiana Baptist Sunday, Pray ...",neutral,1
860,c0360fd4e9,is sitting thru the boring bits in Titanic wai...,is sitting thru the boring bits in Titanic wai...,neutral,1
15795,4c7e112f86,Missed the play,Missed the play,negative,0


In [12]:
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file,
# so only load "sorted_wfh.pickle" when it comes from a trusted source.
# The context manager guarantees the file handle is closed once loading is done.
with open("sorted_wfh.pickle", "rb") as file:
    data = pickle.load(file)

# Convert the sentiment strings stored in "labels" to the integer class ids
# used for training: "negative" -> 0, "neutral" -> 1, anything else -> 2.
sentiment_list = data["labels"].tolist()
label = [
    1 if senti == "neutral" else 0 if senti == "negative" else 2
    for senti in sentiment_list
]
data["labels"] = label

# Expose the tweet text under the same column name ("text") as the training frame.
data = data.rename(columns={'tweet_content': 'text'})

# df_test and data refer to the same frame.
df_test = data
len(df_test)

14832

In [13]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
    """Wrap a DataFrame of texts and labels in a batched ``DataLoader``.

    Args:
        df: DataFrame with a ``text`` column and a ``labels`` column.
        tokenizer: Tokenizer handed to ``TweetDataset``.
        max_len: Maximum length handed to ``TweetDataset``.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the samples on every pass.
            Defaults to ``False``, which keeps batches in row order; keep it
            ``False`` whenever predictions are matched back to rows by
            position. Pass ``True`` for a training loader if per-epoch
            shuffling is wanted.
        num_workers: Number of loader worker processes. Defaults to 4.

    Returns:
        A ``DataLoader`` over a ``TweetDataset`` built from ``df``.
    """
    ds = TweetDataset(
        reviews=df["text"].to_numpy(),
        targets=df["labels"].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

In [14]:
# Batch size and token length shared by all three splits.
BATCH_SIZE, MAX_LEN = 16, 160

# Build the train, validation and test loaders with identical settings.
train_data_loader, val_data_loader, test_data_loader = [
    create_data_loader(split_frame, tokenizer, MAX_LEN, BATCH_SIZE)
    for split_frame in (df_train, df_val, df_test)
]

In [15]:
class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout (p=0.3) and a linear classification head.

    Args:
        n_classes: Number of output classes, i.e. the width of the logit vector.
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Compute unnormalised class logits for a batch.

        Args:
            input_ids: Token id tensor forwarded to the BERT encoder.
            attention_mask: Attention mask tensor forwarded to the BERT encoder.

        Returns:
            Tensor of logits produced by the linear head from the encoder's
            pooled output (the second element of the encoder's return value).
        """
        bert_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Take the pooled output by position instead of tuple-unpacking.
        # Unpacking works only when the encoder returns a plain tuple; when it
        # returns a dict-like ModelOutput (the default in newer transformers
        # releases), unpacking yields the field names rather than the tensors.
        # Integer indexing is supported by both return types.
        pooled_output = bert_outputs[1]
        return self.out(self.drop(pooled_output))

In [16]:
# Build the three-class classifier and move it to the selected device.
# To resume from a saved checkpoint instead, load it before the move:
#     model.load_state_dict(torch.load('best_model_state.bin'))
model = SentimentClassifier(n_classes=3).to(device)

In [17]:
EPOCHS = 10

# The scheduler is stepped once per training batch, so the schedule length is
# the number of batches per epoch times the number of epochs.
total_steps = EPOCHS * len(train_data_loader)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# Linear schedule with no warm-up steps over the whole training run.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

In [18]:
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    """Run one optimisation pass over ``data_loader``.

    For every batch: forward pass, loss, backward pass, gradient-norm clipping
    at 1.0, optimizer step, scheduler step, then gradient reset.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"targets"`` tensors.
        loss_fn: Callable taking ``(outputs, targets)`` and returning the loss.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: the count of correct predictions divided by
        ``n_examples`` as a double-precision tensor, and the NumPy mean of the
        per-batch loss values.
    """
    model = model.train()

    batch_losses = []
    correct_predictions = 0

    for batch in data_loader:
        input_ids, attention_mask, targets = (
            batch[key].to(device)
            for key in ("input_ids", "attention_mask", "targets")
        )

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(logits, targets)

        # torch.max over dim=1 returns (values, indices); the indices are the predicted classes.
        preds = torch.max(logits, dim=1)[1]
        correct_predictions += (preds == targets).sum()
        batch_losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(batch_losses)

In [None]:
%%time

# NOTE: eval_model is defined in a later cell of this notebook; that cell has to
# be executed before this one, otherwise the validation call raises NameError.
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train)
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model, val_data_loader, loss_fn, device, len(df_val)
    )
    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

    # Record this epoch's metrics.
    epoch_metrics = {
        'train_acc': train_acc,
        'train_loss': train_loss,
        'val_acc': val_acc,
        'val_loss': val_loss,
    }
    for metric_name, metric_value in epoch_metrics.items():
        history[metric_name].append(metric_value)

    # Save the weights whenever validation accuracy beats the best value seen so far.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state1.bin')
        best_accuracy = val_acc

Epoch 1/10
----------


In [None]:
# model = SentimentClassifier(3)
# model.load_state_dict(torch.load('best_model_state.bin'))
# model = model.to(device)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Measure accuracy and mean loss over ``data_loader`` without gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``;
            it is switched to evaluation mode.
        data_loader: Iterable of dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"targets"`` tensors.
        loss_fn: Callable taking ``(outputs, targets)`` and returning the loss.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``: the count of correct predictions divided by
        ``n_examples`` as a double-precision tensor, and the NumPy mean of the
        per-batch loss values.
    """
    model = model.eval()

    batch_losses = []
    correct_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, targets = (
                batch[key].to(device)
                for key in ("input_ids", "attention_mask", "targets")
            )

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            # torch.max over dim=1 returns (values, indices); keep the indices.
            preds = torch.max(logits, dim=1)[1]

            batch_losses.append(loss_fn(logits, targets).item())
            correct_predictions += (preds == targets).sum()

    return correct_predictions.double() / n_examples, np.mean(batch_losses)

In [None]:
# Evaluate the model on the test loader; only the accuracy is kept, the loss is discarded.
loss_fn = nn.CrossEntropyLoss().to(device)
test_acc, _ = eval_model(model, test_data_loader, loss_fn, device, len(df_test))

# test_acc.item()

In [None]:
def get_predictions(model, data_loader, device=None):
    """Run inference over ``data_loader`` and collect the results on the CPU.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning 2-D logits (one row per sample).
        data_loader: Iterable of dict batches with ``"review_text"``,
            ``"input_ids"``, ``"attention_mask"`` and ``"targets"`` entries.
        device: Device the input tensors are moved to. When omitted, the
            device of the model's first parameter is used (CPU if the model
            has no parameters), so the function does not rely on a
            module-level ``device`` variable.

    Returns:
        ``(review_texts, predictions, prediction_probs, real_values)``:
        the list of input texts, a 1-D tensor of predicted class indices,
        a 2-D tensor of softmax probabilities, and a 1-D tensor of the
        targets. All tensors are on the CPU. An empty ``data_loader``
        yields an empty list and empty tensors.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model = model.eval()

    review_texts = []
    pred_batches = []
    prob_batches = []
    target_batches = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            probs = F.softmax(outputs, dim=1)

            review_texts.extend(d["review_text"])
            # Keep whole batches (moved to the CPU right away) instead of
            # splitting them into one tensor per sample.
            pred_batches.append(preds.cpu())
            prob_batches.append(probs.cpu())
            target_batches.append(d["targets"].cpu())

    if not pred_batches:
        # Nothing was iterated: return empty results rather than failing on
        # concatenating an empty list.
        empty_ids = torch.empty(0, dtype=torch.long)
        return review_texts, empty_ids, torch.empty(0, 0), empty_ids.clone()

    predictions = torch.cat(pred_batches)
    prediction_probs = torch.cat(prob_batches)
    real_values = torch.cat(target_batches)
    return review_texts, predictions, prediction_probs, real_values

In [None]:
# Collect the texts, predicted classes, class probabilities and targets for the test loader.
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
    model=model, data_loader=test_data_loader
)

In [None]:
# Convert the predicted-class tensor to a plain Python list and show how many predictions it holds.
predictions = y_pred.tolist()
len(predictions)

In [None]:
# Reference labels stored in the test frame.
labels = df_test["labels"]

In [None]:
# Attach the predictions to the test frame. A list is assigned by position, so this
# relies on the test loader having yielded the rows in df_test order.
df_test["bert_label"] = predictions

In [None]:
# Write only the predicted-label column to "bert_pred.pickle".
df_test["bert_label"].to_pickle("bert_pred.pickle")