<a href="https://colab.research.google.com/github/Wardenclyff/NLP/blob/main/Senti_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m102.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m121.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m84.4 MB/s[0m eta [36m0:00

In [2]:
import os
import math

import torch
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, XLNetTokenizer, XLNetModel, XLNetLMHeadModel, XLNetConfig, AutoTokenizer, AutoModelForSequenceClassification
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
import matplotlib.pyplot as plt

import datetime
import time
import pytz

In [3]:
print("GPU Available: {}".format(torch.cuda.is_available()))
n_gpu = torch.cuda.device_count()
print("Number of GPU Available: {}".format(n_gpu))
print("GPU: {}".format(torch.cuda.get_device_name(0)))

GPU Available: True
Number of GPU Available: 1
GPU: NVIDIA A100-SXM4-40GB


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Import the Financial Phrasebank data as training data

train = pd.read_csv('/content/drive/My Drive/Senti/Financial_Phrasebank.csv', encoding="latin-1")

In [7]:
# Set test dataset

df = pd.read_csv('/content/drive/My Drive/Senti/twitter_100101-101230.csv', encoding='utf-8')

test = pd.DataFrame(df["Embedded_text"][:200])
test

Unnamed: 0,Embedded_text
0,and it hits me when i reach for you that i'm a...
1,I'm seeing my french guy on friday ;) and im n...
2,I'm listening to #GUCCIBANDANA ....Dont Ask Me...
3,I'll never change bcuz u dont like me for who ...
4,"""I wish you looked at me that way, ur beautifu..."
...,...
195,"this guy made me speechless, i dont iknow what..."
196,i am feeling like a superstar now...cuz pf ma ...
197,aite kids i'm going HOME to my BURGER!!! dont ...
198,on the bus.. im really nervous guys :( what if...


## XLNet Part

In [8]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

In [9]:
train_text_list = train["Text"].values
test_text_list = test["Embedded_text"].values

In [44]:
def tokenize_inputs(text_list, tokenizer, num_embeddings=512):
    # tokenize the text
    tokenized_texts = list(map(lambda t: tokenizer.tokenize(t)[:num_embeddings-2], text_list))
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    # add special tokens
    input_ids = [tokenizer.build_inputs_with_special_tokens(x) for x in input_ids]
    input_ids = pad_sequences(input_ids, maxlen=num_embeddings, dtype="long", truncating="post", padding="post")
    return input_ids

def create_attn_masks(input_ids):
    # Create attention masks
    attention_masks = []

    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)
    return attention_masks

In [11]:
# create input id tokens
train_input_ids = tokenize_inputs(train_text_list, tokenizer, num_embeddings=250)
test_input_ids = tokenize_inputs(test_text_list, tokenizer, num_embeddings=250)

# create attention masks
train_attention_masks = create_attn_masks(train_input_ids)
test_attention_masks = create_attn_masks(test_input_ids)

array([[  549,    22,    17, ...,     0,     0,     0],
       [20936,  5144,    23, ...,     0,     0,     0],
       [   18,   440,  2845, ...,     0,     0,     0],
       ...,
       [ 2028,  2336,   865, ...,     0,     0,     0],
       [ 1548,   820,    20, ...,     0,     0,     0],
       [  820,    25,    17, ...,     0,     0,     0]])

In [15]:
# add input ids and attention masks to the dataframe
train["features"] = train_input_ids.tolist()
train["masks"] = train_attention_masks

test["features"] = test_input_ids.tolist()
test["masks"] = test_attention_masks

In [16]:
# train valid split
train, valid = train_test_split(train, test_size=0.2, random_state=42)

In [17]:
X_train = train["features"].values.tolist()
X_valid = valid["features"].values.tolist()

train_masks = train["masks"].values.tolist()
valid_masks = valid["masks"].values.tolist()

label_cols = ["negative", "neutral", "positive"]
Y_train = train[label_cols].values.tolist()
Y_valid = valid[label_cols].values.tolist()

In [18]:
# Convert input ids and attention masks into torch tensors

X_train = torch.tensor(X_train)
X_valid = torch.tensor(X_valid)

Y_train = torch.tensor(Y_train, dtype=torch.float32)
Y_valid = torch.tensor(Y_valid, dtype=torch.float32)

train_masks = torch.tensor(train_masks, dtype=torch.long)
valid_masks = torch.tensor(valid_masks, dtype=torch.long)

In [19]:
# Select a batch size for training
batch_size = 32

# Create an iterator of data with torch DataLoade
train_data = TensorDataset(X_train, train_masks, Y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,\
                              sampler=train_sampler,\
                              batch_size=batch_size)

validation_data = TensorDataset(X_valid, valid_masks, Y_valid)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data,\
                                   sampler=validation_sampler,\
                                   batch_size=batch_size)

In [20]:
def train(model, num_epochs,\
          optimizer,\
          train_dataloader, valid_dataloader,\
          model_save_path,\
          train_loss_set=[], valid_loss_set = [],\
          lowest_eval_loss=None, start_epoch=0,\
          device="cpu"
          ):
  model.to(device)

  # trange is a tqdm wrapper around the normal python range
  for i in trange(num_epochs, desc="Epoch"):
    actual_epoch = start_epoch + i

    # Training
    model.train()
    tr_loss = 0
    num_train_samples = 0

    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_input_mask, b_labels = batch
      optimizer.zero_grad()
      loss = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
      tr_loss += loss.item()
      num_train_samples += b_labels.size(0)
      loss.backward()
      optimizer.step()

    # Update tracking variables
    epoch_train_loss = tr_loss/num_train_samples
    train_loss_set.append(epoch_train_loss)

    print("Train loss: {}".format(epoch_train_loss))

    # Validation
    model.eval()

    # Tracking variables
    eval_loss = 0
    num_eval_samples = 0

    # Evaluate data for one epoch
    for batch in valid_dataloader:
      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_input_mask, b_labels = batch

      with torch.no_grad():
        loss = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        eval_loss += loss.item()
        num_eval_samples += b_labels.size(0)

    epoch_eval_loss = eval_loss/num_eval_samples
    valid_loss_set.append(epoch_eval_loss)

    print("Valid loss: {}".format(epoch_eval_loss))

    if lowest_eval_loss == None:
      lowest_eval_loss = epoch_eval_loss
      # save model
      save_model(model, model_save_path, actual_epoch,\
                 lowest_eval_loss, train_loss_set, valid_loss_set)
    else:
      if epoch_eval_loss < lowest_eval_loss:
        lowest_eval_loss = epoch_eval_loss
        # save model
        save_model(model, model_save_path, actual_epoch,\
                   lowest_eval_loss, train_loss_set, valid_loss_set)
    print("\n")

  return model, train_loss_set, valid_loss_set


def save_model(model, save_path, epochs, lowest_eval_loss, train_loss_hist, valid_loss_hist):
  model_to_save = model.module if hasattr(model, 'module') else model
  checkpoint = {'epochs': epochs, \
                'lowest_eval_loss': lowest_eval_loss,\
                'state_dict': model_to_save.state_dict(),\
                'train_loss_hist': train_loss_hist,\
                'valid_loss_hist': valid_loss_hist
               }
  torch.save(checkpoint, save_path)
  print("Saving model at epoch {} with validation loss of {}".format(epochs,\
                                     lowest_eval_loss))
  return

def load_model(save_path):
  checkpoint = torch.load(save_path)
  model_state_dict = checkpoint['state_dict']
  model = XLNetForMultiLabelSequenceClassification(num_labels=model_state_dict["classifier.weight"].size()[0])
  model.load_state_dict(model_state_dict)

  epochs = checkpoint["epochs"]
  lowest_eval_loss = checkpoint["lowest_eval_loss"]
  train_loss_hist = checkpoint["train_loss_hist"]
  valid_loss_hist = checkpoint["valid_loss_hist"]

  return model, epochs, lowest_eval_loss, train_loss_hist, valid_loss_hist

In [21]:
torch.cuda.empty_cache()

In [22]:
#config = XLNetConfig()

class XLNetForMultiLabelSequenceClassification(torch.nn.Module):

  def __init__(self, num_labels=2):
    super(XLNetForMultiLabelSequenceClassification, self).__init__()
    self.num_labels = num_labels
    self.xlnet = XLNetModel.from_pretrained('xlnet-base-cased')
    self.classifier = torch.nn.Linear(768, num_labels)

    torch.nn.init.xavier_normal_(self.classifier.weight)

  def forward(self, input_ids, token_type_ids=None,\
              attention_mask=None, labels=None):
    # last hidden layer
    last_hidden_state = self.xlnet(input_ids=input_ids,\
                                   attention_mask=attention_mask,\
                                   token_type_ids=token_type_ids)
    # pool the outputs into a mean vector
    mean_last_hidden_state = self.pool_hidden_state(last_hidden_state)
    logits = self.classifier(mean_last_hidden_state)

    if labels is not None:
      loss_fct = BCEWithLogitsLoss()
      loss = loss_fct(logits.view(-1, self.num_labels),\
                      labels.view(-1, self.num_labels))
      return loss
    else:
      return logits

  def freeze_xlnet_decoder(self):
    for param in self.xlnet.parameters():
      param.requires_grad = False

  def unfreeze_xlnet_decoder(self):
    for param in self.xlnet.parameters():
      param.requires_grad = True

  def pool_hidden_state(self, last_hidden_state):
    last_hidden_state = last_hidden_state[0]
    mean_last_hidden_state = torch.mean(last_hidden_state, 1)
    return mean_last_hidden_state

model = XLNetForMultiLabelSequenceClassification(num_labels=len(Y_train[0]))


Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01, correct_bias=False)



In [24]:
num_epochs=1

cwd = os.getcwd()
model_save_path = output_model_file = os.path.join(cwd, "drive/My Drive/Senti/xlnet_model.bin")
model, train_loss_set, valid_loss_set = train(model=model,\
                                              num_epochs=num_epochs,\
                                              optimizer=optimizer,\
                                              train_dataloader=train_dataloader,\
                                              valid_dataloader=validation_dataloader,\
                                              model_save_path=model_save_path,\
                                              device="cuda")

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Train loss: 0.014188663179095224
Valid loss: 0.008189612596305375


Epoch: 100%|██████████| 1/1 [01:19<00:00, 79.78s/it]

Saving model at epoch 0 with validation loss of 0.008189612596305375







In [25]:
def generate_predictions(model, df, num_labels, device="cpu", batch_size=3):
  num_iter = math.ceil(df.shape[0]/batch_size)

  pred_probs = np.array([]).reshape(0, num_labels)

  model.to(device)
  model.eval()

  for i in range(num_iter):
    df_subset = df.iloc[i*batch_size:(i+1)*batch_size,:]
    X = df_subset["features"].values.tolist()
    masks = df_subset["masks"].values.tolist()
    X = torch.tensor(X)
    masks = torch.tensor(masks, dtype=torch.long)
    X = X.to(device)
    masks = masks.to(device)
    with torch.no_grad():
      logits = model(input_ids=X, attention_mask=masks)
      logits = logits.sigmoid().detach().cpu().numpy()
      pred_probs = np.vstack([pred_probs, logits])

  return pred_probs

In [45]:
num_labels = len(label_cols)
pred_probs = generate_predictions(model, test, num_labels, device="cuda", batch_size=16)

AttributeError: ignored

In [27]:
label_cols = ["XLN_negative", "XLN_neutral", "XLN_positive"]

test["XLN_negative"] = pred_probs[:,0]
test["XLN_neutral"] = pred_probs[:,1]
test["XLN_positive"] = pred_probs[:,2]

In [28]:
df_xlnet = test[["Embedded_text", "XLN_negative", "XLN_neutral", "XLN_positive"]]
df_xlnet

Unnamed: 0,Embedded_text,XLN_negative,XLN_neutral,XLN_positive
0,and it hits me when i reach for you that i'm a...,0.478152,0.669167,0.033015
1,I'm seeing my french guy on friday ;) and im n...,0.061659,0.883046,0.051235
2,I'm listening to #GUCCIBANDANA ....Dont Ask Me...,0.034729,0.925783,0.036098
3,I'll never change bcuz u dont like me for who ...,0.298190,0.786546,0.044727
4,"""I wish you looked at me that way, ur beautifu...",0.082206,0.883998,0.038450
...,...,...,...,...
195,"this guy made me speechless, i dont iknow what...",0.492465,0.701316,0.032945
196,i am feeling like a superstar now...cuz pf ma ...,0.012753,0.385704,0.650253
197,aite kids i'm going HOME to my BURGER!!! dont ...,0.060768,0.654194,0.214358
198,on the bus.. im really nervous guys :( what if...,0.394349,0.750177,0.042292


## FinBert Part

In [29]:
text_array = np.array(df_xlnet)

text_list = list(text_array[:,0])

print(text_list)

["and it hits me when i reach for you that i'm afraid you won't be there. maybe i am in too deep, but i dont care...", "I'm seeing my french guy on friday ;) and im nervous. i think i am going to kiss him. if my guts dont tell me otherwise.", "I'm listening to #GUCCIBANDANA ....Dont Ask Me Why...i just Am...LOL #AYEEEE Gucci bandana Gucci Gucci Bandana ...#AYEEEEEE", "I'll never change bcuz u dont like me for who I am n u just want me to be something I'll never be. I'm me :) remember that", '"I wish you looked at me that way, ur beautiful eyes looking deep into mine, u dont even know i\'m alive. baby 2 u all i am is invisible"', "I guess i'm bitchy today but i've taken like8 off following list.  i figured you dont even know who i am and ur boring or anoy me so u out!!", "Rob oh rob,i tell you were i am but u DONT FIND ME! okay i'm gonna stalk him tomorrow! *gets out maps of london*", "SHIT ME. I'M ALREADY SPAZZING AT A PICTURE. I AM DEADDD. DEADDDDD. NO JAIL. I DONT BELONG THERE.", "RT

In [30]:
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

Downloading (…)okenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [32]:
# predict the sentiment
inputs = tokenizer(text_list, padding = True, truncation = True, return_tensors='pt')
outputs = model(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

torch.Size([200, 3])


In [34]:
#Model classes
model.config.id2label

{0: 'positive', 1: 'negative', 2: 'neutral'}

In [41]:
positive = predictions[:, 0].tolist()
negative = predictions[:, 1].tolist()
neutral = predictions[:, 2].tolist()

table = {'Text':text_list,
         "FinB_positive":positive,
         "FinB_negative":negative,
         "FinB_neutral":neutral}

df_finb = pd.DataFrame(table, columns = ["FinB_negative", "FinB_neutral", "FinB_negative"])

df_finb

Unnamed: 0,FinB_negative,FinB_neutral,FinB_negative.1
0,0.405493,0.548583,0.405493
1,0.119632,0.828465,0.119632
2,0.059345,0.911080,0.059345
3,0.044203,0.911970,0.044203
4,0.057241,0.914204,0.057241
...,...,...,...
195,0.440399,0.504413,0.440399
196,0.030660,0.905130,0.030660
197,0.155409,0.789231,0.155409
198,0.298283,0.674972,0.298283


## Sentiment Result

In [42]:
df_result = pd.concat([df_xlnet, df_finb], axis=1)
df_result

Unnamed: 0,Embedded_text,XLN_negative,XLN_neutral,XLN_positive,FinB_negative,FinB_neutral,FinB_negative.1
0,and it hits me when i reach for you that i'm a...,0.478152,0.669167,0.033015,0.405493,0.548583,0.405493
1,I'm seeing my french guy on friday ;) and im n...,0.061659,0.883046,0.051235,0.119632,0.828465,0.119632
2,I'm listening to #GUCCIBANDANA ....Dont Ask Me...,0.034729,0.925783,0.036098,0.059345,0.911080,0.059345
3,I'll never change bcuz u dont like me for who ...,0.298190,0.786546,0.044727,0.044203,0.911970,0.044203
4,"""I wish you looked at me that way, ur beautifu...",0.082206,0.883998,0.038450,0.057241,0.914204,0.057241
...,...,...,...,...,...,...,...
195,"this guy made me speechless, i dont iknow what...",0.492465,0.701316,0.032945,0.440399,0.504413,0.440399
196,i am feeling like a superstar now...cuz pf ma ...,0.012753,0.385704,0.650253,0.030660,0.905130,0.030660
197,aite kids i'm going HOME to my BURGER!!! dont ...,0.060768,0.654194,0.214358,0.155409,0.789231,0.155409
198,on the bus.. im really nervous guys :( what if...,0.394349,0.750177,0.042292,0.298283,0.674972,0.298283


In [None]:
df_result.to_csv('/content/drive/My Drive/Senti/Sentiment.csv')