In [None]:
# Sai, Ravali
# MOST of this code came straight from ChronoBERT Amy Olex who used the 
# Fine-Tuning BERT Tutorial by Chris McCormick at
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/



!pip install transformers
!pip install utils

import argparse
import tensorflow as tf
import torch
import sklearn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from sklearn.metrics import classification_report
from transformers import BertTokenizer, get_linear_schedule_with_warmup, AutoTokenizer
from transformers import AdamW, BertForSequenceClassification, AutoModelForSequenceClassification
from transformers import BertModel, BertConfig
from tqdm import tqdm, trange
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils
from math import floor
from sklearn.metrics import matthews_corrcoef
import seaborn as sns
import os
import io
from google.colab import auth
from googleapiclient.discovery import build
from io import FileIO
from googleapiclient.http import MediaIoBaseDownload
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import EarlyStoppingCallback
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier


Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 4.3 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 33.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 39.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 450 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [None]:
#  Get the files from the google drive
auth.authenticate_user()
drive_service = build('drive', 'v3')

# Get train data file
file_id = '11-ZRHPVBWMhMdBXEqjBqVPdTvaZOLLxc'  # Training file on the Google Drive
downloaded = io.FileIO("Humor_Train_Data.csv", 'w')
request = drive_service.files().get_media(fileId=file_id)
downloader = MediaIoBaseDownload(downloaded, request)
done = False
while done is False:
  status, done = downloader.next_chunk()
  print("Download {}%.".format(int(status.progress() * 100)))

# Get test data file
file_id = '140hmlr6pZhZXKTgnCkMOnMDsjGdgRX1a'  # Training file on the Google Drive
downloaded = io.FileIO("Humor_CorrectEstimation_Data.csv", 'w')
request = drive_service.files().get_media(fileId=file_id)
downloader = MediaIoBaseDownload(downloaded, request)
done = False
while done is False:
  status, done = downloader.next_chunk()
  print("Download {}%.".format(int(status.progress() * 100)))



In [None]:
# set the hyperparameters
batch_size = 32
max_length = 256
epochs = 10

#  load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

#  load model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

#  set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# Load the training and test data
train_df = pd.read_csv("Humor_Train_Data.csv", delimiter=',', header=None, names=['Humor', 'Text'], )
test_df = pd.read_csv("Humor_CorrectEstimation_Data.csv", delimiter=',', header=None, names=['Humor', 'Text'], encoding = 'unicode_escape')

# Create sentence and label lists
train_instances = train_df.Text.values
test_instances = test_df.Text.values

#  Conver the labels to numeric values
test_df.Humor.replace({"no": 0, "yes": 1}, inplace=True)
train_df.Humor.replace({"no": 0, "yes": 1}, inplace=True)

#  get the label lists
train_Humor = train_df.Humor.values
test_Humor = test_df.Humor.values

train_df.head()



Unnamed: 0,Humor,Text
0,0,"Joe biden rules out 2020 bid: 'guys, i'm not r..."
1,0,Watch: darvish gave hitter whiplash with slow ...
2,1,What do you call a turtle without its shell? d...
3,0,5 reasons the 2016 election feels so personal
4,0,"Pasco police shot mexican migrant from behind,..."


In [None]:
test_df.head()


Unnamed: 0,Humor,Text
0,1,What kind of cat should you take into the des...
1,1,Remember when people used to have to be in sha...
2,1,Pizza is always good. - everyone we'll see abo...
3,1,"What's 6 inches long hard, bent, and in my pan..."
4,0,Black teen's response to violence in his commu...


In [None]:
#  Set training data
print("Number of training instances: " + str(len(train_instances)))
    
# Get maximum length of sentences
max_length = 0
# For every sentence...
for sent in train_instances:
  # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
  input_ids = tokenizer.encode(sent, add_special_tokens=True)
  # Update the maximum sentence length.
  max_length = max(max_length, len(input_ids))

train_inputs = [] #index_tensor
train_masks = [] #attention tensor
train_text = [] #tokenized text

# For every sentence...
for sent in train_instances:
  encoded_dict = tokenizer.encode_plus(
       sent,  # Sentence to encode.
       add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
       max_length=max_length,  # Pad & truncate all sentences.
       pad_to_max_length=True,
       return_attention_mask=True,  # Construct attn. masks.
       return_tensors='pt',  # Return pytorch tensors.
       )

  # Add the encoded sentence to the list.
  train_inputs.append(encoded_dict['input_ids'])
  train_masks.append(encoded_dict['attention_mask'])
  train_text.append(tokenizer.tokenize(tokenizer.decode(encoded_dict['input_ids'].tolist()[0])))


#  Convert to tensors
train_inputs = torch.stack(train_inputs).squeeze()
train_Humor = torch.tensor(train_Humor)
train_masks = torch.stack(train_masks).squeeze()

#checking on my shapes
#print(train_inputs.shape)
#print(train_labels.shape)
#print(train_masks.shape)

train_data = TensorDataset(train_inputs, train_masks, train_Humor)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=16)





Number of training instances: 634


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
#  Set testing data
print("Number of testing instances: " + str(len(test_instances)))
    
# Get maximum length of sentences
max_length = 0
# For every sentence...
for sent in test_instances:
  # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
  input_ids = tokenizer.encode(sent, add_special_tokens=True)
  # Update the maximum sentence length.
  max_length = max(max_length, len(input_ids))

test_inputs = [] #index_tensor
test_masks = [] #attention tensor
test_text = [] #tokenized text

# For every sentence...
for sent in test_instances:
  encoded_dict = tokenizer.encode_plus(
       sent,  # Sentence to encode.
       add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
       max_length=max_length,  # Pad & truncate all sentences.
       pad_to_max_length=True,
       return_attention_mask=True,  # Construct attn. masks.
       return_tensors='pt',  # Return pytorch tensors.
       )

  # Add the encoded sentence to the list.
  test_inputs.append(encoded_dict['input_ids'])
  test_masks.append(encoded_dict['attention_mask'])
  test_text.append(tokenizer.tokenize(tokenizer.decode(encoded_dict['input_ids'].tolist()[0])))

  #  Convert to tensors
test_inputs = torch.stack(test_inputs).squeeze()
test_Humor = torch.tensor(test_Humor)
test_masks = torch.stack(test_masks).squeeze()

test_data = TensorDataset(test_inputs, test_masks, test_Humor)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=16)


Number of testing instances: 38757




In [None]:
#  Train the model

# set parameters
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
num_training_steps = len(train_dataloader) * epochs
num_warmup_steps = 0
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps,
                                                num_training_steps=num_training_steps)

train_loss_set = []
training_stats = []

# trange is a_bert tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  global model
  print("Epoch: ", _)
  print("run training")

  #train the model
  model.train()
       
  # Total loss for this epoch.
  tl_set = []
  total_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    print("Step: " + str(step))
    # # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    
    # Forward pass
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    loss, logits = outputs[:2]

    # Accumulate the training loss over all of the batches so that we can
    # calculate the average loss at the end. `loss` is a_bert Tensor containing a_bert
    # single value; the `.item()` function just returns the Python value
    # from the tensor.
    tl_set.append(loss.item())

    # Backward pass
    loss.backward()
    
    # Update parameters and take a_bert step using the computed gradient
    optimizer.step()
    scheduler.step()

    # Update tracking variables
    total_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

    avg_train_loss = total_loss / nb_tr_steps
    print("Length: " + str(len(tl_set)))
    print("Average total train loss: {}".format(total_loss / nb_tr_steps))
    print("Total Loss for this epoch: " + str(total_loss))
    print("Number of steps for this epoch: " + str(nb_tr_steps))
        
  print("Training complete!")

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch:  0
run training
Step: 0
Length: 1
Average total train loss: 0.6403961181640625
Total Loss for this epoch: 0.6403961181640625
Number of steps for this epoch: 1
Step: 1
Length: 2
Average total train loss: 0.6720498502254486
Total Loss for this epoch: 1.3440997004508972
Number of steps for this epoch: 2
Step: 2
Length: 3
Average total train loss: 0.6815834641456604
Total Loss for this epoch: 2.044750392436981
Number of steps for this epoch: 3
Step: 3
Length: 4
Average total train loss: 0.6974967569112778
Total Loss for this epoch: 2.789987027645111
Number of steps for this epoch: 4
Step: 4
Length: 5
Average total train loss: 0.6864918947219849
Total Loss for this epoch: 3.4324594736099243
Number of steps for this epoch: 5
Step: 5
Length: 6
Average total train loss: 0.6997791826725006
Total Loss for this epoch: 4.198675096035004
Number of steps for this epoch: 6
Step: 6
Length: 7
Average total train loss: 0.6948278631482806
Total Loss for this epoch: 4.863795042037964
Number of step

Epoch:  50%|█████     | 1/2 [00:10<00:10, 10.42s/it]

Length: 40
Average total train loss: 0.4499012820422649
Total Loss for this epoch: 17.996051281690598
Number of steps for this epoch: 40
Training complete!
Epoch:  1
run training
Step: 0
Length: 1
Average total train loss: 0.23664115369319916
Total Loss for this epoch: 0.23664115369319916
Number of steps for this epoch: 1
Step: 1
Length: 2
Average total train loss: 0.3089156821370125
Total Loss for this epoch: 0.617831364274025
Number of steps for this epoch: 2
Step: 2
Length: 3
Average total train loss: 0.24768579006195068
Total Loss for this epoch: 0.743057370185852
Number of steps for this epoch: 3
Step: 3
Length: 4
Average total train loss: 0.2261471301317215
Total Loss for this epoch: 0.904588520526886
Number of steps for this epoch: 4
Step: 4
Length: 5
Average total train loss: 0.2827116012573242
Total Loss for this epoch: 1.413558006286621
Number of steps for this epoch: 5
Step: 5
Length: 6
Average total train loss: 0.28164132436116535
Total Loss for this epoch: 1.68984794616699

Epoch: 100%|██████████| 2/2 [00:20<00:00, 10.38s/it]

Length: 40
Average total train loss: 0.19584701769053936
Total Loss for this epoch: 7.833880707621574
Number of steps for this epoch: 40
Training complete!





In [None]:
#  Inference over the test data and print out the accuracy of the model

# Tracking variables
total_eval_accuracy = 0
nb_eval_steps, nb_eval_examples = 0, 0
predictions, true_labels = [], []

# Evaluate data for one epoch
for batch in test_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  
  # Telling the model_bert not to compute or store gradients, saving memory and speeding up validation
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      logits = outputs.logits
    
  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  #  flatten out our predictions and labels to calculate the accuracy
  pred_flat = np.argmax(logits, axis=1).flatten()
  labels_flat = label_ids.flatten()

  # b_input_ids - actual sentence 16 labels_flat 16 pred_flat 16

  # text, actual_label, pred_label - map
  #  calculate accuracy
  total_eval_accuracy += np.sum(pred_flat == labels_flat) / len(labels_flat)
  nb_eval_steps += 1

  # get predicitons to list
  predict_content = logits.argmax(axis=-1).flatten().tolist()

  # Store predictions and true labels
  predictions.append(predict_content)
  true_labels.append(label_ids)

print("Testing complete!")
print("Accuracy over the test set: {}".format(total_eval_accuracy / nb_eval_steps))

Testing complete!
Accuracy over the test set: 0.9268933140734628


 **Resizing the Test Labels**

In [None]:
test_label = []
for i in true_labels:
    for j in i:
        test_label.append(j)
 
test_label = np.array(test_label)
test_label

array([1, 0, 1, ..., 0, 0, 1])

**Resizing the predicted labels**

In [None]:
pred = []
for i in predictions:
    for j in i:
        pred.append(j)
 
pred = np.array(pred)
pred

array([1, 0, 1, ..., 1, 0, 1])

**Confusion_Matrix for Accuracy,precision,recall,f1 score**

In [None]:
from sklearn.metrics import confusion_matrix
Bert_Based = confusion_matrix(test_label, pred)
Bert_Based

array([[17649,  1624],
       [ 1208, 18276]])

In [None]:
TP1,TN1,FP1,FN1 = Bert_Based[1][1],Bert_Based[0][0],Bert_Based[0][1],Bert_Based[1][0]

In [None]:
Accuracy = (TP1+TN1)/(TP1+FP1+FN1+TN1)
Accuracy 

0.9269293288954253

In [None]:
Precision = TP1/(TP1+FP1)
Precision

0.918391959798995

In [None]:
Recall = TP1/(TP1+FN1)
Recall

0.9380004105933073

In [None]:
F1 = 2*(Recall * Precision) / (Recall + Precision)
F1

0.9280926264472883

In [None]:
target_names = ['class 0', 'class 1']
print(classification_report(test_label, pred, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.94      0.92      0.93     19273
     class 1       0.92      0.94      0.93     19484

    accuracy                           0.93     38757
   macro avg       0.93      0.93      0.93     38757
weighted avg       0.93      0.93      0.93     38757



**Gaussian NB Based Approach for classification**

In [None]:
GNB = GaussianNB()
gc = GNB.fit(train_inputs, train_Humor)

In [None]:
t_input = test_inputs.resize(77514,31)



In [None]:
predict = gc.predict(t_input)

In [None]:
p = np.resize(predict,(38757))

In [None]:
cm = confusion_matrix(test_Humor, p)
cm

array([[18314,   959],
       [18544,   940]])

In [None]:
TP,TN,FP,FN = cm[1][1],cm[0][0],cm[0][1],cm[1][0]

In [None]:
Accuracy = (TP+TN)/(TP+FP+FN+TN)
Accuracy 

0.49678767706478827

In [None]:
Precision = TP/(TP+FP)
Precision

0.49499736703528174

In [None]:
Recall = TP/(TP+FN)
Recall

0.04824471361116814

In [None]:
F1 = 2*(Recall * Precision) / (Recall + Precision)
F1

0.08792031052705421

**Random Forest Classifier**

In [None]:
clf=RandomForestClassifier(n_estimators=100)

In [None]:
#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(train_inputs, train_Humor)

RandomForestClassifier()

In [None]:
y_pred=clf.predict(t_input)

In [None]:
y_pred = np.resize(y_pred,(38757))

In [None]:
rm = confusion_matrix(test_Humor,y_pred)
rm

array([[14316,  4957],
       [14618,  4866]])

In [None]:
TP2,TN2,FP2,FN2 = rm[1][1],rm[0][0],rm[0][1],rm[1][0]

In [None]:
Accuracy_random_forest = (TP2+TN2)/(TP2+FP2+FN2+TN2)
Accuracy_random_forest

0.4949299481384008

In [None]:
Precision_rf = TP2/(TP2+FP2)
Precision_rf

0.4953680138450575

In [None]:
Recall_rf = TP2/(TP2+FN2)
Recall_rf

0.2497433791829193

In [None]:
F1_rf = 2*(Recall_rf * Precision_rf) / (Recall_rf + Precision_rf)
F1_rf

0.33207083631896817