# Section 0: Import section

In [2]:
# helper functions
from helper_func_and_classes import TwitterDataset_BERT, BERTClassifier
from helper_func_and_classes import create_data_loader_BERT
from helper_func_and_classes import split_dataset
from helper_func_and_classes import create_dataset_list
from helper_func_and_classes import output_numpy_array_from_model_training
from helper_func_and_classes import train_one_epoch, evaluate_model
from helper_func_and_classes import create_predictions_BERT
from helper_func_and_classes import create_submission_file


# transformers
import transformers
from transformers import BertModel, BertTokenizer
from transformers import logging, AdamW, get_linear_schedule_with_warmup

# pytorch
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader


# data science
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# scikit learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

#other
import random
import warnings
from tqdm import tqdm



# Seed every random number generator used in this notebook with the same
# value so repeated runs start from the same state.
RANDOM_SEED = 123
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(RANDOM_SEED)

In [11]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Run configuration: every tunable value of the notebook lives in this cell.

# -- data selection / splitting
DATA_FULL = True          # True: train_*_full.txt files, False: the smaller train_*.txt files
test_size = 0.1           # forwarded to train_test_split(test_size=...)

# -- tokenisation / batching (both forwarded to create_data_loader_BERT)
max_length = 37
batch_size = 128

# -- classifier (forwarded to BERTClassifier)
num_classes = 2           # labels used in this notebook are 0 (negative) and 1 (positive)
dropout_prob = 0.35       # passed as the `p` argument

# -- optimisation
num_epoch = 4             # number of passes of the training loop
learning_rate = 2.5e-5    # `lr` of the AdamW optimizer
correct_bias_val = False  # `correct_bias` of the AdamW optimizer


# Section 1: Data preprocessing section
## Section 1.1: Creating lists of sentences

In [12]:
# Read the labelled tweets of the corpus selected by DATA_FULL.
# Positive tweets get label 1, negative tweets get label 0; the combined
# lists keep all positives first, followed by all negatives.
if not DATA_FULL:
    pos_data_lite = create_dataset_list("./twitter-datasets/train_pos.txt")
    pos_labels_lite = len(pos_data_lite) * [1]

    neg_data_lite = create_dataset_list("./twitter-datasets/train_neg.txt")
    neg_labels_lite = len(neg_data_lite) * [0]

    all_data_lite = pos_data_lite + neg_data_lite
    all_labels_lite = pos_labels_lite + neg_labels_lite

    print("Length of all_data_lite: ", len(all_data_lite))
    print("Length of all_labels_lite: ", len(all_labels_lite), "\n")
else:
    pos_data_full = create_dataset_list("./twitter-datasets/train_pos_full.txt")
    pos_labels_full = len(pos_data_full) * [1]

    neg_data_full = create_dataset_list("./twitter-datasets/train_neg_full.txt")
    neg_labels_full = len(neg_data_full) * [0]

    all_data_full = pos_data_full + neg_data_full
    all_labels_full = pos_labels_full + neg_labels_full

    print("Length of all_data_full: ", len(all_data_full))
    print("Length of all_labels_full: ", len(all_labels_full), "\n")

# The unlabelled tweets to predict on are needed for either corpus choice.
submission_data = create_dataset_list("./twitter-datasets/test_data.txt")

print("Length of submission_data: ",len(submission_data))

Length of all_data_lite:  200000
Length of all_labels_lite:  200000 

Length of all_data_full:  2500000
Length of all_labels_full:  2500000 

Length of submission_data:  10000


# Section 2: Binary classifier

In [14]:
# Checkpoint shared by the tokenizer and the classifier.
# (Set this to "bert-base-cased" to use the cased variant instead.)
PRETRAINED_MODEL_BERT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_BERT)

# From here on, only show error-level messages from transformers and
# hide FutureWarning notices.
logging.set_verbosity_error()
warnings.simplefilter("ignore", FutureWarning)

In [15]:
# Hold out `test_size` of the selected corpus for evaluation; the split is
# seeded with RANDOM_SEED so it is the same on every run.
if DATA_FULL:
    (train_samples_full, test_samples_full,
     train_labels_full, test_labels_full) = train_test_split(
        all_data_full,
        all_labels_full,
        random_state=RANDOM_SEED,
        test_size=test_size)
else:
    (train_samples_lite, test_samples_lite,
     train_labels_lite, test_labels_lite) = train_test_split(
        all_data_lite,
        all_labels_lite,
        random_state=RANDOM_SEED,
        test_size=test_size)

In [16]:
# Wrap the train / held-out splits of the selected corpus in data loaders.
# Every loader is built with the same tokenizer, max_length and batch_size.
if DATA_FULL:
    train_loader_full = create_data_loader_BERT(
        train_samples_full, train_labels_full,
        tokenizer, max_length, batch_size)

    test_loader_full = create_data_loader_BERT(
        test_samples_full, test_labels_full,
        tokenizer, max_length, batch_size)
else:
    train_loader_lite = create_data_loader_BERT(
        train_samples_lite, train_labels_lite,
        tokenizer, max_length, batch_size)

    test_loader_lite = create_data_loader_BERT(
        test_samples_lite, test_labels_lite,
        tokenizer, max_length, batch_size)

In [17]:
# Loads the bare pretrained BERT encoder for PRETRAINED_MODEL_BERT.
# return_dict=False was added "to remove error" (original author's note); the
# error itself is not shown in this notebook.
# NOTE(review): model_bert is not referenced by any other cell visible here --
# BERTClassifier is constructed from the checkpoint *name*, not from this
# object. Confirm whether this (memory-heavy) load is still needed.
model_bert = BertModel.from_pretrained(PRETRAINED_MODEL_BERT, return_dict=False)

In [18]:
# Build the classifier for the chosen checkpoint and move it onto `device`
# in one expression.
model = BERTClassifier(
    pretrained_model_name=PRETRAINED_MODEL_BERT,
    num_classes=num_classes,
    p=dropout_prob,
).to(device)

In [19]:
# Length of the whole training run as seen by the scheduler:
# (number of batches in the active training loader) x (number of epochs).
batches_per_epoch = len(train_loader_full) if DATA_FULL else len(train_loader_lite)
total_steps = batches_per_epoch * num_epoch

# Warmup length. The notebook originally hard-coded 30 000 warmup steps, which
# is more than the *total* number of steps on the lite dataset; in that case
# the linear schedule never leaves the warmup ramp, so the learning rate never
# reaches `learning_rate` and never decays. Capping the warmup at half of the
# run keeps 30 000 whenever the run is at least 60 000 steps long (the case for
# the full dataset with the settings in the parameter cell) and gives shorter
# runs a real warmup + decay schedule.
warmup_steps = min(30000, total_steps // 2)

# NOTE(review): this is the AdamW implementation imported from `transformers`
# (it accepts `correct_bias`); it is kept as-is so the optimisation behaviour
# does not change. Check its deprecation status for the installed version.
optimizer = AdamW(model.parameters(), correct_bias=correct_bias_val, lr=learning_rate)

# Learning rate rises linearly during `warmup_steps`, then decays linearly
# until `total_steps`.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=warmup_steps,
  num_training_steps=total_steps
)


loss_func = nn.CrossEntropyLoss()

In [None]:
# Per-epoch metric histories, filled in by the training loop (kept for plotting).
train_loss_list, test_loss_list = [], []
train_accuracy_list, test_accuracy_list = [], []

In [None]:
%%time
# Train for `num_epoch` epochs and evaluate on the held-out split after each one.
# The lite and full runs differ only in which loaders / sample lists they use,
# so those are selected once here instead of duplicating the whole loop.
if DATA_FULL:
    active_train_loader, active_test_loader = train_loader_full, test_loader_full
    n_train_samples, n_test_samples = len(train_samples_full), len(test_samples_full)
else:
    active_train_loader, active_test_loader = train_loader_lite, test_loader_lite
    n_train_samples, n_test_samples = len(train_samples_lite), len(test_samples_lite)

for epoch in range(num_epoch):
    print("Epoch: ", int(epoch+1))

    # One pass over the training data; the helper receives the optimizer and
    # the scheduler together with the number of training samples.
    train_accuracy, loss = train_one_epoch(
        model,
        active_train_loader,
        loss_func,
        optimizer,
        device,
        scheduler,
        n_train_samples
      )
    print(f'Train loss {loss} accuracy {train_accuracy}')

    # Evaluate on the held-out split.
    test_accuracy, test_loss = evaluate_model(
        model,
        active_test_loader,
        device,
        loss_func,
        n_test_samples
      )
    print(f'Test loss {test_loss} accuracy {test_accuracy}\n')

    # appending data to lists
    train_loss_list.append(loss)
    test_loss_list.append(test_loss)
    train_accuracy_list.append(train_accuracy)
    test_accuracy_list.append(test_accuracy)



# Section 3: Creating submission

In [None]:
# submission data loader
# One id per submission tweet, numbered from 1. The count is derived from the
# data itself (instead of a hard-coded 10000) so ids and tweets always have
# the same length. The ids are passed in the argument slot that carries the
# labels for the training loaders.
submission_ids = list(range(1, len(submission_data) + 1))

submission_loader = create_data_loader_BERT(
    submission_data,
    submission_ids,
    tokenizer,
    max_length,
    batch_size)

# Run the trained model over the submission loader and show the result.
numpy_predictions = create_predictions_BERT(model, submission_loader, device)
numpy_predictions

In [None]:
# Hand the predictions to the project helper from helper_func_and_classes.
# NOTE(review): presumably this writes the submission file to disk -- the
# output path and format are defined in the helper, not in this notebook.
create_submission_file(numpy_predictions)