In [None]:
#!/usr/bin/env python
# coding: utf-8

# In[14]:


# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, XLMRobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)
import matplotlib
from tqdm import tqdm, trange
matplotlib.use('AGG')
import matplotlib.pyplot as plt
import transformers
import torch
import demoji
import os
from datetime import datetime

# In[2]:


# Setting up the device for GPU usage

from torch import cuda

# Run on the GPU whenever CUDA reports one as available; fall back to the CPU otherwise.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


# In[3]:

DN = 'Senti'  # name of the dataset configuration to run
# Hyperparameters, tokenizer and data for the selected configuration.
# Everything defined in the branch below is read later in the script, so an
# unknown DN is rejected here instead of surfacing later as a NameError.

if DN == 'Senti':
    MAX_LEN = 128            # handed to the tokenizer as max_length
    TRAIN_BATCH_SIZE = 64
    VALID_BATCH_SIZE = 32
    EPOCHS = 15
    LEARNING_RATE = 1e-05
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
    FRAC = 0.001             # fraction of the rows sampled into new_df
    NUM_LABELS = 2           # output width of the classification layer

    # NOTE(review): unpickling can execute arbitrary code; only load this file
    # when it comes from a trusted source.
    data = pd.read_pickle('../Data/SemEval2017_data')
    print(data.head())
    # Keep only the text/label columns under the names SentimentData expects.
    data_1 = data[['text', 'label']].rename(
        columns={'text': 'Phrase', 'label': 'Sentiment'}
    )
    # Seeded subsample so repeated runs see the same rows.
    new_df = data_1.sample(frac=FRAC, replace=False, random_state=1)
else:
    raise ValueError(
        "Unsupported dataset configuration DN={!r}; only 'Senti' is defined".format(DN)
    )




class SentimentData(Dataset):
    """Map-style dataset that tokenizes ``Phrase`` and pairs it with ``Sentiment``.

    Args:
        dataframe: frame exposing ``Phrase`` (text) and ``Sentiment`` (label) columns.
        tokenizer: object whose ``encode_plus`` returns ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        max_len: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Phrase
        self.targets = self.data.Sentiment
        self.max_len = max_len

    def __len__(self):
        """Return the number of phrases in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize the row at position ``index`` and return it as tensors.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors) and
            ``targets`` (float tensor holding the row's label).
        """
        # Samplers hand out positions 0..len-1, not index labels, so rows are
        # fetched positionally; this keeps the dataset correct for frames whose
        # index was not reset.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        # NOTE(review): `pad_to_max_length` is the legacy spelling of
        # `padding='max_length'`; recent transformers releases warn about it.
        # Confirm against the pinned transformers version before switching.
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }


# In[17]:


# Seeded split: a train_size fraction of new_df is sampled for training and the
# rows whose index labels were not sampled become the test set.
train_size = 0.8
train_data = new_df.sample(frac=train_size, random_state=200)
test_data = new_df.drop(index=train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

# Wrap both splits with the shared tokenizer and maximum sequence length.
training_set, testing_set = (
    SentimentData(split_frame, tokenizer, MAX_LEN)
    for split_frame in (train_data, test_data)
)


# In[18]:


# Training batches are reshuffled on every pass over the loader.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation keeps a fixed order: shuffling changes which samples share a batch,
# which makes the step-level logs and the batch-averaged loss vary between runs.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
# In[20]:



class RobertaClass(torch.nn.Module):
    """Pretrained ``xlm-roberta-large`` encoder followed by a small classification head.

    The head is Linear(1024 -> 1024) -> ReLU -> Dropout(0.5) -> Linear(1024 -> NUM_LABELS),
    applied to the hidden state of the first token of each sequence.

    ``forward`` returns raw logits. ``torch.nn.CrossEntropyLoss`` applies
    log-softmax itself, so feeding it softmax probabilities normalises twice and
    flattens the gradients. Use ``self.act`` on the logits when probabilities are
    needed; the arg-max class is the same either way.

    NOTE(review): the checkpoint is an XLM-R one but it is loaded through
    ``RobertaModel``; confirm whether ``XLMRobertaModel`` should be used instead.
    """

    def __init__(self):
        super().__init__()
        self.l1 = RobertaModel.from_pretrained("xlm-roberta-large")
        self.pre_classifier = torch.nn.Linear(1024, 1024)
        self.dropout = torch.nn.Dropout(0.5)
        self.classifier = torch.nn.Linear(1024, NUM_LABELS)
        # Kept for callers that want probabilities; not applied inside forward().
        self.act = torch.nn.Softmax(1)
        # Creation timestamp, used by the script to name the output directory.
        self.id = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return class logits for a batch.

        Args:
            input_ids: token ids forwarded to the encoder.
            attention_mask: attention mask forwarded to the encoder.
            token_type_ids: segment ids forwarded to the encoder (optional).

        Returns:
            Tensor of unnormalised scores, one row per sequence and one column
            per label.
        """
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        # Hidden state of the first token stands in for the whole sequence.
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.relu(pooler)
        pooler = self.dropout(pooler)
        logits = self.classifier(pooler)
        return logits


# Build the classifier and move its parameters to the selected device.
model = RobertaClass()
model.to(device)

# Per-run output directory, named after the model timestamp, dataset and sample fraction.
dr = '../Model/{}_{}_{}/'.format(model.id,DN,FRAC)
# exist_ok avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(dr, exist_ok=True)

# In[23]:


# Cross-entropy objective, minimised with Adam over every model parameter.
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)


# In[24]:


def calcuate_accuracy(preds, targets):
    """Count the positions where ``preds`` equals ``targets``.

    Despite the name, the result is the raw number of matches, not a ratio;
    callers divide by the number of examples themselves.
    """
    matches = preds == targets
    hit_count = matches.sum()
    return hit_count.item()

# Empty containers for per-run training loss, validation loss and validation accuracy.
train_loss_set, val_loss_set, val_flat_accuracy_set = [], [], []


# In[ ]:


# Training routine: one pass over the training split (80% of the sampled data)

def train(epoch):
    """Run one optimisation pass over ``training_loader`` and log the metrics.

    Uses the module-level ``model``, ``optimizer``, ``loss_function``, ``device``
    and ``dr``. Running loss/accuracy are reported on step 0 and every 5000th
    step after that; epoch totals are printed and appended to ``dr + 'file.txt'``.

    Args:
        epoch: epoch number, used only in the log messages.

    Returns:
        None.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    # Wrapping the loader (not the enumerate object) lets tqdm show the batch total.
    for step, data in enumerate(tqdm(training_loader)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        # Labels are stored as floats by the dataset; the loss needs class indices.
        targets = data['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()

        # Predicted class per row; detached because it only feeds the metric.
        big_idx = outputs.detach().argmax(dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if step % 5000 == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")
            with open(dr + 'file.txt', 'a') as f:
                print(f"Training Loss per 5000 steps: {loss_step}", file=f)
                print(f"Training Accuracy per 5000 steps: {accu_step}", file=f)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f'The Total Accuracy for Epoch {epoch}: {epoch_accu}')
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")
    # The log file gets the same percentage that is printed; the total-accuracy
    # line used to be written as a 0-1 fraction, unlike every other accuracy line.
    with open(dr + 'file.txt', 'a') as f:
        print(f'The Total Accuracy for Epoch {epoch}: {epoch_accu}', file=f)
        print(f"Training Loss Epoch: {epoch_loss}", file=f)
        print(f"Training Accuracy Epoch: {epoch_accu}", file=f)

    return


# In[ ]:


# One call to train() per epoch, with a progress bar over the epochs.
for epoch in tqdm(range(EPOCHS)):
    train(epoch)


# In[25]:


def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` without updating any weights.

    Uses the module-level ``loss_function``, ``device`` and ``dr``. The averaged
    loss and accuracy are printed and appended to ``dr + 'file.txt'``.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)``.
        testing_loader: iterable of batches with ``ids``, ``mask``,
            ``token_type_ids`` and ``targets`` entries.

    Returns:
        Accuracy over all evaluated examples, in percent.
    """
    model.eval()
    n_correct = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    with torch.no_grad():
        # Wrapping the loader (not the enumerate object) lets tqdm show the batch total.
        for step, data in enumerate(tqdm(testing_loader)):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)

            # No .squeeze() here: on a batch holding a single sample it would
            # drop the batch dimension and break the dim=1 reduction and the loss.
            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_idx = outputs.argmax(dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            if step % 5000 == 0:
                loss_step = tr_loss / nb_tr_steps
                accu_step = (n_correct * 100) / nb_tr_examples
                # Message now matches the actual 5000-step reporting interval.
                print(f"Validation Loss per 5000 steps: {loss_step}")
                print(f"Validation Accuracy per 5000 steps: {accu_step}")

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    with open(dr + 'file.txt', 'a') as f:
        print(f"Validation Loss Epoch: {epoch_loss}", file=f)
        print(f"Validation Accuracy Epoch: {epoch_accu}", file=f)

    return epoch_accu


# In[ ]:


# Score the trained model on the held-out split, then report the result on the
# console and append the same line to the run's log file.
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)
with open(dr + 'file.txt', mode='a') as f:
    f.write("Accuracy on test data = %0.2f%%" % acc + "\n")

# In[ ]:


# Serialise the whole model object (not just its state dict) into the run directory.
output_model_file = "".join((dr, 'pytorch_roberta_sentiment.bin'))
#output_vocab_file = dr + 'output_vocab_file/'

model_to_save = model
torch.save(obj=model_to_save, f=output_model_file)
#tokenizer.save_vocabulary(output_vocab_file)


