# SET FOLD FOR CV-FOLDS

In [1]:
FOLD = 0

# Imports

In [2]:
import torch
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math 
import string
import re
from sklearn.pipeline import Pipeline
from nltk.tokenize import wordpunct_tokenize
import nltk
from transformers import AutoTokenizer
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


# Initial reviews processing

In [3]:
lines = list()

with open("train.txt", "r") as fp:
    lines = fp.readlines()

In [4]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

## Separate reviews and target, white space segmentation and nltk tokenization

In [5]:
y_train = [x.lower().split('\t')[0] for x in lines]
x_train = [''.join(x.lower().split('\t')[1:]) for x in lines]
x_train = [wordpunct_tokenize(x) for x in x_train]

In [6]:
labels = set(y_train)

# Pre-trained Bert import, re-structuring data for package classes and functions

## Import model

In [7]:
model_checkpoint = "bert-base-uncased"
batch_size = 8

In [8]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

model = AutoModelForMultipleChoice.from_pretrained(model_checkpoint)

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Get possible labels

In [9]:
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

{'deceptivenegative',
 'deceptivepositive',
 'truthfulnegative',
 'truthfulpositive'}

In [10]:
id2label

{0: 'truthfulpositive',
 1: 'truthfulnegative',
 2: 'deceptivepositive',
 3: 'deceptivenegative'}

## Handle data format and CV splits

In [11]:
df_all = pd.DataFrame([' '.join(x) for x in x_train], columns=['text'])

In [12]:
yt = pd.DataFrame(y_train)
yt_np = np.array(y_train)

idx = np.random.permutation(df_all.index)
df_all = df_all.reindex(idx)
y_train = yt.reindex(idx)

df_train = df_all.iloc[:1000]
df_val = df_all.iloc[1000:]

y_val = y_train.iloc[1000:]
y_train = y_train.iloc[:1000]


df_train.head()

def divide_chunks(l, n): 
      
    # looping till length l 
    for i in range(0, len(l), n):  
        yield l[i:i + n] 
  

In [13]:
#df_all.iloc[[1,2,3:7]]

In [14]:
idx = np.random.RandomState(seed=42).permutation(df_all.index)
df_all = df_all.reindex(idx)
y_train = yt.reindex(idx)

x_trains = list()
y_trains = list()

N = 5

for var in range(N):
    len_fold = 1400/N
    x_trains.append((df_all.iloc[var*int(len_fold):(var+1)*int(len_fold)], df_all.iloc[np.r_[0:int(var*len_fold) , int((var+1)*len_fold):1400]] ))
    y_trains.append((y_train.iloc[var*int(len_fold):(var+1)*int(len_fold)], y_train.iloc[np.r_[0:int(var*len_fold), int((var+1)*len_fold):1400]] ))

#x_trains = list(divide_chunks(df_all, 280))
#y_trains = list(divide_chunks(y_train, 280))

In [None]:
#y_trains[3]

df_aux = df_train.copy()
curr_size = 1400
for el in df_aux.iterrows():
    print(el[0])
    length = 128
    while length + 256 < len(el[1]['text'].split(' ')):
        tex = el[1]['text'].split(' ')[length:length+256]
        #print(el[1]['text'])
        #print('len', len(el[1]['text'].split(' ')))
        #print(df_train.loc[el[0], 'text'])
        df_train.loc[el[0], 'text'] = ' '.join(el[1]['text'].split(' ')[:256])
        #print(df_train.loc[el[0], 'text'])
        new_df = pd.DataFrame(pd.Series({'text': ' '.join(tex)}, name = curr_size))
        #print(new_df.transpose())
        df_train = pd.concat([df_train, new_df.transpose()], ignore_index=False)
        new_df = pd.DataFrame(pd.Series(y_train.loc[el[0]], name = curr_size))
        print(new_df)#.transpose())
        y_train = pd.concat([y_train, new_df.transpose()], ignore_index=False)
        curr_size += 1
        length += 128
    

In [None]:
#y_trains[0][0]

### Get the splits and format for tokenization and padding

In [None]:
df_ = x_trains[FOLD] 
y_ = y_trains[FOLD]

df_train = df_[1]
df_val = df_[0]
y_train = y_[1]
y_val = y_[0]
print(len(df_train))
print(len(df_val))
print(len(y_train))
print(len(y_val))

_dct_train = df_train.to_dict()
_dct_train['text'] = list(_dct_train['text'].values())
_dct_train['label'] = y_train.apply(lambda x: label2id[x[0]], axis=1)

_dataset_train =  Dataset.from_dict(_dct_train)

_dct_val = df_val.to_dict()
_dct_val['text'] = list(_dct_val['text'].values())
_dct_val['label'] = y_val.apply(lambda x: label2id[x[0]], axis=1)


_dataset_val =  Dataset.from_dict(_dct_val)

_dataset_dct = dict()
_dataset_dct['train'] = _dataset_train
_dataset_dct['val'] = _dataset_val

### Tokenization, padding and further formatting

In [None]:
#_dataset_train[2]

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=256)

tokenized_train = _dataset_train.map(preprocess_function, batched=True)
tokenized_val = _dataset_val.map(preprocess_function, batched=True)


#tokenized_val[0]

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='max_length', max_length=256)

# Bert fine-tuning on reviews for classification

In [17]:
trainers = list()
models = list()


#!pip install evaluate

import evaluate

accuracy = evaluate.load("accuracy")

import numpy as np


def compute_metrics(eval_pred):
    predictions, labels_ = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels_)

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=4, id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.05,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='eval_accuracy'
    #push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.model

trainer.evaluate()


1120
280
1120
280


Map: 100%|█████████████████████████████████████████████████████████████████| 1120/1120 [00:00<00:00, 3988.36 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 280/280 [00:00<00:00, 10494.29 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.641264,0.682143
2,No log,0.607631,0.767857
3,No log,0.604982,0.825
4,0.428900,0.72616,0.821429
5,0.428900,1.038088,0.803571
6,0.428900,0.993479,0.807143
7,0.428900,1.087021,0.810714
8,0.029200,1.041316,0.814286
9,0.029200,1.024258,0.817857
10,0.029200,1.000142,0.821429


In [18]:
error

NameError: name 'error' is not defined

# Saving model and getting test results

In [19]:
trainer.save_model(f"./bert_005_{FOLD}")

In [20]:
#model = trainer.model

In [21]:
text = "Bad. Super bad. Very bad hotel. worst hotel ever. I have everyone"

In [22]:
test_set = list()

with open("test_just_reviews.txt", "r") as fp:
    test_set = fp.readlines()

In [23]:
test_set = [' '.join(x.split(' ')[:512]) for x in test_set]

In [24]:
tokenizer = AutoTokenizer.from_pretrained(f"./bert_005_{FOLD}")

tokenizer(text, return_tensors="pt").to(device="cuda:0")

{'input_ids': tensor([[ 101, 2919, 1012, 3565, 2919, 1012, 2200, 2919, 3309, 1012, 5409, 3309,
         2412, 1012, 1045, 2031, 3071,  102]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}

In [25]:
tokenizer = AutoTokenizer.from_pretrained(f"./bert_005_{FOLD}")
tokenized_test = list()

for rev in test_set:
    inputs = tokenizer(rev, return_tensors="pt", truncation=True, max_length=256).to(device="cuda:0")
    tokenized_test.append(inputs)
#inputs.get_device()

In [26]:
results = list()

In [27]:
#tokenized_test

In [28]:
with torch.no_grad():
    for rev in tokenized_test:
        logits = model(**rev).logits
        predicted_class_id = logits.argmax().item()
        results.append(model.config.id2label[predicted_class_id])

In [29]:
results

['deceptivepositive',
 'deceptivenegative',
 'deceptivepositive',
 'truthfulnegative',
 'deceptivepositive',
 'truthfulnegative',
 'deceptivepositive',
 'deceptivepositive',
 'truthfulnegative',
 'deceptivenegative',
 'deceptivepositive',
 'deceptivepositive',
 'truthfulnegative',
 'truthfulnegative',
 'truthfulpositive',
 'deceptivepositive',
 'truthfulpositive',
 'truthfulpositive',
 'deceptivenegative',
 'truthfulpositive',
 'deceptivepositive',
 'deceptivepositive',
 'deceptivepositive',
 'deceptivenegative',
 'deceptivepositive',
 'deceptivenegative',
 'truthfulpositive',
 'truthfulpositive',
 'deceptivenegative',
 'deceptivenegative',
 'truthfulpositive',
 'deceptivepositive',
 'deceptivepositive',
 'deceptivenegative',
 'truthfulnegative',
 'deceptivenegative',
 'truthfulpositive',
 'deceptivepositive',
 'truthfulnegative',
 'deceptivenegative',
 'truthfulpositive',
 'truthfulnegative',
 'deceptivenegative',
 'truthfulpositive',
 'deceptivenegative',
 'truthfulnegative',
 'decep

In [30]:
with open(f"Results_005_{FOLD}.txt", "w") as txt_file:
    for line in results:
        txt_file.write("".join(line).upper() + "\n") # works with any number of elements in a line