# New Test Data

In [None]:
# Imports

import re
import torch
import contractions
import pandas as pd
import numpy as np
import gc

from symspellpy import SymSpell, Verbosity
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, AutoConfig

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

### Load final test data

In [None]:
# Read the final test set; the cells below use its 'review' and 'sentiment' columns.
final_test_data = pd.read_csv('../data/check.csv')

### Preprocessing final test data

In [None]:
# Encoding the labels as integers (negative -> 0, positive -> 1).
# This is a plain binary label mapping, not a one-hot encoding.
# Series.map() turns any value that is not a key of the dict into NaN.

final_test_data['sentiment'] = final_test_data['sentiment'].map({'negative': 0, 'positive': 1})


# Removing HTML line-breaks + links

def remove_html_links(review_text):
    """Clean one raw review string with a fixed sequence of regex substitutions.

    The substitutions run in the order written below; several of them depend
    on the output of an earlier one (e.g. whitespace is collapsed only after
    tags, links and slashes have been replaced by spaces).

    Args:
        review_text: The raw review text (a ``str``).

    Returns:
        The cleaned text. Runs of whitespace are collapsed to one space, but
        a leading or trailing space is not stripped.
    """
    # Replace HTML tags such as <br /> with a space
    text = re.sub(r'<.*?>', ' ', review_text)
    # Remove http(s) links
    text = re.sub(r'http\S+', ' ', text)
    # Remove dots between single capital letters
    text = re.sub(r'(?<=\b[A-Z])\.(?=[A-Z]\b)', '', text)
    # Remove parentheses with only numbers inside
    text = re.sub(r'\(\d+\)', '', text)
    # Remove parentheses with content where all words are capitalized
    text = re.sub(r'\(([A-Z][a-z]*(?: [A-Z][a-z]*)*)\)', '', text)
    # Remove all dots between a letter and '!' or '?'
    text = re.sub(r'(?<=[a-zA-Z])\.+(?=[!?])', '', text)
    # Replace each run of '!', '?' and '-' with one of each distinct
    # character, kept in order of first appearance (e.g. '!?!?' -> '!?')
    text = re.sub(r'[!?-]+', lambda x: ''.join(sorted(set(x.group(0)), key = x.group(0).find)), text)
    # Replace runs of three or more identical characters with exactly two.
    # Digits are excluded so that numbers are not corrupted: the previous
    # pattern '(.)' turned "1000" into "100" and "2000" into "200".
    text = re.sub(r'([^\d\n])\1{2,}', r'\1\1', text)
    # Replace '@' that follows a letter and is not followed by whitespace
    # with the letter 'a' (e.g. 'cr@p' -> 'crap')
    text = re.sub(r'(?<=[a-zA-Z])@(?!\s)', 'a', text)
    # Replace '\', '/' and '>' with a space
    text = re.sub(r'[\\/>]', ' ', text)
    # Replace multiple whitespace characters with a single space
    text = re.sub(r'\s+', ' ', text)
    # Replace ' & ' with ' and '
    cleaned_text = re.sub(r' \& ', ' and ', text)

    return cleaned_text

# Overwrite the 'review' column with the regex-cleaned text of each review.
final_test_data['review'] = final_test_data['review'].apply(remove_html_links)


# Handling contractions

def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

# Overwrite the 'review' column with the contraction-expanded text of each review.
final_test_data['review'] = final_test_data['review'].apply(expand_contractions)


# Applying spell check

# Spell checker that accepts corrections up to an edit distance of 2.
sym_spell = SymSpell(max_dictionary_edit_distance = 2, prefix_length = 7)

# Load the frequency dictionary: the term is in column 0, its count in column 1.
sym_spell.load_dictionary('../data/frequency_dictionary_en_82_765.txt', term_index = 0, count_index = 1)

# Punctuation marks that spell_check keeps in its output (every other
# non-alphanumeric token is dropped there).
preserve = {'.', '?', '!', ',', '-', ':', ';', '(', ')'}

# Tokens that spell_check never sends to the spell checker.
skip_chars = {'I'}

def spell_check(text):
    """Spell-correct the words of ``text`` and rebuild it as a single string.

    Relies on the module-level ``sym_spell`` (a loaded ``SymSpell`` instance),
    ``preserve`` (punctuation to keep) and ``skip_chars`` (tokens that must
    not be corrected).

    Only purely alphabetic tokens are sent to the spell checker. Tokens that
    contain digits (e.g. "10", "7", "mp3") are kept unchanged: they are not
    dictionary words, so a lookup with edit distance 2 could replace a short
    number with an unrelated short word.

    Args:
        text: The text to correct (a ``str``).

    Returns:
        The rebuilt text. Alphanumeric tokens are each preceded by one space,
        punctuation from ``preserve`` is appended without a space, every
        other token is dropped, and the result is stripped.
    """
    # Splitting text into runs of word characters and single non-space symbols
    tokens = re.findall(r'\w+|\S', text)
    pieces = []
    for token in tokens:
        if token.isalpha() and token not in skip_chars:
            suggestions = sym_spell.lookup(token, Verbosity.CLOSEST, max_edit_distance = 2)
            # Keep the original token when the dictionary offers nothing
            if suggestions:
                token = suggestions[0].term
        if token in preserve:
            # Punctuation attaches directly to whatever precedes it
            pieces.append(token)
        elif token.isalnum():
            pieces.append(' ' + token)
        # Anything else (other symbols, tokens containing '_') is dropped
    return ''.join(pieces).strip()

# Overwrite the 'review' column with the spell-checked text of each review
# (spell_check performs one SymSpell lookup per eligible token).
final_test_data['review'] = final_test_data['review'].apply(spell_check)


# Converting to lowercase (done after spell checking, which skips the
# capitalised token 'I' listed in skip_chars)

final_test_data['review'] = final_test_data['review'].str.lower()

### Load model, tokenizer and config

In [None]:
# Model configuration read from the local config.json.
config = AutoConfig.from_pretrained('../models/distilbert/config.json')
# NOTE(review): the tokenizer is loaded by its stock name rather than from
# '../models/distilbert' - presumably the fine-tuned model kept the original
# vocabulary; confirm.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Classification model loaded from the local model directory with the config above.
model = DistilBertForSequenceClassification.from_pretrained('../models/distilbert', config = config)

### Tokenizing final test data

In [None]:
# Plain Python lists of the preprocessed texts and their 0/1 labels.
reviews = final_test_data['review'].tolist()
true_labels = final_test_data['sentiment'].tolist()

# Tokenize all reviews in a single call and return PyTorch tensors.
# padding = True pads every review to the longest one in this call;
# truncation = True cuts reviews that exceed the maximum accepted length.
inputs = tokenizer(reviews, padding = True, truncation = True, return_tensors = 'pt')

### Making predictions

In [None]:
# Number of reviews per forward pass.
batch_size = 2
# Number of reviews wrapped into one TensorDataset/DataLoader at a time.
chunk_size = 100

# Predicted class index for every review, in input order.
predictions = []

# Walk over the tokenized reviews chunk by chunk.
for start in range(0, len(inputs['input_ids']), chunk_size):
    end = min(start + chunk_size, len(inputs['input_ids']))
    chunk_dataset = TensorDataset(inputs['input_ids'][start:end], inputs['attention_mask'][start:end])
    chunk_dataloader = DataLoader(chunk_dataset, batch_size = batch_size, num_workers = 0)

    # Gradients are not needed for inference.
    with torch.no_grad():
        for batch in chunk_dataloader:
            batch_input_ids, batch_attention_mask = batch
            # Inference is pinned to the CPU.
            batch_inputs = {
                'input_ids': batch_input_ids.to('cpu'),
                'attention_mask': batch_attention_mask.to('cpu')
            }
            outputs = model(**batch_inputs)
            logits = outputs.logits
            # The predicted class is the index of the largest logit per review.
            batch_predictions = torch.argmax(logits, dim = 1).cpu().numpy()
            predictions.extend(batch_predictions)

            # Drop the per-batch references and force a garbage collection;
            # presumably to limit peak memory - TODO confirm this is still needed.
            del batch_inputs, outputs
            gc.collect()

# Persist the predictions, one class index per line.
with open('../models/predictions.txt', 'w') as f:
    for item in predictions:
        f.write(f"{item}\n")

### Calculating metrics

In [None]:
# Reload the predictions written by the prediction cell (one integer per line).
with open('../models/predictions.txt', 'r') as file:
    predictions = [int(line) for line in file]

# Quick visual comparison of the first 25 labels and predictions.
print('Truth:', true_labels[:25])
print('Preds:', predictions[:25],'\n')

accuracy = accuracy_score(true_labels, predictions)

print(f"Accuracy on the new test set: {accuracy:.2%}\n")

# target_names follow the label encoding used above: 0 = negative, 1 = positive.
print(classification_report(true_labels, predictions, target_names = ['negative', 'positive']))