In [3]:
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
# What to evaluate
model_name = 'bert'        # key used below to pick the tokenizer / model classes
num_labels = 3             # size of the classification head
bool_rewrite = 0           # 0 -> original tweets, any other value -> rewritten tweets
test_num = 0               # numeric suffix of the weights file to load

# Data handling
pre_processing = False     # True -> texts are cleaned with preprocess_text first
pre = "noPre" if not pre_processing else "dataPre"   # sub-folder name used in both paths
rand_seed = 42             # seed for the RNGs and the train/validation split

# NOTE(review): the two settings below are not read by any cell shown in this notebook.
cuda_NO = 'cuda:0'
use_colab = 0

# String label -> integer class id
label_mapping = {'negative': 0, 'positive': 1, 'neutral': 2}

print("____________________________________________________________________________________________________________")
print(f"Pre-trained model: {model_name}")

# Pick the weights file to load and the CSV that will receive the predictions.
if bool_rewrite != 0:
    print(f"Testing on REWRITTEN data:")
    model_save_path = f'trainedModels/nlp/airline/selfRef/{pre}/{model_name}-airline-weights-rewrite-{test_num}.pth'
    pred_save_path = f'DS/Twitter-US-Airline-Sentiment/analysis_selfRef/{pre}/{model_name}-rewrite-pred.csv'
else:
    print(f"Testing on ORIGINAL data:")
    model_save_path = f'trainedModels/nlp/airline/selfRef/{pre}/{model_name}-airline-weights-{test_num}.pth'
    pred_save_path = f'DS/Twitter-US-Airline-Sentiment/analysis_selfRef/{pre}/{model_name}-pred.csv'

import os
# Set the HF_ENDPOINT environment variable to the hf-mirror.com URL.
# NOTE(review): presumably the Hugging Face libraries read this variable to decide
# where to download models from, and it has to be set before `transformers` is
# imported to take effect — confirm.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

# Set Random Seed

In [2]:
import numpy as np
import random
import torch

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    The same value is passed to `random`, `numpy.random`, `torch` and all
    CUDA devices. cuDNN is additionally put into deterministic mode with
    its benchmark mode switched off.

    Args:
        seed: Seed value forwarded unchanged to every seeding function.

    Returns:
        None.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed the RNGs handled by set_seed with the notebook-wide rand_seed from the configuration cell.
set_seed(rand_seed)

# Load Model and Tokenizer

In [3]:
from transformers import BertForSequenceClassification, RobertaForSequenceClassification, AlbertForSequenceClassification, DebertaForSequenceClassification, ElectraForSequenceClassification
from transformers import BertTokenizer, RobertaTokenizer, AlbertTokenizer, DebertaTokenizer, ElectraTokenizer

# Supported values of model_name -> (tokenizer class, model class, checkpoint id).
_model_registry = {
    "bert": (BertTokenizer, BertForSequenceClassification, "bert-base-uncased"),
    "roberta": (RobertaTokenizer, RobertaForSequenceClassification, "roberta-base"),
    "albert": (AlbertTokenizer, AlbertForSequenceClassification, "albert-base-v2"),
    "deberta": (DebertaTokenizer, DebertaForSequenceClassification, "microsoft/deberta-base"),
    "electra": (ElectraTokenizer, ElectraForSequenceClassification, "google/electra-base-discriminator"),
}

# Fail early on an unknown architecture name.
if model_name not in _model_registry:
    raise ValueError(f"Model {model_name} not supported.")

_tokenizer_cls, _model_cls, _checkpoint = _model_registry[model_name]

# Tokenizer first, then the classifier with a num_labels-way head.
tokenizer = _tokenizer_cls.from_pretrained(_checkpoint)
model = _model_cls.from_pretrained(_checkpoint, num_labels=num_labels)

# Data Processing

In [None]:
import re

def preprocess_text(text):
    """Normalise a raw tweet into lowercase, whitespace-separated words.

    The text is lowercased, then a fixed sequence of regex substitutions
    is applied in order: strip URLs, strip @mentions and '#' signs, blank
    out non-word characters and digits, drop isolated single ASCII letters
    and collapse whitespace runs. The result is stripped at both ends.

    Note: the single-letter rule only fires for a letter with whitespace
    on both sides, and each match consumes that whitespace. A single
    letter at the very start or end of the text, or one immediately
    after another removed letter, is therefore kept
    (e.g. "i a m here" -> "i m here").

    Args:
        text: Value to clean. Anything that is not a `str` is treated as
            missing.

    Returns:
        The cleaned string, or "" when `text` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement, flags), applied top to bottom.
    substitutions = (
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),  # URLs
        (r'\@\w+|\#', '', 0),                            # @mentions and '#'
        (r'\W', ' ', 0),                                 # punctuation / special characters
        (r'\d', ' ', 0),                                 # digits
        (r'\s+[a-zA-Z]\s+', ' ', 0),                     # isolated single letters
        (r'\s+', ' ', 0),                                # runs of whitespace
    )

    cleaned = text.lower()
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    return cleaned.strip()

# Read Data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the full dataset; the header row supplies the column names.
train_data = pd.read_csv('DS/Twitter-US-Airline-Sentiment/Tweets_selfRef.csv', sep=',', header=0)

# Build the 'Clean-*' columns: cleaned with preprocess_text when pre_processing
# is on, otherwise the raw text is passed through unchanged.
for source_col, clean_col in (('Text', 'Clean-Text'), ('Text-Rewrite', 'Clean-Text-Rewrite')):
    if pre_processing:
        train_data[clean_col] = train_data[source_col].apply(preprocess_text)
    else:
        train_data[clean_col] = train_data[source_col]

# 80/20 split, seeded so the held-out rows are the same on every run.
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=rand_seed)

# Drop rows whose label cell holds the literal column name 'Original_Label'
# (presumably stray header rows inside the CSV — confirm).
# NOTE(review): the same filter is not applied to train_data.
# .copy() makes the filtered frame an independent object, so adding a column to
# it later does not raise pandas' SettingWithCopyWarning.
val_data = val_data[val_data['Original_Label'] != 'Original_Label'].copy()
# The validation split doubles as the test set (same object, not a copy).
test_data = val_data

# Tokenize either the original or the rewritten text, depending on bool_rewrite.
text_column = 'Clean-Text' if bool_rewrite == 0 else 'Clean-Text-Rewrite'
# NOTE(review): train_tokenized is not used by the cells shown in this notebook.
train_tokenized = tokenizer(
    list(train_data[text_column]), return_tensors='pt', padding=True, truncation=True)
val_tokenized = tokenizer(
    list(val_data[text_column]), return_tensors='pt', padding=True, truncation=True)
test_tokenized = val_tokenized

# Prediction

In [None]:
# Restore the fine-tuned weights into the freshly built model.
# map_location="cpu" lets a checkpoint that was saved from a GPU process load on
# a machine without CUDA; the model is kept on the CPU in this notebook.
state_dict = torch.load(model_save_path, map_location="cpu", weights_only=True)
model.load_state_dict(state_dict)

# Inference mode: eval() for the layers, no_grad() so no autograd graph is built.
# The whole tokenized test set goes through the model in a single forward pass.
# To run on a GPU instead, both `model` and `test_tokenized` must be moved to the
# same device before the call.
model.eval()
with torch.no_grad():
    outputs = model(**test_tokenized)

# Save Predictions

In [None]:
import os

# Predicted class id = index of the largest logit for each test row.
predictions = torch.argmax(outputs.logits, dim=-1).cpu()
# Store as a plain NumPy array so the column dtype does not depend on how pandas
# coerces a torch tensor.
test_data['pred_label'] = predictions.numpy()

# to_csv does not create missing folders; make sure the target directory exists
# so the inference results are not lost to an OSError at the very last step.
pred_dir = os.path.dirname(pred_save_path)
if pred_dir:
    os.makedirs(pred_dir, exist_ok=True)
test_data.to_csv(pred_save_path, sep=',', header=True, index=True)

# Calculate Accuracy, Precision, Recall and F1

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Reload the predictions that were just written to disk.
df = pd.read_csv(pred_save_path, sep=',', header=0)

# Convert the string labels to the integer ids used by the model.
# Series.map yields NaN for any label missing from label_mapping; fail here with
# the offending values instead of letting the metric functions choke on NaN.
mapped_labels = df['Original_Label'].map(label_mapping)
unmapped_rows = mapped_labels.isna()
if unmapped_rows.any():
    unknown_labels = sorted(df.loc[unmapped_rows, 'Original_Label'].astype(str).unique())
    raise ValueError(
        f"{int(unmapped_rows.sum())} row(s) in {pred_save_path} have labels "
        f"missing from label_mapping: {unknown_labels}"
    )
df['Original_Label'] = mapped_labels

correct_predictions = (df['Original_Label'] == df['pred_label']).sum()
total_predictions = len(df)

# Support-weighted averages across the classes.
y_true, y_pred = df['Original_Label'], df['pred_label']
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

# Report as percentages with two decimals.
print(f'Accuracy: {accuracy * 100:.2f}')
print(f'Precision: {precision * 100:.2f}')
print(f'Recall: {recall * 100:.2f}')
print(f'F1-Score: {f1 * 100:.2f}')