In [4]:
import pandas as pd
import numpy as np
from joblib import dump, load
import spacy
import torch
from transformers import XLMRobertaModel, XLMRobertaTokenizer

# --- Environment setup (Google Colab) ---------------------------------------
# Mount Google Drive and make the coursework folder the working directory so
# that every relative path used afterwards resolves inside it.
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive/34812 CW NLI')

# Classifier applied to the extracted features (see `demo`).
# NOTE(review): `load` is joblib's loader, which unpickles the file and can
# therefore execute arbitrary code -- only load artefacts from a trusted source.
modela = load('./linear_trained_model.joblib')

# spaCy pipeline used by `preprocess`; the parser and NER components are
# disabled since only lemmas and punctuation flags are read from the tokens.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
# XLM-RoBERTa tokenizer/encoder pair used by `preprocess_and_encode`.
# `output_hidden_states=True` makes the model return the per-layer hidden
# states that the feature extractor averages.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model = XLMRobertaModel.from_pretrained('xlm-roberta-base', output_hidden_states=True)

def preprocess(text):
    """Normalise one string into a space-separated sequence of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Every
    token that is not flagged as punctuation contributes its lower-cased
    lemma; the lemmas are then joined with single spaces.

    Args:
        text: The raw string to normalise.

    Returns:
        str: The lower-cased lemmas of the non-punctuation tokens, in their
        original order, separated by single spaces.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_punct:
            # Punctuation is dropped entirely.
            continue
        kept_lemmas.append(tok.lemma_.lower())
    return ' '.join(kept_lemmas)

def preprocess_and_encode(df, columns, max_length, batch_size=16, target_embedding_length=768):
    """Encode text columns of ``df`` into a single dense feature matrix.

    Each column is cleaned with ``preprocess``, tokenised in batches and passed
    through the module-level ``model``. For every text, the last four hidden
    layers are averaged, then averaged again over the token axis, giving one
    vector per text. The per-column matrices are concatenated side by side in
    the order given by ``columns``.

    Missing values are encoded as empty strings instead of being dropped, so
    the output always has exactly one row per row of ``df``. Dropping them
    per column would shift rows independently and misalign the columns.

    Args:
        df: pandas DataFrame holding the text columns.
        columns: Names of the columns to encode (at least one).
        max_length: Maximum token length handed to the tokenizer; longer
            inputs are truncated.
        batch_size: Number of texts encoded per forward pass.
        target_embedding_length: Width of each column's feature block. Model
            vectors are zero-padded or truncated to this width (presumably
            the encoder's hidden size, so normally neither happens).

    Returns:
        numpy.ndarray: float64 array of shape
        ``(len(df), len(columns) * target_embedding_length)``.

    Raises:
        ValueError: If ``columns`` is empty or ``batch_size`` is below 1.
        KeyError: If a requested column is not present in ``df``.
    """
    columns = list(columns)
    if not columns:
        raise ValueError("columns must name at least one text column")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    missing = [column for column in columns if column not in df.columns]
    if missing:
        raise KeyError(f"columns not found in DataFrame: {missing}")

    n_rows = len(df)
    if n_rows == 0:
        # Nothing to encode: return a correctly shaped empty matrix instead of
        # failing while stacking an empty list of batches.
        return np.zeros((0, len(columns) * target_embedding_length))

    # Inference mode (e.g. disables dropout). Loop-invariant, so set it once.
    model.eval()

    combined_features = []
    for column in columns:
        # One cleaned text per row; NaN/None become '' so rows stay aligned
        # across columns and with the input frame.
        cleaned_texts = [preprocess(text) for text in df[column].fillna('').astype(str)]

        # Pre-allocated (rows x width) block; untouched cells stay zero, which
        # provides the zero-padding when the model vectors are narrower.
        column_embeddings = np.zeros((n_rows, target_embedding_length))

        for start in range(0, n_rows, batch_size):
            batch_texts = cleaned_texts[start:start + batch_size]
            # Tokenize the texts and encode with padding and truncation
            encoded_inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

            # Forward pass without gradient tracking
            with torch.no_grad():
                outputs = model(**encoded_inputs)

            # Average the last four layers, then average over the token axis.
            # NOTE(review): the token mean includes padded positions, so a
            # text's vector depends on the longest text in its batch. Left
            # unchanged so features stay compatible with the saved classifier,
            # which was presumably trained on the same pooling -- confirm.
            embeddings = torch.stack(outputs.hidden_states[-4:]).mean(0).mean(dim=1).numpy()

            width = min(target_embedding_length, embeddings.shape[1])
            column_embeddings[start:start + len(batch_texts), :width] = embeddings[:, :width]

        combined_features.append(column_embeddings)

    return np.hstack(combined_features)

def demo(input_data):
    """Predict a label for every premise/hypothesis pair in ``input_data``.

    The two text columns are cleaned (missing values become empty strings,
    everything is cast to ``str``) on a copy, so the caller's DataFrame is
    left unmodified. The cleaned texts are encoded with
    ``preprocess_and_encode`` and the resulting features are passed to the
    module-level classifier ``modela``.

    Args:
        input_data: pandas DataFrame with ``premise`` and ``hypothesis``
            columns. Other columns are ignored.

    Returns:
        Whatever ``modela.predict`` returns for the feature matrix
        (presumably one label per input row -- confirm against the model).
    """
    text_columns = ['premise', 'hypothesis']
    # Column selection plus fillna/astype yields a new frame, so nothing is
    # assigned back into the caller's DataFrame.
    cleaned = input_data[text_columns].fillna('').astype(str)
    features = preprocess_and_encode(cleaned, text_columns, max_length=128)
    predictions = modela.predict(features)
    return predictions


# --- Run inference on the test file and save the predictions ----------------
output_file_path = './Group_15_A.csv'

# keep_default_na=False stops pandas from converting strings such as "NA" or
# empty cells into NaN while reading.
validation_df = pd.read_csv('./test.csv', keep_default_na=False)

results = demo(validation_df)
predictions = pd.DataFrame(data=results, columns=['prediction'])
print(predictions.head())

# Single-column CSV with a header row and no index column.
predictions.to_csv(output_file_path, index=False)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   prediction
0           1
1           1
2           1
3           1
4           1


In [None]:
# Optional evaluation, currently disabled. It compares `results` against a
# gold `label` column of `validation_df`, so it can only run when the loaded
# CSV contains that column.
# NOTE(review): the report displayed under this cell appears to come from an
# earlier run with these lines enabled -- confirm before relying on it.
# from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
# y_validation = validation_df['label'].values
# print(classification_report(y_validation, results))
# print(f"Accuracy: {accuracy_score(y_validation, results)}")
# print(f"Precision, Recall, F1 Score: {precision_recall_fscore_support(y_validation, results, average='weighted')[:3]}")


              precision    recall  f1-score   support

           0       0.59      0.80      0.68      3259
           1       0.72      0.48      0.57      3478

    accuracy                           0.63      6737
   macro avg       0.65      0.64      0.62      6737
weighted avg       0.65      0.63      0.62      6737

Accuracy: 0.6320320617485528
Precision, Recall, F1 Score: (0.6537482635574783, 0.6320320617485528, 0.6230948973111805)
