# 34812 CW NLI

**Natural Language Inference (NLI)**

Given a premise and a hypothesis, determine if the hypothesis is true based on the
premise. You will be given more than 26K premise-hypothesis pairs as training data, and
more than 6K pairs as validation data.

In [None]:
import pandas as pd
import string
import re
import nltk
import numpy as np

# from google.colab import drive
# drive.mount('/content/drive')

# import os
# os.chdir('/content/drive/My Drive/34812 CW NLI')

training = pd.read_csv('./train.csv')
training.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Mounted at /content/drive


Unnamed: 0,premise,hypothesis,label
0,"However, Fort Charles was rebuilt as a militar...",Fort Charles was rebuilt as an amusement park ...,0
1,Buchanan's The Democrats and Republicans have...,THe parties will never be similar.,0
2,In order to review an acquisition that is usin...,The auditor only reviews the acquisition itsel...,0
3,Three young people sit outside and engage with...,There is a tin can and string telephone.,0
4,The lucrative tin mines of Kuala Lumpur in the...,The Chinese labor was seen as less costly and ...,1


#Preprocess & Feature embedding (using roberta)

In [None]:
import spacy
import torch
from transformers import XLMRobertaModel, XLMRobertaTokenizer

# Load small spacy model for lemma and basic pre-processing
# Disable parser and named entity recognition for efficiency
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model = XLMRobertaModel.from_pretrained('xlm-roberta-base', output_hidden_states=True)

def preprocess(text):
  # Lemma, lower, punc
    doc = nlp(text)
    tokens = [
        token.lemma_.lower() for token in doc
        if not token.is_punct
    ]
    return ' '.join(tokens)

def preprocess_and_encode(df, columns, max_length):
    combined_features = []
    # Target length for padding/truncation of the token embeddings
    target_embedding_length = 768
    batch_size = 16

    for column in columns:
        column_embeddings = []
        cleaned_texts = [preprocess(text) for text in df[column].dropna()]

        # Process texts in batches to avoid memory issues
        for i in range(0, len(cleaned_texts), batch_size):
          # Select a batch of cleaned texts
            batch_texts = cleaned_texts[i:i + batch_size]
            # Tokenize the texts and encode with padding and truncation
            encoded_inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

            # Evaluate the model to get embeddings without gradient updates
            # Dropout layer is deactivated and does not randomly discard activated units
            model.eval()
            # Not compute gradients, reducing memory consumption and increasing computational speed
            with torch.no_grad():
                outputs = model(**encoded_inputs)
            embeddings = torch.stack(outputs.hidden_states[-4:]).mean(0).mean(dim=1).numpy()

            # Create and accumulate embed for this batch
            batch_embeddings = np.zeros((embeddings.shape[0], target_embedding_length))
            # Ensure correct length
            for j, emb in enumerate(embeddings):
                actual_length = min(target_embedding_length, emb.shape[0])
                batch_embeddings[j, :actual_length] = emb[:actual_length]

            column_embeddings.append(batch_embeddings)

        column_embeddings = np.vstack(column_embeddings)
        combined_features.append(column_embeddings)

    return np.hstack(combined_features)



In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from scipy.stats import expon, randint
from sklearn.metrics import classification_report

X = preprocess_and_encode(training, ['premise', 'hypothesis'], 16)

y = training['label'].values


In [None]:
print("Features shape:", X.shape)
print("Labels shape:", y.shape)
print("Shape of training:", training.shape)
df = pd.DataFrame(X)
print(df.head())

Features shape: (26944, 1536)
Labels shape: (26944,)
Shape of training: (26944, 3)
       0         1         2         3         4         5         6     \
0 -0.332238  0.130232  0.064765  0.036203  0.185241 -0.312844 -0.030147   
1 -0.087927  0.006230  0.025774  0.117888  0.249238 -0.403839 -0.094374   
2 -0.276064  0.013500  0.065748  0.069111  0.559545 -0.170639  0.016059   
3 -0.066326  0.089424  0.002439 -0.119466  0.404508  0.050144  0.038555   
4 -0.223596  0.011834  0.050085 -0.006470  0.317016 -0.196609  0.019422   

       7         8         9     ...      1526      1527      1528      1529  \
0  0.177356 -0.137581  0.073043  ... -0.075536  0.132491 -0.126624  0.109179   
1  0.053350 -0.104217 -0.017730  ...  0.048344  0.018759  0.024928 -0.020654   
2 -0.025522 -0.073162  0.020797  ...  0.004020 -0.007390  0.042931 -0.013098   
3 -0.059568 -0.067016 -0.083397  ... -0.027364  0.115653  0.026358  0.025353   
4 -0.084960 -0.154758 -0.219454  ... -0.075116  0.106408  0.032885

#Linear SVC model training

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.svm import LinearSVC

# Choose to use a different scaler to define the pipeline steps
# Scalers are used to normalise the data before feeding it into the SVM classifier
pipeline_options = [
    ('scaler_standard', StandardScaler()),
    ('scaler_minmax', MinMaxScaler()),
    ('scaler_robust', RobustScaler()),
    ('linear_svc', LinearSVC(dual=False, max_iter=20000))
]

pipeline = Pipeline(pipeline_options)

# Define the distribution of parameters to try in the GridSearch
# Best parameters：'linear_svc__C': 0.01, 'linear_svc__class_weight': None,
# 'scaler_standard__with_mean': True, 'scaler_standard__with_std': True
param_distributions = {
    'linear_svc__C': [0.01, 0.1, 1, 10, 100],  # regularization strength
    'linear_svc__class_weight': [None, 'balanced'],  # options for balancing classes or not
    'scaler_standard__with_mean': [True, False],  # whether to center the data before scaling
    'scaler_standard__with_std': [True, False],   # whether to scale data to unit variance
}

# Set GridSearch with pipeline, parameter distribution
grid_search = GridSearchCV(
    pipeline,
    param_distributions,
    cv=10,       # number of folds in cross-validation
    verbose=2,    # higher number gives more verbose output
    n_jobs=-1   # -1: using all processors
)
# Find the best parameter
grid_search.fit(X, y)

print("Best parameters found: ", grid_search.best_params_)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


  pid = os.fork()


Best parameters found:  {'linear_svc__C': 0.01, 'linear_svc__class_weight': None, 'scaler_standard__with_mean': True, 'scaler_standard__with_std': True}


In [None]:
from joblib import dump, load

dump(grid_search, 'linear_trained_model.joblib')
modela = load('./linear_trained_model.joblib')

#Evaluation

In [None]:
# Load validation data
validation_df = pd.read_csv('./dev.csv', keep_default_na=False)
validation_df['premise'] = validation_df['premise'].fillna('').astype(str)
validation_df['hypothesis'] = validation_df['hypothesis'].fillna('').astype(str)

# Preprocessing and coding of validation data
X_validation = preprocess_and_encode(validation_df, ['premise', 'hypothesis'], 128)
y_validation = validation_df['label'].values

y_pred = modela.predict(X_validation)

from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
print(classification_report(y_validation, y_pred))
print(f"Accuracy: {accuracy_score(y_validation, y_pred)}")
print(f"Precision, Recall, F1 Score: {precision_recall_fscore_support(y_validation, y_pred, average='weighted')[:3]}")
# Accuracy: 0.6320320617485528
# Precision, Recall, F1 Score: (0.6537482635574783, 0.6320320617485528, 0.6230948973111805)

              precision    recall  f1-score   support

           0       0.59      0.80      0.68      3259
           1       0.72      0.48      0.57      3478

    accuracy                           0.63      6737
   macro avg       0.65      0.64      0.62      6737
weighted avg       0.65      0.63      0.62      6737

Accuracy: 0.6320320617485528
Precision, Recall, F1 Score: (0.6537482635574783, 0.6320320617485528, 0.6230948973111805)


In [None]:
# predictions = pd.DataFrame(y_pred, columns=['prediction'])
# print(predictions.head())
# output_file_path = './Group_15_A.csv'
# predictions.to_csv(output_file_path, index=False)

   prediction
0           0
1           0
2           0
3           1
4           1
