# Details of solution 1 implementation

Traditional Supervised Machine Learning Method

In [49]:
!pip3 install pyspellchecker textstat

Defaulting to user installation because normal site-packages is not writeable


In [50]:
import numpy as np
import pandas as pd
import nltk
import string
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import copy
import time

import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
from torch import Tensor

import spellchecker.spellchecker as spellchecker
import textstat

nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('universal_tagset')

torch.manual_seed(42)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/benjamin/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/benjamin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/benjamin/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


<torch._C.Generator at 0x7a65bc5a2330>

# Torch Classes

## Binary Classifier

In [47]:
class BinaryClassifier(nn.Module):
    # Simple binary classifier
    def __init__(self, input_size: int):
        super(BinaryClassifier, self).__init__()
        # Input is a parameter and then just has the three layers
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Uses ReLU after each but the final layer
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

## Text Dataset

In [6]:
class FeatureDataset(Dataset):
    def __init__(self, data: pd.DataFrame, transform = None):
        # Creates the dataset by converting vectors to tensors
        self.data = self.convert_to_tensors(data)
        self.transform = transform

    def __len__(self) -> int: # Simple len functionality
        return len(self.data)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        # Returns the vector and the label separately
        return self.data[idx][0], self.data[idx][1]

    def convert_to_tensors(self, data: pd.DataFrame) -> list[list[Tensor]]:
        # Takes all but the last value as that is the label and the the label, converting them to tensors
        feature_vecs = torch.tensor(data.iloc[:, :-1].values, dtype=torch.float)
        labels = torch.tensor(data.iloc[:, -1].values, dtype=torch.float).unsqueeze(1)

        # Appends results to a list as they have to come in pairs
        ret_list = []
        for i in range(len(data)):
            ret_list.append([feature_vecs[i], labels[i]])
        return ret_list

In [46]:
class TestFeatureDataset(Dataset):
    # Same as above, just without the label columns (so simpler)
    def __init__(self, data: pd.DataFrame, transform = None):
        self.data = self.convert_to_tensors(data)
        self.transform = transform

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx: int) -> torch.Tensor:
        return self.data[idx]

    def convert_to_tensors(self, data: pd.DataFrame) -> list[Tensor]:
        return torch.tensor(data.values, dtype=torch.float)
        

# Feature Vector Code

## Components

In [44]:
def get_punct_info(words: list[str], text: str, verbose: bool):
    # Gets a list of all punctuation, and defines what we call simple punctuation
    all_puncts = string.punctuation
    simple_puncts = [",", ".", "!", "?", '"', "'"]

    # Initialise the counts
    punct_count = 0
    simple_punct_count = 0

    # Loop through each "word" (includes punctuation) to increase counts
    for word in words:
        if word in all_puncts:
            punct_count += 1
        if word in simple_puncts:
            simple_punct_count += 1

    # Gets the amount of complex punctuation
    complex_punct_count = punct_count - simple_punct_count

    # Calculates the ratio (complexity) of punctuation
    punct_complexity = complex_punct_count / punct_count if punct_count != 0 else 0

    # Creates the punctuation distribution
    punct_dist = np.zeros(len(all_puncts))
    for char in all_puncts:
        punct_dist[all_puncts.index(char)] = text.count(char)

    # Normalises the distribution
    punct_dist = np.zeros(len(all_puncts)) if sum(punct_dist) == 0 else punct_dist / sum(punct_dist)

    if verbose:
        print(f"Punctuation count: {punct_count}")
        print(f"Punctuation complexity: {punct_complexity}")
        print(f"Punctuation distribution: {punct_dist}")
    
    return punct_count, punct_complexity, punct_dist

In [43]:
def get_dt_nn(tagged, verbose: bool = False):
    # Set counts to 0
    det_count = 0
    nn_count = 0

    # Checks each word in the tagged dataset to increase counts
    for word in tagged:
        if word[1] == "DET":
            det_count += 1
        if word[1] == "NOUN":
            nn_count += 1

    # Calculates ratio with div by 0 check
    dt_nn_ratio = 0 if nn_count == 0 else det_count / nn_count

    if verbose:
        print(f"DT count: {det_count}")
        print(f"NN count: {nn_count}")
        print(f"DT:NN ratio: {dt_nn_ratio}")

    return dt_nn_ratio

In [42]:
def get_word_len_features(words: list[str], verbose: bool = False):
    # Gets the word lengths for every word
    word_lengths = [len(word) for word in words]

    # Calculates the stats
    word_len_range = max(word_lengths) - min(word_lengths)
    word_len_q3 = np.percentile(word_lengths, 75)

    if verbose:
        print(f"Word length range: {word_len_range}")
        print(f"Word length Q3: {word_len_q3}")

    return word_len_range, word_len_q3

In [41]:
def get_sent_len_features(sents: list[str], verbose: bool = False):
    # Gets the list of sentence lengths
    sent_lengths = [len(nltk.word_tokenize(sent)) for sent in sents]

    # Simply calculates the stats
    sent_len_range = max(sent_lengths) - min(sent_lengths)
    sent_len_q3 = np.percentile(sent_lengths, 75)

    if verbose:
        print(f"Sentence length range: {sent_len_range}")
        print(f"Sentence length Q3: {sent_len_q3}")

    return sent_len_range, sent_len_q3

In [40]:
def check_caps(tagged, sents: list[str], verbose: bool = False):
    # Gets a list of all proper nouns
    proper_nouns = [word for word in tagged if word[1] == "NNP"]
    correct_caps_prop_nouns = 0

    # Iterates through and checks if they're correctly capitalised
    for word in proper_nouns:
        if word[0][0].isupper():
            correct_caps_prop_nouns += 1

    # Calculate the ratio (with check for if there are no proper nounds)
    prop_noun_cap_ratio = 0 if len(proper_nouns) == 0 else correct_caps_prop_nouns / len(proper_nouns)

    # Loops through each sentence to check first letter
    correct_caps_sos = 0
    for sent in sents:
        if sent[0][0].isupper():
            correct_caps_sos = 0

    # Calculates ratio
    sos_cap_ratio = correct_caps_sos / len(sents)

    if verbose:
        print(f"Proper noun caps ratio: {prop_noun_cap_ratio}")
        print(f"SOS capitalisation ratio: {sos_cap_ratio}")

    return prop_noun_cap_ratio, sos_cap_ratio

In [39]:
def get_typo_stats(words: list[str], verbose: bool = False):
    # Calculates the typo ratio using pyspellchecker
    spell = spellchecker.SpellChecker("en")
    misspelled = spell.unknown(words)
    typo_ratio = len(misspelled) / len(words)

    if verbose:
        print(f"Typo ratio: {typo_ratio}")

    return typo_ratio

In [37]:
def get_type_token(words: list[str], verbose: bool = False):
    # Uses set to ensure unique entries to calculate type token ratio
    type_token_ratio = len(set(words)) / len(words)

    if verbose:
        print(f"Type-Token Ratio: {type_token_ratio}")

    return type_token_ratio

In [15]:
def get_readability(text: str, verbose: bool = False):
    # Simply get the Flesch-Kincaid Grade
    readability = textstat.textstat.flesch_kincaid_grade(text)

    if verbose:
        print(f"Reading grade: {reability}")

    return readability

## Main Feature Vector Code

In [36]:
def get_feature_vector(text: str, verbose: bool = False) -> np.ndarray:
    # Generates the required forms of the text for each calculation
    words = text.split()
    sents = nltk.sent_tokenize(text)
    tagged_uni = nltk.pos_tag(words, tagset="universal")
    tagged_reg = nltk.pos_tag(words)

    # Initialises the empty feature vector
    feat_vec = np.ndarray(0, dtype=np.float32)

    punc_count, punct_complexity, punct_dist = get_punct_info(words, text, verbose) # Mostly self-explanatory

    dt_nn_ratio = get_dt_nn(tagged_uni, verbose)

    word_len_range, word_len_q3 = get_word_len_features(words, verbose)

    sent_len_range, sent_len_q3 = get_sent_len_features(sents, verbose)

    prop_noun_cap_ratio, sos_cap_ratio = check_caps(tagged_reg, sents, verbose)

    typo_ratio = get_typo_stats(words, verbose)

    type_token_ratio = get_type_token(words, verbose)

    readability = get_readability(text, verbose)
    
    # Appends the results to the vector
    feat_vec = np.append(feat_vec, punct_dist)
    feat_vec = np.append(feat_vec, punct_complexity)
    feat_vec = np.append(feat_vec, dt_nn_ratio)
    feat_vec = np.append(feat_vec, word_len_range)
    feat_vec = np.append(feat_vec, word_len_q3)
    feat_vec = np.append(feat_vec, sent_len_range)
    feat_vec = np.append(feat_vec, sent_len_q3)
    feat_vec = np.append(feat_vec, prop_noun_cap_ratio)
    feat_vec = np.append(feat_vec, sos_cap_ratio)
    feat_vec = np.append(feat_vec, typo_ratio)
    feat_vec = np.append(feat_vec, type_token_ratio)
    feat_vec = np.append(feat_vec, readability)

    # Returns the vector
    return feat_vec.reshape(1, -1)

# Implementation

In [35]:
def create_feature_vecs(file_path, name):
    # Creates the feature vectors that populate the datasets and outputs them for quicker runtime
    # feat_vec_size is just the number of data points
    feat_vec_size = 43

    # Read in text data
    df = pd.read_csv(file_path)

    # Creates 3 * 43 columns, +1 for label
    col_names = [f"fv_1_{i}" for i in range(feat_vec_size)] + [f"fv_2_{i}" for i in range(feat_vec_size)] + [f"fv_diff_{i}" for i in range(feat_vec_size)] + ["label"]

    feat_vec_df = pd.DataFrame(columns=col_names)

    # Calculates the feature vector for sentence 1, then sentence 2, then calculates the absolute difference
    for i in range(len(df)):
        if i % 500 == 0:
            print(f"\rProcessing pair {i}/{len(df)}", end="")

        label = df["label"]

        s1_fv = get_feature_vector(df["text_1"][i]).flatten()
        s2_fv = get_feature_vector(df["text_2"][i]).flatten()
        fv_diff = np.abs(s1_fv - s2_fv)

        # Concatenates together to form a row
        row = np.concatenate((s1_fv, s2_fv, fv_diff, [label[i]]), axis=0)

        # Appends row to dataframe
        feat_vec_df = feat_vec_df._append(pd.DataFrame([row], columns=col_names), ignore_index=True)

    # Outputs final dataframe to CSV
    feat_vec_df.to_csv(f"{name}_feature_vectors.csv", index=False)

In [34]:
def create_feature_vecs_test(file_path, name):
    # Creates the feature vectors that populate the datasets and outputs them for quicker runtime
    # feat_vec_size is just the number of data points
    feat_vec_size = 43

    # Read in text data
    df = pd.read_csv(file_path)

    # Defines the same columns as above just without the label 
    col_names = [f"fv_1_{i}" for i in range(feat_vec_size)] + [f"fv_2_{i}" for i in range(feat_vec_size)] + [f"fv_diff_{i}" for i in range(feat_vec_size)]

    feat_vec_df = pd.DataFrame(columns=col_names)

    # Calculate each feature vector and adds it to the dataframe
    for i in range(len(df)):
        if i % 500 == 0:
            print(f"\rProcessing pair {i}/{len(df)} (as test data)", end="")

        s1_fv = get_feature_vector(df["text_1"][i]).flatten()
        s2_fv = get_feature_vector(df["text_2"][i]).flatten()
        fv_diff = np.abs(s1_fv - s2_fv)
        
        row = np.concatenate((s1_fv, s2_fv, fv_diff), axis=0)

        feat_vec_df = feat_vec_df._append(pd.DataFrame([row], columns=col_names), ignore_index=True)

    # Outputs the final csv
    feat_vec_df.to_csv(f"{name}_feature_vectors.csv", index=False)

In [33]:
def get_feat_dataset(df: pd.DataFrame, test: bool = False):
    # Returns the feature vector dataset - type depending on the input
    if not test:
        return FeatureDataset(df)
    else:
        return TestFeatureDataset(df)

In [32]:
def train_classifier():
    # Training algorithm
    # First loads training and testing datasets
    print("Beginning Training")
    feat_vec_train_df = pd.read_csv("training_feature_vectors.csv")
    feat_vec_eval_df = pd.read_csv("eval_feature_vectors.csv")
    feat_vec_train_ds = get_feat_dataset(feat_vec_train_df)
    feat_vec_eval_ds = get_feat_dataset(feat_vec_eval_df)
    feat_vec_train_dl = DataLoader(feat_vec_train_ds, batch_size=16, shuffle=True)
    feat_vec_test_dl = DataLoader(feat_vec_eval_ds, batch_size=1, shuffle=False)

    # Defines the model, loss function, and optimiser
    feat_vec_size = feat_vec_train_df.shape[1] - 1
    model = BinaryClassifier(feat_vec_size)
    criterion = nn.BCEWithLogitsLoss()
    optimiser = torch.optim.Adam(model.parameters(), lr=0.00005)

    # Defines other hyperparameters
    epochs = 200
    threshold = 0

    # Initialises the variable used for memory
    current_best = (None, 0.0, -1)

    # Main training loop
    for i in range(epochs):
        # Loop makes prediction, calculates loss, and then backpropagates
        loss_list = []
        model.train()
        for fv, label in feat_vec_train_dl:
            optimiser.zero_grad()
            output = model(fv)
            loss = criterion(output, label)
            loss_list.append(loss.detach().numpy())
            loss.backward()
            optimiser.step()

        print(f"Mean Training Loss for Epoch {i}: {np.mean(loss_list)}")

        # Collect statistics on the eval set (same logic as below)
        loss_list = []
        preds = []
        true_labels = []

        model.eval()
        with torch.no_grad():
            for fv, label in feat_vec_test_dl:
                output = model(fv)
                loss = criterion(output, label)
                loss_list.append(loss.detach().numpy())
                preds.append(1 if output[0] > threshold else 0)
                true_labels.append(label.detach().numpy())

        # Collect simple metrics
        correct = 0
        TPs, FPs, FNs, TNs = 0, 0, 0, 0
        for j in range(len(true_labels)):
            if true_labels[j] == preds[j]:
                correct += 1
                if true_labels[j] == 1:
                    TPs += 1
                else:
                    TNs += 1
            else:
                if true_labels[j] == 1:
                    FNs += 1
                else:
                    FPs += 1

        # Calculate complex metrics
        accuracy = correct / len(true_labels)
        recall = 0 if TPs + FNs == 0 else TPs / (TPs + FNs)
        precision = 0 if TPs + FPs == 0 else TPs / (TPs + FPs)
        f1 = 0 if precision + recall == 0 else 2 * (precision * recall) / (precision + recall)

        # Outputs results
        print(f"Testing Stats for Epoch {i}")
        print(f"    Mean Loss: {np.mean(loss_list)}")
        print(f"    Accuracy: {accuracy}")
        print(f"    _____________________________")
        print(f"    |True\Pred|Positive|Negative|")
        print(f"    |---------|--------|--------|")
        print(f"    |Positive |TPs: {TPs}|FNs: {FNs}|")
        print(f"    |---------|--------|--------|")
        print(f"    |Negative |FPs: {FPs}|TNs: {TNs}|")
        print(f"    |---------|--------|--------|")
        print(f"    ")
        print(f"    Precision: {precision}")
        print(f"    Recall: {recall}")
        print(f"    F1 Score: {f1}")

        # Checks to see if the current state is the best model
        if accuracy > current_best[1]:
            current_best = (copy.deepcopy(model), accuracy, i)

    # Outputs the best model's accuracy, and then returns the best model
    print(f"Got best performance at epoch {current_best[2]} with accuracy {current_best[1]}")
    
    return current_best[0]

In [31]:
def eval_classifier(model, datapath="eval_feature_vectors.csv"):
    # Function for evaluating the model on the eval dataset
    # Reads the csv and converts to data loader
    feat_vec_df = pd.read_csv(datapath)
    feat_vec_ds = get_feat_dataset(feat_vec_df)
    feat_vec_dl = DataLoader(feat_vec_ds, batch_size=1, shuffle=False)

    # Defines variables and functions for statistics (BCE for loss function)
    criterion = nn.BCEWithLogitsLoss()

    loss_list = []
    preds = []
    true_labels = []

    threshold = 0

    # Ensures eval mode then loops through the dataset to get predictions and other stats
    model.eval()

    with torch.no_grad():
        for fv, label in feat_vec_dl:
            output = model(fv)
            loss = criterion(output, label)
            loss_list.append(loss.detach().numpy())
            preds.append(1 if output[0] > threshold else 0)
            true_labels.append(label.detach().numpy())

    # Calculates simple metrics
    correct = 0
    TPs = 0
    FPs = 0
    TNs = 0
    FNs = 0

    for i in range(len(true_labels)):
        if true_labels[i] == preds[i]:
            correct += 1
            if true_labels[i] == 1:
                TPs += 1
            else:
                TNs += 1
        else:
            if true_labels[i] == 1:
                FNs += 1
            else:
                FPs += 1

    # Calculates complex metrics
    accuracy = correct / len(true_labels)
    recall = 0 if TPs + FNs == 0 else TPs / (TPs + FNs)
    precision = 0 if TPs + FPs == 0 else TPs / (TPs + FPs)
    f1 = 0 if precision + recall == 0 else 2 * (precision * recall) / (precision + recall)

    # Outputs results
    print(f"Testing Stats for Classifier")
    print(f"    Mean Loss: {np.mean(loss_list)}")
    print(f"    Accuracy: {accuracy}")
    print(f"    _____________________________")
    print(f"    |True\Pred|Positive|Negative|")
    print(f"    |---------|--------|--------|")
    print(f"    |Positive |TPs: {TPs}|FNs: {FNs}|")
    print(f"    |---------|--------|--------|")
    print(f"    |Negative |FPs: {FPs}|TNs: {TNs}|")
    print(f"    |---------|--------|--------|")
    print(f"    ")
    print(f"    Precision: {precision}")
    print(f"    Recall: {recall}")
    print(f"    F1 Score: {f1}")

    return (accuracy, recall, precision, f1)

In [30]:
def test_classifier(model, data_path):
    # Simple function to test the classifier
    # Takes the test feature vectors (based on supplied data path) and loads it into a dataset
    # Then it loops hrough each input and sets the prediction
    feat_vec_df = pd.read_csv(data_path)
    test_ds = get_feat_dataset(feat_vec_df, True)
    test_dl = DataLoader(test_ds, batch_size=1, shuffle=False)

    model.eval()
    threshold = 0

    preds = []
    
    with torch.no_grad():
        for X in test_dl:
            output = model(X)
            preds.append(1 if output[0] > threshold else 0)

    return preds

## Notebook "Engine"

In [23]:
# Inference Mode Switch
generate_new_feat_vecs = False
inference_mode = True
run_next = False

In [25]:
torch.manual_seed(42)
if run_next:
    if generate_new_feat_vecs:
        create_feature_vecs("Data/train.csv", "training")
        create_feature_vecs("Data/dev.csv", "eval")
        create_feature_vecs_test("Data/test.csv", "testing")
    if not inference_mode:
        train_time_start = time.time()
        classifier = train_classifier()
        print(f"Training time: {time.time() - train_time_start}")
        torch.save(classifier, "solution_1_classifier.pth")
        results = eval_classifier(classifier)
    else:
        classifier = torch.load("solution_1_classifier.pth", weights_only=False)
        eval_classifier(classifier)
        predictions = test_classifier(classifier, "testing_feature_vectors.csv")
        preds_df = pd.DataFrame(predictions, columns=["prediction"])
        preds_df.to_csv("Group_17_A.csv", index=False)

In [29]:
# Run for new data
new_path = ""
new_path_is_test_data = True
if new_path_is_test_data:
    create_feature_vecs_test(new_path, "new_data")
    classifier = torch.load("solution_1_classifier.pth", weights_only=False)
    predictions = test_classifier(classifier, "new_data_feature_vectors.csv")
    preds_df = pd.DataFrame(predictions, columns=["prediction"])
    print(preds_df)
else:
    create_feature_vecs(new_path, "new_data")
    classifier = torch.load("solution_1_classifier.pth", weights_only=False)
    eval_classifier(classifier, "new_data_feature_vectors.csv")

FileNotFoundError: [Errno 2] No such file or directory: ''