First, we need to preprocess and clean the data.

In [76]:
import numpy as np
import pandas as pd
import nltk
import re
from collections import Counter

# Fetch the NLTK 'punkt' tokenizer data; nltk.word_tokenize (used in
# TextPreprocessor.load_data) depends on it.
# NOTE(review): recent NLTK releases reportedly look up 'punkt_tab' instead of
# 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')

class TextPreprocessor:
    """Clean, tokenize and label sentences from a 4-lines-per-record corpus.

    One instance can be reused across several files: the label-to-ID mapping
    (``labels_dict``) and the longest token count seen so far
    (``max_sentence_length``) accumulate across ``load_data`` calls.
    """

    # Characters outside this whitelist are replaced by a space.
    # NOTE: inside the class, "+-=" is a character *range* (ASCII '+' through
    # '='), so ':' is retained as well. Kept unchanged on purpose so the
    # produced token stream does not shift.
    _DISALLOWED = re.compile(r"[^A-Za-z0-9^,!.\/'+-=<>;]")
    # Possessive / contracted "'s" at the end of a word.
    _POSSESSIVE = re.compile(r"'s\b")
    # Punctuation that is simply dropped (replaced by a space).
    _DROPPED = re.compile(r"[;,./']")
    # Symbols kept as stand-alone tokens (padded with spaces).
    _SPACED = re.compile(r"([!^+\-=])")
    _MULTI_SPACE = re.compile(r"\s{2,}")

    # Entity tags are rewritten to word-like markers before cleaning.
    _ENTITY_MARKERS = (
        ('<e1>', ' _e11_ '),
        ('</e1>', ' _e12_ '),
        ('<e2>', ' _e21_ '),
        ('</e2>', ' _e22_ '),
    )

    def __init__(self):
        # Largest number of tokens in any sentence loaded so far.
        self.max_sentence_length = 0
        # Maps relation label -> integer ID, in order of first appearance.
        self.labels_dict = {}

    def clean_text(self, text: str) -> str:
        """Lower-case *text* and normalize its punctuation.

        Steps, in order: replace non-whitelisted characters with spaces,
        remove a word-final ``'s``, drop ``; , . / '``, pad ``! ^ + - =`` with
        spaces, and collapse runs of whitespace into one space. Leading or
        trailing single spaces are not stripped.

        Args:
            text: Raw sentence text.

        Returns:
            The cleaned string.
        """
        text = text.lower()
        text = self._DISALLOWED.sub(" ", text)
        # Must run before apostrophes are dropped; otherwise "'s" can never
        # match and "dog's" would leave a dangling "s" token.
        text = self._POSSESSIVE.sub(" ", text)
        text = self._DROPPED.sub(" ", text)
        text = self._SPACED.sub(r" \1 ", text)
        return self._MULTI_SPACE.sub(" ", text)

    def load_data(self, path):
        """Read a corpus file and return cleaned sentences with numeric labels.

        The file is consumed in groups of four lines. The first line of a
        group is ``<id>\\t<sentence>``, where the sentence's first and last
        characters are discarded (presumably surrounding quotes — confirm
        against the data). The second line is the relation label. The
        remaining two lines of each group are ignored.

        Side effects: updates ``self.max_sentence_length`` and extends
        ``self.labels_dict`` with any relation not seen before.

        Args:
            path: Path of the text file to read.

        Returns:
            A tuple ``(x_text, y)``: ``x_text`` is a list of space-joined
            token strings, ``y`` is the array of label IDs taken from the
            DataFrame's ``label`` column.

        Raises:
            IndexError: If a record's first line has no tab-separated
                sentence field or its relation line is missing.
        """
        # Context manager so the file handle is closed deterministically.
        with open(path) as handle:
            lines = [line.strip() for line in handle]

        # Extra blank lines at the end of the file must not be mistaken for
        # the start of another record.
        while lines and not lines[-1]:
            lines.pop()

        data = []
        for idx in range(0, len(lines), 4):
            fields = lines[idx].split("\t")
            sent_id = fields[0]
            relation = lines[idx + 1]

            sentence = fields[1][1:-1]
            for tag, marker in self._ENTITY_MARKERS:
                sentence = sentence.replace(tag, marker)

            tokens = nltk.word_tokenize(self.clean_text(sentence))
            self.max_sentence_length = max(self.max_sentence_length,
                                           len(tokens))

            data.append([sent_id, " ".join(tokens), relation])

        df = pd.DataFrame(data=data, columns=["id", "sentence", "relation"])

        # Convert relation labels to numeric values
        df['label'] = df['relation'].map(self.get_label_id)

        x_text = df['sentence'].tolist()
        y = df['label'].values

        return x_text, y

    def get_label_id(self, relation: str) -> int:
        """Return the integer ID for *relation*, assigning the next free one
        (the current size of ``labels_dict``) if it has not been seen yet."""
        return self.labels_dict.setdefault(relation, len(self.labels_dict))

# Script entry point: preprocess the train and test splits with a single
# shared TextPreprocessor instance.
if __name__ == "__main__":
    preprocessor = TextPreprocessor()
    # NOTE(review): hard-coded absolute paths (presumably a Colab workspace) —
    # adjust for the local environment.
    trainFile = '/content/TRAIN_FILE.TXT'
    testFile = '/content/TEST_FILE_FULL.TXT'

    # The training file is loaded first, so label IDs are assigned in order of
    # first appearance there. Reusing the same instance for the test file keeps
    # that mapping (relations unseen in training get new IDs) and lets
    # max_sentence_length cover both splits.
    x_train, y_train = preprocessor.load_data(trainFile)
    x_test, y_test = preprocessor.load_data(testFile)
    # Now we have x_train, y_train, x_test, and y_test for further processing.
    # Optional debugging aid: uncomment to print each label with its numeric ID.
    #labels_dict = preprocessor.labels_dict
    #for label, label_id in labels_dict.items():
        #print(f"{label},{label_id}")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
