# DataPreprocessor class


*   implementation of PyTorch's abstract Dataset class

*   preprocessing the dataframe

In [1]:
import torch
import pandas as pd
from numpy import dtype
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
# Make sure the Punkt tokenizer data used by word_tokenize is available locally
# (the call reports "already up-to-date" when the package is cached).
nltk.download('punkt')


class DataPreprocessor(Dataset):
    """Load a CSV file into a DataFrame and prepare it for PyTorch training.

    The class wraps a pandas DataFrame (``self.data``) and offers helpers to
    clean column names, fill missing values, split multi-valued cells,
    separate the label column from the features, split the rows into
    train/validation/test subsets, and tokenize/pad text columns.

    It also implements the ``Dataset`` protocol (``__len__`` and
    ``__getitem__``) on top of ``self.X`` / ``self.y``, which are populated
    by :meth:`define_label`.
    """

    def __init__(self, csv_path):
        """Read ``csv_path`` with ``pd.read_csv`` and reset all derived state.

        Args:
            csv_path: Path (or any object accepted by ``pd.read_csv``) of the
                CSV file to load.
        """
        super().__init__()
        self.csv_path = csv_path
        self.data = pd.read_csv(csv_path)
        # token -> integer id; rebuilt on every call to tokenize_texts().
        self.vocabulary = {}
        # integer label code -> original tag value; filled by define_label().
        self.Tags_mapping = {}
        # Row subsets of self.data; filled by split_data().
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None
        # Name of the label column, features and encoded labels;
        # filled by define_label().
        self.Tag_column = None
        self.X = None
        self.y = None
        self.batch_size = None
        self.NumOfFeatures = None
        self.NumOfTags = None

    def __len__(self):
        """Return the number of rows currently held in ``self.data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at positional index ``idx``.

        Returns:
            ``(features, label)`` when both ``X`` and ``y`` are set, or only
            the feature values when just ``X`` is set.

        Raises:
            ValueError: If ``X`` has not been assigned yet.
        """
        if self.X is None:
            raise ValueError("Dataset is not properly initialized. Ensure X and y are assigned.")
        features = self.X.iloc[idx].values
        if self.y is None:
            return features
        return features, self.y.iloc[idx]

    def clean_column_names(self):
        """Normalise the column names of ``self.data`` in place.

        Each name is stripped, has any parenthesised part removed, gets
        punctuation and whitespace runs replaced by ``_`` and is lower-cased.
        """
        self.data.columns = (self.data.columns
                             .str.strip()
                             .str.replace(r"\s*\(.*?\)", "", regex=True)
                             .str.replace(r"[^\w\s]", "_", regex=True)
                             .str.replace(r"\s+", "_", regex=True)
                             .str.lower())

    def handle_missing_values(self, column_name, fill_value):
        """Replace missing values of one column with ``fill_value``.

        Raises:
            KeyError: If ``column_name`` is not a column of ``self.data``.
        """
        if column_name not in self.data.columns:
            raise KeyError(f"Column '{column_name}' not found in dataset.")
        self.data[column_name] = self.data[column_name].fillna(fill_value)

    @staticmethod
    def _split_cell(value):
        """Convert one cell: list of parts if it contains ';', else its string form."""
        # Cells that are already lists (e.g. from an earlier call) are kept
        # untouched so the conversion can safely be applied more than once.
        if isinstance(value, list):
            return value
        if pd.isnull(value):
            return []
        text = str(value)
        # Split the string form: the ';' test is done on str(value), so the
        # split must be too, otherwise non-string objects would fail here.
        return text.split(";") if ";" in text else text

    def handle_multivalued_cells(self):
        """Split ';'-separated cells of every object-dtype column into lists.

        Within those columns a cell containing ``;`` becomes a list of its
        parts, any other non-null cell becomes its string form, and a null
        cell becomes an empty list.
        """
        for column in self.data.select_dtypes(include=['object']).columns:
            self.data[column] = [self._split_cell(value) for value in self.data[column]]

    def define_label(self, Tag_column):
        """Select the label column and derive ``X``, ``y`` and the tag mapping.

        Rows whose label is missing are dropped from ``self.data``. ``y``
        holds the integer category code of each label and ``X`` holds all
        remaining columns except ``match_percentage`` (ignored if absent).

        Args:
            Tag_column: Name of the column that contains the labels.
        """
        self.Tag_column = Tag_column
        self.data = self.data.dropna(subset=[Tag_column])
        labels = self.data[Tag_column].astype("category")
        self.X = self.data.drop(columns=[Tag_column, "match_percentage"], errors='ignore')
        self.y = labels.cat.codes
        # Category codes start at 0, so the mapping is enumerated from 0 to
        # make Tags_mapping[code] return the tag that the code stands for.
        self.Tags_mapping = dict(enumerate(labels.cat.categories))
        self.NumOfTags = len(self.Tags_mapping)
        self.NumOfFeatures = len(self.X.columns)

    def split_data(self, train_frac=0.7, val_frac=0.15, test_frac=0.15, seed=42):
        """Shuffle the rows of ``self.data`` and split them into three subsets.

        The first ``int(train_frac * n)`` shuffled rows form the training
        set, the next ``int(val_frac * n)`` the validation set, and all
        remaining rows the test set.

        Args:
            train_frac: Fraction of rows used for training.
            val_frac: Fraction of rows used for validation.
            test_frac: Accepted for API compatibility only; the test set
                always receives whatever rows are left over.
            seed: Seed that makes the shuffle reproducible.
        """
        total = len(self.data)
        # A private RandomState yields the same permutation as seeding the
        # global NumPy RNG with `seed`, without reseeding global state.
        order = np.random.RandomState(seed).permutation(total)
        train_end = int(train_frac * total)
        val_end = train_end + int(val_frac * total)
        self.train_dataset = self.data.iloc[order[:train_end]]
        self.val_dataset = self.data.iloc[order[train_end:val_end]]
        self.test_dataset = self.data.iloc[order[val_end:]]

    def get_datasets(self, name):
        """Return the subset called ``"train"``, ``"val"`` or ``"test"``.

        The value is ``None`` until :meth:`split_data` has been called.

        Raises:
            ValueError: If ``name`` is not one of the three known names.
        """
        if name == "train":
            return self.train_dataset
        if name == "val":
            return self.val_dataset
        if name == "test":
            return self.test_dataset
        raise ValueError("Dataset name is not valid")

    def get_test_Label(self):
        """Return the label column of the test subset.

        Requires :meth:`define_label` and :meth:`split_data` to have run,
        because it reads ``self.Tag_column`` and ``self.test_dataset``.
        """
        return self.test_dataset[self.Tag_column]

    def tokenize_texts(self, texts, max_vocab_size=10000):
        """Tokenize ``texts`` and encode every token as an integer id.

        The vocabulary is rebuilt from ``texts`` on every call (replacing
        ``self.vocabulary``): the ``max_vocab_size`` most frequent lower-cased
        tokens get ids starting at 1, and any other token is encoded as 0.

        Args:
            texts: Iterable of strings.
            max_vocab_size: Maximum number of distinct tokens kept.

        Returns:
            One list of integer ids per input text.
        """
        tokenized_texts = [word_tokenize(text.lower()) for text in texts]
        token_counts = Counter(token for tokens in tokenized_texts for token in tokens)
        self.vocabulary = {
            word: idx + 1
            for idx, (word, _) in enumerate(token_counts.most_common(max_vocab_size))
        }
        # NOTE: id 0 is shared by out-of-vocabulary tokens and by the
        # padding value written in pad_sequences().
        return [[self.vocabulary.get(word, 0) for word in tokens] for tokens in tokenized_texts]

    def pad_sequences(self, sequences):
        """Right-pad every sequence with zeros to the longest length.

        The list ``sequences`` is updated in place and also returned. An
        empty list is returned unchanged.
        """
        maxlen = max((len(seq) for seq in sequences), default=0)
        for i, seq in enumerate(sequences):
            sequences[i] = seq + [0] * (maxlen - len(seq))
        return sequences

    def preprocess_dataset(self):
        """Convert every column of ``self.data`` to padded token-id lists.

        All columns are first cast to ``str``; each text column is then
        tokenized with :meth:`tokenize_texts` and padded with
        :meth:`pad_sequences`.

        NOTE(review): because of the cast this also covers numeric columns
        and the label column, ``self.vocabulary`` ends up holding only the
        vocabulary of the last processed column, and ``self.X`` / ``self.y``
        are not refreshed -- confirm this is intended.
        """
        self.data = self.data.astype(str)
        for column in self.data.columns:
            if self.data[column].dtype == 'object' or self.data[column].apply(lambda x: isinstance(x, str)).any():
                texts = self.data[column].tolist()
                tokenized_texts = self.tokenize_texts(texts)
                padded_texts = self.pad_sequences(tokenized_texts)
                self.data[column] = padded_texts


    def get_dataloader(dataset, batch_size, shuffle=True):
        """Build a ``DataLoader`` over the ``X`` / ``y`` of ``dataset``.

        NOTE: this function has no ``self`` parameter. When it is called on
        an instance (``obj.get_dataloader(batch_size)``) the instance itself
        is bound to ``dataset``; it can also be called through the class with
        any object exposing ``X`` and ``y``. The signature is kept as is so
        that both existing call styles keep working.

        Args:
            dataset: Object whose ``X.values`` can be converted to a float32
                tensor and whose ``y.values`` to a long tensor.
            batch_size: Number of samples per batch.
            shuffle: Whether the loader reshuffles the data.

        Returns:
            A ``DataLoader`` yielding ``(features, labels)`` tensor batches.
        """
        tensor_dataset = TensorDataset(
            torch.tensor(dataset.X.values, dtype=torch.float32),
            torch.tensor(dataset.y.values, dtype=torch.long)
        )
        return DataLoader(tensor_dataset, batch_size=batch_size, shuffle=shuffle)

    # def get_dataloader(self, dataset, batch_size, shuffle=True):
    #     data = dataset.values
    #     if shuffle:
    #         np.random.shuffle(data)
    #     for i in range(0, len(data), batch_size):
    #         batch = data[i:i + batch_size]
    #         X_batch = [item[:-1] for item in batch]
    #         y_batch = [item[-1] for item in batch]
    #         yield np.array(X_batch), np.array(y_batch)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tzoharlary/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
