### Importing Libraries

In [36]:
import torch
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import TensorDataset, DataLoader

### Data Preprocessing

In [30]:
# Read the Jigsaw training CSV (relative path; adjust it if the data lives elsewhere)
df = pd.read_csv(filepath_or_buffer="../data/jigsaw dataset kaggle/train.csv")

# Preview the first five rows
print(df.head(n=5))

# Report the (rows, columns) shape of the loaded frame
print("\nShape:", df.shape, sep=" ")


                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  

Shape: (159571, 8)


In [31]:
# Multi-label columns
multi_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Drop rows with missing text FIRST, so the distribution reported below is
# computed on exactly the rows that make up the final dataset.
df = df.dropna(subset=['comment_text']).reset_index(drop=True)

# Binary target: 1 when the row's label sum is positive (at least one label set), else 0
df['binary_target'] = (df[multi_labels].sum(axis=1) > 0).astype(int)

print("\nBinary target distribution:")
print(df['binary_target'].value_counts(normalize=True))

print(f"\nFinal dataset size: {len(df)}")



Binary target distribution:
binary_target
0    0.898321
1    0.101679
Name: proportion, dtype: float64

Final dataset size: 159571


### TF-IDF Vectorization

In [40]:
# TF-IDF settings, gathered in one place so they are easy to tune
tfidf_params = {
    "max_features": 5000,      # cap the vocabulary at 5,000 terms
    "stop_words": 'english',   # use the built-in English stop-word list
    "lowercase": True,         # lowercase text before tokenizing
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Fit on every comment and encode it; cast the sparse result to float32
X_tfidf = vectorizer.fit_transform(df['comment_text']).astype('float32')

print("TF-IDF shape:", X_tfidf.shape)


TF-IDF shape: (159571, 5000)


### Prepare Tensors and DataLoader

In [43]:
from torch.utils.data import Dataset, DataLoader

class SparseDataset(Dataset):
    """Map-style dataset that serves rows of a sparse feature matrix as dense tensors.

    The feature matrix is kept sparse; only the requested row is densified in
    ``__getitem__``, so the full dense matrix is never materialized.

    Args:
        X_sparse: 2-D sparse matrix with one row per sample. If it exposes a
            ``tocsr`` method it is converted to CSR, because some sparse
            formats (e.g. COO) do not support row indexing.
        y_bin: Binary targets, one per sample; any object exposing ``.values``
            (e.g. a pandas Series).
        y_multi: Multi-label targets, one row per sample; any object exposing
            ``.values`` (e.g. a pandas DataFrame).

    Raises:
        ValueError: If the features and either label set disagree on the
            number of samples.
    """

    def __init__(self, X_sparse, y_bin, y_multi):
        # CSR supports row indexing; for a matrix that is already CSR this
        # conversion does not copy the data.
        self.X_sparse = X_sparse.tocsr() if hasattr(X_sparse, "tocsr") else X_sparse
        # unsqueeze(1) turns the (N,) target vector into (N, 1)
        self.y_bin = torch.tensor(y_bin.values, dtype=torch.float32).unsqueeze(1)
        self.y_multi = torch.tensor(y_multi.values, dtype=torch.float32)

        # Fail fast on misaligned inputs instead of surfacing an IndexError
        # (or silently ignoring surplus labels) in the middle of training.
        n_samples = self.X_sparse.shape[0]
        if len(self.y_bin) != n_samples or len(self.y_multi) != n_samples:
            raise ValueError(
                "Sample count mismatch: "
                f"X_sparse has {n_samples} rows, y_bin has {len(self.y_bin)}, "
                f"y_multi has {len(self.y_multi)}"
            )

    def __len__(self):
        """Return the number of samples (rows of the feature matrix)."""
        return self.X_sparse.shape[0]

    def __getitem__(self, idx):
        """Return ``(x, y_bin, y_multi)`` for the sample at integer index ``idx``.

        ``x`` is the dense float32 feature vector of that row; the labels are
        the matching rows of the target tensors built in ``__init__``.
        """
        # Indexing one row yields a (1, n_features) matrix; squeeze(0) drops
        # the leading axis to give a flat feature vector.
        x = torch.tensor(self.X_sparse[idx].toarray(), dtype=torch.float32).squeeze(0)
        y_bin = self.y_bin[idx]
        y_multi = self.y_multi[idx]
        return x, y_bin, y_multi

# Create dataset
dataset = SparseDataset(X_tfidf, df['binary_target'], df[multi_labels])

# Give the loader its own seeded generator so the shuffle order is the same
# after every kernel restart (Restart & Run All reproduces the same batches).
shuffle_generator = torch.Generator().manual_seed(42)
train_loader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
    generator=shuffle_generator,
)
