In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report

# Load the SBERT encoder used by the rest of the notebook.
# (SentenceTransformer is imported with the other dependencies at the top of
# this cell.) No device is forced here: with `device` left unset the library
# picks a GPU when one is available and otherwise falls back to CPU, so the
# notebook also runs on machines without CUDA.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Load the raw training CSV into a DataFrame.
# Parser options are collected in one place so they are easy to review:
csv_options = {
    "engine": "python",        # use the pure-Python parser
    "on_bad_lines": "skip",    # drop malformed rows instead of raising
    "sep": ",",                # explicit field separator
    "quotechar": '"',          # explicit quote character
}
train_df = pd.read_csv("sample_data/train.csv", **csv_options)

# Report how many rows were loaded.
print("Loaded rows:", len(train_df))




# Columns used as the multi-label targets further down the notebook.
label_cols = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

# Normalise the comment text in place.
# Missing values are replaced with "" and everything is coerced to str first:
# otherwise NaN / non-string cells pass through the `.str` methods as NaN and
# end up as floats in the array that is later handed to the sentence encoder.
train_df['comment_text'] = (
    train_df['comment_text']
      .fillna("")
      .astype(str)
      .str.lower()                                     # case-fold
      .str.replace(r"<.*?>", " ", regex=True)          # strip <...> tag-like spans
      .str.replace(r"https?://\S+", " ", regex=True)   # strip http(s) URLs
      .str.replace(r"[^a-z\s]", " ", regex=True)       # keep only a-z and whitespace
      .str.replace(r"\s+", " ", regex=True)            # collapse whitespace runs
      .str.strip()
)

In [None]:
# Fraction of the cleaned data to keep. Lower it (e.g. 0.1) to test
# hyperparameters at shorter runtime; 1.0 keeps every row.
SAMPLE_FRAC = 1.0

# Shuffle (and optionally subsample) with a fixed seed so the slice is
# reproducible across runs.
train_small = train_df.sample(frac=SAMPLE_FRAC, random_state=42)

In [None]:
# Hold out 20% of the sampled rows for validation (fixed seed).
# Features are the cleaned comment strings; targets are the label columns.
texts = train_small['comment_text']
labels = train_small[label_cols]
X, y = texts.to_numpy(), labels.to_numpy()

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
# Embed with a compact SBERT model.
# The `model` loaded in the first cell is reused here rather than loading a
# second copy of the same weights.
def _embed(texts):
    """Encode `texts` with the notebook's SentenceTransformer `model`.

    Texts are encoded in batches of 64 with a progress bar, and the
    embeddings are returned as a NumPy array (`convert_to_numpy=True`).
    """
    return model.encode(
        texts, batch_size=64, show_progress_bar=True, convert_to_numpy=True
    )


X_train_emb = _embed(X_train)
X_val_emb = _embed(X_val)

# shape check
print("Train emb:", X_train_emb.shape, "Val emb:", X_val_emb.shape)

In [None]:
# Train & evaluate: one logistic-regression classifier per label column,
# fitted on the training embeddings.
clf = OneVsRestClassifier(LogisticRegression(max_iter=500), n_jobs=-1)
clf.fit(X_train_emb, y_train)

# Per-label probabilities on the validation set, thresholded at 0.5 to get
# hard 0/1 predictions.
y_val_prob = clf.predict_proba(X_val_emb)
y_val_pred = (y_val_prob >= 0.5).astype(int)

print("ROC-AUC per label:")
for i, label in enumerate(label_cols):
    y_true_col = y_val[:, i]
    if len(set(y_true_col)) < 2:
        # ROC-AUC is undefined when the validation labels hold a single class
        # (easy to hit for rare labels on a small sample); report that
        # instead of letting the metric fail or return NaN.
        print(f"  {label:15s}: n/a (only one class in validation labels)")
        continue
    print(f"  {label:15s}: {roc_auc_score(y_true_col, y_val_prob[:,i]):.4f}")

print("\nClassification Report @0.5:")
# zero_division=0 reports 0 for labels with no predicted/true positives (the
# same value as the default) without emitting a warning for each of them.
print(classification_report(y_val, y_val_pred, target_names=label_cols, zero_division=0))