In [13]:
# imports
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


In [14]:
# load the data
train_df = pd.read_csv('data/kaggle/train.csv')
test_df = pd.read_csv('data/kaggle/test.csv')

In [15]:

# Prepare features and multi-label targets
X = train_df['comment_text']
y = train_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

In [16]:
# 3. Split into train/validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [17]:
# 4. Define baseline pipelines for each model
pipelines = {
    'lr_cv_baseline': Pipeline([
        ('vect', CountVectorizer()),
        ('clf', OneVsRestClassifier(LogisticRegression(max_iter=1000, n_jobs=-1)))
    ]),
    'lr_tv_baseline': Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', OneVsRestClassifier(LogisticRegression(max_iter=1000, n_jobs=-1)))
    ]),
    'rf_cv_baseline': Pipeline([
        ('vect', CountVectorizer()),
        ('clf', OneVsRestClassifier(RandomForestClassifier(n_jobs=-1)))
    ]),
    'rf_tv_baseline': Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', OneVsRestClassifier(RandomForestClassifier(n_jobs=-1)))
    ]),
}

In [None]:
# Train and evaluate each baseline model
for name, pipeline in pipelines.items():
    print(f"\n=== {name} - Training ===")
    pipeline.fit(X_train, y_train)
    print(f"=== {name} - Evaluation ===")
    y_pred = pipeline.predict(X_val)
    print(classification_report(y_val, y_pred, target_names=y.columns))

In [None]:
# Hyperparameter tuning setup, lr_cv & lr_tv
tuned_pipelines = {
    'lr_cv_tuned': pipelines['lr_cv_baseline'],
    'lr_tv_tuned': pipelines['lr_tv_baseline'],
}

param_grids = {
    'lr_cv_tuned': {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'vect__max_df': [0.75, 0.9],
        'vect__min_df': [1, 2],
        'clf__estimator__C': [0.1, 1, 10]
    },
    'lr_tv_tuned': {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'vect__max_df': [0.75, 0.9],
        'vect__min_df': [1, 2],
        'clf__estimator__C': [0.1, 1, 10]
    },
}

In [None]:
# Hyperparameter tuning setup, rf_cv & rf_tv


In [None]:
#Run GridSearchCV for each lr_tv and lr_cv
for name, pipeline in tuned_pipelines.items():
    print(f"\n--- Hyperparameter tuning: {name} ---")
    gs = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[name],
        scoring='f1_macro',
        cv=3,
        n_jobs=-1,
        verbose=2
    )
    gs.fit(X_train, y_train)
    print(f"Best params for {name}:", gs.best_params_)
    y_pred = gs.predict(X_val)
    print(classification_report(y_val, y_pred, target_names=y.columns))


In [18]:
# Freeze best vectorizer settings
best_vect_params = {
    'ngram_range': (1, 2),
    'max_df': 0.9,
    'min_df': 1
}

# Define simplified RF pipelines (single-threaded inner estimator) to reduce runtime
rf_cv_tuned = Pipeline([
    ('vect', CountVectorizer(**best_vect_params)),
    ('clf', OneVsRestClassifier(
        RandomForestClassifier(random_state=42, n_jobs=1)
    ))
])

rf_tv_tuned = Pipeline([
    ('vect', TfidfVectorizer(**best_vect_params)),
    ('clf', OneVsRestClassifier(
        RandomForestClassifier(random_state=42, n_jobs=1)
    ))
])

# Slimmed-down RF hyperparameter grid
param_grid_rf = {
    'clf__estimator__n_estimators':      [100, 200],
    'clf__estimator__min_samples_split': [2,   5],
}

In [25]:
# %%
#  Run GridSearchCV for rf_cv and rf_tv
# reduce runtime by using smaller amount of training data
X_train_sub, _, y_train_sub, _ = train_test_split(
    X_train,
    y_train,
    train_size=0.3,
    random_state=42,
    shuffle=True
)

#  run  RF GridSearchCV on X_train_sub / y_train_sub
for name, pipeline in [('rf_cv_tuned', rf_cv_tuned), ('rf_tv_tuned', rf_tv_tuned)]:
    print(f"\n=== Hyperparameter tuning: {name} (30% subsample) ===")
    gs = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid_rf,
        scoring='f1_macro',
        cv=2,
        n_jobs=-1,
        verbose=2
    )
    gs.fit(X_train_sub, y_train_sub)
    print(f"Best params for {name}:", gs.best_params_)
    y_pred = gs.predict(X_val)
    print(classification_report(y_val, y_pred, target_names=y.columns))
    
    # Evaluate on the **full** validation set
    y_pred = gs.predict(X_val)
    print(f"--- Evaluation for {name} on validation set ---")
    print(classification_report(y_val, y_pred, target_names=y.columns))


=== Hyperparameter tuning: rf_cv_tuned (30% subsample) ===
Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV] END clf__estimator__min_samples_split=5, clf__estimator__n_estimators=100; total time= 5.3min
[CV] END clf__estimator__min_samples_split=5, clf__estimator__n_estimators=100; total time= 5.6min
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100; total time= 7.5min
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100; total time= 7.8min
[CV] END clf__estimator__min_samples_split=5, clf__estimator__n_estimators=200; total time= 9.1min
[CV] END clf__estimator__min_samples_split=5, clf__estimator__n_estimators=200; total time= 9.4min
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=200; total time=11.1min
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=200; total time=11.3min
Best params for rf_cv_tuned: {'clf__estimator__min_samples_split': 2, 'clf__estimator__n_

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

        toxic       0.98      0.30      0.46      3056
 severe_toxic       0.57      0.02      0.05       321
      obscene       0.96      0.37      0.54      1715
       threat       0.00      0.00      0.00        74
       insult       0.90      0.22      0.36      1614
identity_hate       0.60      0.01      0.02       294

    micro avg       0.95      0.27      0.42      7074
    macro avg       0.67      0.16      0.24      7074
 weighted avg       0.91      0.27      0.41      7074
  samples avg       0.03      0.02      0.02      7074

--- Evaluation for rf_cv_tuned on validation set ---
               precision    recall  f1-score   support

        toxic       0.98      0.30      0.46      3056
 severe_toxic       0.57      0.02      0.05       321
      obscene       0.96      0.37      0.54      1715
       threat       0.00      0.00      0.00        74
       insult       0.90      0.22      0.36      1614
identity

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[CV] END clf__estimator__min_samples_split=5, clf__estimator__n_estimators=100; total time= 4.5min
[CV] END clf__estimator__min_samples_split=5, clf__estimator__n_estimators=100; total time= 4.7min
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100; total time= 6.1min
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=100; total time= 6.3min
[CV] END clf__estimator__min_samples_split=5, clf__estimator__n_estimators=200; total time= 7.6min
[CV] END clf__estimator__min_samples_split=5, clf__estimator__n_estimators=200; total time= 7.9min
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=200; total time= 9.3min
[CV] END clf__estimator__min_samples_split=2, clf__estimator__n_estimators=200; total time= 9.5min
Best params for rf_tv_tuned: {'clf__estimator__min_samples_split': 2, 'clf__estimator__n_estimators': 200}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

        toxic       0.97      0.31      0.47      3056
 severe_toxic       0.50      0.02      0.04       321
      obscene       0.96      0.38      0.54      1715
       threat       0.00      0.00      0.00        74
       insult       0.90      0.24      0.37      1614
identity_hate       0.67      0.01      0.01       294

    micro avg       0.95      0.28      0.43      7074
    macro avg       0.67      0.16      0.24      7074
 weighted avg       0.91      0.28      0.42      7074
  samples avg       0.03      0.02      0.02      7074

--- Evaluation for rf_tv_tuned on validation set ---
               precision    recall  f1-score   support

        toxic       0.97      0.31      0.47      3056
 severe_toxic       0.50      0.02      0.04       321
      obscene       0.96      0.38      0.54      1715
       threat       0.00      0.00      0.00        74
       insult       0.90      0.24      0.37      1614
identity

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
