In [5]:
# imports
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


In [6]:
# Read the train and test CSVs from data/kaggle/ into two DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/kaggle/train.csv', 'data/kaggle/test.csv')
)

In [7]:

# Model input is the comment text column; the target is a six-column
# frame with one column per label (multi-label setup).
y = train_df.loc[
    :,
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
]
X = train_df.loc[:, 'comment_text']

In [8]:
# Hold out 20% of the rows as a validation set; the fixed random_state
# makes the split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

In [9]:
# Baseline pipelines: each vectorizer (CountVectorizer / TfidfVectorizer)
# paired with each classifier (LogisticRegression / RandomForestClassifier),
# wrapped in OneVsRestClassifier to handle the multi-label target.
#
# The random forests are seeded with random_state=42 (the same seed used for
# the train/validation split and the tuned RF pipelines) so that the baseline
# reports are reproducible from one run to the next.
#
# NOTE(review): n_jobs on LogisticRegression is documented as parallelising
# over classes; inside OneVsRestClassifier each sub-problem is presumably
# binary, so it may have no effect here. Confirm against the installed
# scikit-learn version and consider OneVsRestClassifier(n_jobs=...) instead.
pipelines = {
    'lr_cv_baseline': Pipeline([
        ('vect', CountVectorizer()),
        ('clf', OneVsRestClassifier(LogisticRegression(max_iter=1000, n_jobs=-1)))
    ]),
    'lr_tv_baseline': Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', OneVsRestClassifier(LogisticRegression(max_iter=1000, n_jobs=-1)))
    ]),
    'rf_cv_baseline': Pipeline([
        ('vect', CountVectorizer()),
        ('clf', OneVsRestClassifier(RandomForestClassifier(random_state=42, n_jobs=-1)))
    ]),
    'rf_tv_baseline': Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', OneVsRestClassifier(RandomForestClassifier(random_state=42, n_jobs=-1)))
    ]),
}

In [10]:
# Train each baseline pipeline on the training split and report per-label
# precision/recall/F1 on the validation split.
for name, pipeline in pipelines.items():
    print(f"\n=== {name} - Training ===")
    pipeline.fit(X_train, y_train)
    print(f"=== {name} - Evaluation ===")
    y_pred = pipeline.predict(X_val)
    # zero_division=0 scores undefined precision/recall as 0, which is what
    # the default ('warn') already does, but without emitting a warning for
    # every ill-defined average. The reported numbers are unchanged.
    print(classification_report(
        y_val, y_pred, target_names=y.columns, zero_division=0
    ))


=== lr_cv_baseline - Training ===
=== lr_cv_baseline - Evaluation ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

        toxic       0.86      0.69      0.76      3056
 severe_toxic       0.51      0.24      0.33       321
      obscene       0.88      0.69      0.77      1715
       threat       0.36      0.23      0.28        74
       insult       0.76      0.50      0.60      1614
identity_hate       0.44      0.17      0.25       294

    micro avg       0.82      0.60      0.69      7074
    macro avg       0.63      0.42      0.50      7074
 weighted avg       0.80      0.60      0.68      7074
  samples avg       0.06      0.05      0.06      7074


=== lr_tv_baseline - Training ===
=== lr_tv_baseline - Evaluation ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

        toxic       0.91      0.61      0.73      3056
 severe_toxic       0.60      0.25      0.36       321
      obscene       0.92      0.62      0.74      1715
       threat       0.53      0.14      0.22        74
       insult       0.83      0.51      0.63      1614
identity_hate       0.74      0.14      0.24       294

    micro avg       0.88      0.55      0.68      7074
    macro avg       0.76      0.38      0.49      7074
 weighted avg       0.87      0.55      0.67      7074
  samples avg       0.06      0.05      0.05      7074


=== rf_cv_baseline - Training ===
=== rf_cv_baseline - Evaluation ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

        toxic       0.90      0.41      0.57      3056
 severe_toxic       0.54      0.10      0.17       321
      obscene       0.87      0.44      0.59      1715
       threat       0.67      0.08      0.14        74
       insult       0.86      0.32      0.46      1614
identity_hate       0.41      0.06      0.11       294

    micro avg       0.86      0.37      0.51      7074
    macro avg       0.71      0.24      0.34      7074
 weighted avg       0.84      0.37      0.51      7074
  samples avg       0.04      0.03      0.03      7074


=== rf_tv_baseline - Training ===
=== rf_tv_baseline - Evaluation ===
               precision    recall  f1-score   support

        toxic       0.90      0.41      0.57      3056
 severe_toxic       0.51      0.06      0.11       321
      obscene       0.87      0.43      0.57      1715
       threat       0.57      0.05      0.10        74
       insult       0.83      0.31      0.45 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Hyperparameter tuning setup for the two logistic-regression pipelines.
# The "tuned" entries point at the same Pipeline objects as the baselines.
tuned_pipelines = {
    'lr_cv_tuned': pipelines['lr_cv_baseline'],
    'lr_tv_tuned': pipelines['lr_tv_baseline'],
}

# Both pipelines are searched over the same grid: 2 n-gram ranges x 2 max_df
# x 2 min_df x 3 values of C = 24 candidates each. A separate dict is built
# per pipeline name.
param_grids = {
    grid_name: {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'vect__max_df': [0.75, 0.9],
        'vect__min_df': [1, 2],
        'clf__estimator__C': [0.1, 1, 10],
    }
    for grid_name in ('lr_cv_tuned', 'lr_tv_tuned')
}

In [None]:
# Hyperparameter tuning setup, rf_cv & rf_tv


In [None]:
# Run GridSearchCV for the two logistic-regression pipelines (lr_cv, lr_tv).
#
# Every fitted search is kept in `lr_searches`, keyed by pipeline name, so
# that best_estimator_ / cv_results_ of the first search are still available
# after the loop; `gs` alone only ever holds the most recent search.
lr_searches = {}
for name, pipeline in tuned_pipelines.items():
    print(f"\n--- Hyperparameter tuning: {name} ---")
    gs = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[name],
        scoring='f1_macro',  # unweighted mean of the per-label F1 scores
        cv=3,
        n_jobs=-1,
        verbose=2
    )
    gs.fit(X_train, y_train)
    lr_searches[name] = gs
    print(f"Best params for {name}:", gs.best_params_)
    y_pred = gs.predict(X_val)
    # zero_division=0 reports the same numbers as the default ('warn') but
    # suppresses the warning raised for each ill-defined average.
    print(classification_report(
        y_val, y_pred, target_names=y.columns, zero_division=0
    ))



--- Hyperparameter tuning: lr_cv_tuned ---
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END clf__estimator__C=0.1, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 1); total time=  49.0s
[CV] END clf__estimator__C=0.1, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 1); total time=  50.2s
[CV] END clf__estimator__C=0.1, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 1); total time= 1.2min
[CV] END clf__estimator__C=0.1, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 1); total time= 1.2min
[CV] END clf__estimator__C=0.1, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 1); total time= 1.3min
[CV] END clf__estimator__C=0.1, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 1); total time=  48.8s
[CV] END clf__estimator__C=0.1, vect__max_df=0.9, vect__min_df=1, vect__ngram_range=(1, 1); total time= 1.3min
[CV] END clf__estimator__C=0.1, vect__max_df=0.9, vect__min_df=1, vect__ngram_range=(1, 1); total time= 1.5min
[

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END clf__estimator__C=10, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 1); total time= 4.4min
[CV] END clf__estimator__C=10, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 1); total time= 4.2min
[CV] END clf__estimator__C=1, vect__max_df=0.9, vect__min_df=2, vect__ngram_range=(1, 2); total time=10.5min
[CV] END clf__estimator__C=1, vect__max_df=0.9, vect__min_df=1, vect__ngram_range=(1, 2); total time=29.9min
[CV] END clf__estimator__C=1, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 2); total time=41.0min


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[CV] END clf__estimator__C=10, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 1); total time= 2.7min
[CV] END clf__estimator__C=1, vect__max_df=0.9, vect__min_df=1, vect__ngram_range=(1, 2); total time=30.0min
[CV] END clf__estimator__C=10, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 1); total time= 2.4min
[CV] END clf__estimator__C=10, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 1); total time= 2.6min
[CV] END clf__estimator__C=10, vect__max_df=0.9, vect__min_df=1, vect__ngram_range=(1, 1); total time= 3.9min
[CV] END clf__estimator__C=1, vect__max_df=0.9, vect__min_df=1, vect__ngram_range=(1, 2); total time=37.0min


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END clf__estimator__C=10, vect__max_df=0.9, vect__min_df=1, vect__ngram_range=(1, 1); total time= 3.9min
[CV] END clf__estimator__C=10, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 2); total time= 8.6min


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END clf__estimator__C=10, vect__max_df=0.9, vect__min_df=1, vect__ngram_range=(1, 1); total time= 4.0min


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END clf__estimator__C=10, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 2); total time=10.3min
[CV] END clf__estimator__C=10, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 2); total time=12.1min


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END clf__estimator__C=10, vect__max_df=0.9, vect__min_df=2, vect__ngram_range=(1, 1); total time= 2.9min
[CV] END clf__estimator__C=10, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 2); total time=21.5min


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END clf__estimator__C=10, vect__max_df=0.9, vect__min_df=2, vect__ngram_range=(1, 1); total time= 2.8min
[CV] END clf__estimator__C=10, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 2); total time=20.8min


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END clf__estimator__C=10, vect__max_df=0.9, vect__min_df=2, vect__ngram_range=(1, 1); total time= 2.6min
[CV] END clf__estimator__C=10, vect__max_df=0.9, vect__min_df=2, vect__ngram_range=(1, 2); total time= 8.4min
[CV] END clf__estimator__C=10, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 2); total time=29.2min


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END clf__estimator__C=10, vect__max_df=0.9, vect__min_df=2, vect__ngram_range=(1, 2); total time= 8.8min
[CV] END clf__estimator__C=10, vect__max_df=0.9, vect__min_df=2, vect__ngram_range=(1, 2); total time=10.0min
[CV] END clf__estimator__C=10, vect__max_df=0.9, vect__min_df=1, vect__ngram_range=(1, 2); total time=17.5min
[CV] END clf__estimator__C=10, vect__max_df=0.9, vect__min_df=1, vect__ngram_range=(1, 2); total time=16.0min
[CV] END clf__estimator__C=10, vect__max_df=0.9, vect__min_df=1, vect__ngram_range=(1, 2); total time=19.2min
Best params for lr_cv_tuned: {'clf__estimator__C': 10, 'vect__max_df': 0.75, 'vect__min_df': 1, 'vect__ngram_range': (1, 2)}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

        toxic       0.86      0.72      0.78      3056
 severe_toxic       0.50      0.29      0.37       321
      obscene       0.88      0.70      0.78      1715
       threat       0.47      0.32      0.38        74
       insult       0.79      0.58      0.67      1614
identity_hate       0.58      0.27      0.36       294

    micro avg       0.83      0.64      0.72      7074
    macro avg       0.68      0.48      0.56      7074
 weighted avg       0.82      0.64      0.72      7074
  samples avg       0.07      0.06      0.06      7074


--- Hyperparameter tuning: lr_tv_tuned ---
Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END clf__estimator__C=0.1, vect__max_df=0.75, vect__min_df=1, vect__ngram_range=(1, 1); total time=  10.1s
[CV] END clf__estimator__C=0.1, vect__max_df=0.75, vect__min_df=2, vect__ngram_range=(1, 1); total time=   9.4s
[CV] END clf__estimator__C=0.1, vect__max_df=0.75, vect__min_df

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

        toxic       0.89      0.68      0.77      3056
 severe_toxic       0.51      0.36      0.42       321
      obscene       0.90      0.69      0.78      1715
       threat       0.56      0.24      0.34        74
       insult       0.80      0.61      0.69      1614
identity_hate       0.65      0.25      0.36       294

    micro avg       0.85      0.63      0.72      7074
    macro avg       0.72      0.47      0.56      7074
 weighted avg       0.84      0.63      0.72      7074
  samples avg       0.06      0.06      0.06      7074


--- Hyperparameter tuning: rf_cv_tuned ---
Fitting 3 folds for each of 64 candidates, totalling 192 fits


KeyboardInterrupt: 

In [None]:
# Vectorizer settings held fixed for the random-forest searches, so only the
# forest hyperparameters are varied.
# NOTE(review): the visible lr_cv search output reports max_df=0.75 as best;
# confirm that 0.9 is the intended value here.
best_vect_params = {'min_df': 1, 'max_df': 0.9, 'ngram_range': (1, 2)}

# Random-forest pipelines for tuning. The inner forest runs single-threaded
# (n_jobs=1) to reduce runtime.
rf_cv_tuned = Pipeline(steps=[
    ('vect', CountVectorizer(**best_vect_params)),
    ('clf', OneVsRestClassifier(
        estimator=RandomForestClassifier(n_jobs=1, random_state=42)
    )),
])

rf_tv_tuned = Pipeline(steps=[
    ('vect', TfidfVectorizer(**best_vect_params)),
    ('clf', OneVsRestClassifier(
        estimator=RandomForestClassifier(n_jobs=1, random_state=42)
    )),
])

# Reduced random-forest grid: 2 x 2 = 4 candidates per pipeline.
param_grid_rf = {
    'clf__estimator__min_samples_split': [2, 5],
    'clf__estimator__n_estimators': [100, 200],
}

In [None]:
# Run GridSearchCV for the two random-forest pipelines (rf_cv, rf_tv).
#
# Every fitted search is kept in `rf_searches`, keyed by pipeline name, so
# the first search is not lost when `gs` is rebound on the second iteration.
rf_searches = {}
for name, pipeline in [('rf_cv_tuned', rf_cv_tuned), ('rf_tv_tuned', rf_tv_tuned)]:
    print(f"\n=== Hyperparameter tuning: {name} ===")
    gs = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid_rf,
        scoring='f1_macro',  # unweighted mean of the per-label F1 scores
        cv=2,            # fewer folds = faster
        n_jobs=-1,
        verbose=2
    )
    gs.fit(X_train, y_train)
    rf_searches[name] = gs
    print(f"Best params for {name}:", gs.best_params_)

    # Evaluate on the validation set
    y_pred = gs.predict(X_val)
    print(f"--- Evaluation for {name} on validation set ---")
    # zero_division=0 reports the same numbers as the default ('warn') but
    # suppresses the warning raised for each ill-defined average.
    print(classification_report(
        y_val, y_pred, target_names=y.columns, zero_division=0
    ))