In [None]:
import pandas as pd

# adjust path as needed
train_df = pd.read_csv("data/kaggle/train.csv")
test_df  = pd.read_csv("data/kaggle/test.csv")

print("Train shape:", train_df.shape)
print(train_df.head())
print(train_df.isnull().sum())


In [None]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
print(train_df[label_cols].mean())


In [None]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)           # strip HTML tags
    text = re.sub(r"https?://\S+", " ", text)    # strip URLs
    text = re.sub(r"[^a-z\s]", " ", text)        # keep only letters & spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text

train_df['comment_text'] = train_df['comment_text'].apply(clean_text)
test_df['comment_text']  = test_df['comment_text'].apply(clean_text)


In [None]:

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    train_df['comment_text'],
    train_df[label_cols],
    test_size=0.1,
    random_state=42
)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    max_features=10_000, 
    stop_words='english', 
    ngram_range=(1,2)
)

X_train_bow = vectorizer.fit_transform(X_train)
X_val_bow   = vectorizer.transform(X_val)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score

# initialize
base_clf = LogisticRegression(C=1.0, max_iter=200)
model = OneVsRestClassifier(base_clf, n_jobs=-1)

# fit
model.fit(X_train_bow, y_train)

# predict probabilities on validation
y_val_pred = model.predict_proba(X_val_bow)

# compute AUC for each label
for i, label in enumerate(label_cols):
    auc = roc_auc_score(y_val[label], y_val_pred[:, i])
    print(f"{label} AUC: {auc:.4f}")


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score

# initialize
base_clf = LogisticRegression(C=1.0, max_iter=200)
model = OneVsRestClassifier(base_clf, n_jobs=-1)

# fit
model.fit(X_train_bow, y_train)

# predict probabilities on validation
y_val_pred = model.predict_proba(X_val_bow)

# compute AUC for each label
for i, label in enumerate(label_cols):
    auc = roc_auc_score(y_val[label], y_val_pred[:, i])
    print(f"{label} AUC: {auc:.4f}")


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('clf', OneVsRestClassifier(LogisticRegression(), n_jobs=-1))
])

param_grid = {
    'vect__max_features': [5_000, 10_000, 20_000],
    'vect__ngram_range': [(1,1), (1,2)],
    'clf__estimator__C': [0.1, 1.0, 10.0]
}

grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid.fit(X_train, y_train)
print("Best params:", grid.best_params_)


In [30]:
# retrain on full data
full_vect = CountVectorizer(max_features=10_000, stop_words='english', ngram_range=(1,2))
X_full = full_vect.fit_transform(train_df['comment_text'])

final_clf = OneVsRestClassifier(LogisticRegression(C=1.0, max_iter=200), n_jobs=-1)
final_clf.fit(X_full, train_df[label_cols])

# predict on test
X_test = full_vect.transform(test_df['comment_text'])
probs = final_clf.predict_proba(X_test)

sub = pd.DataFrame(probs, columns=label_cols)
sub.insert(0, 'id', test_df['id'])


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [31]:
# --- 1) Quick look at your “submission” table ---
print("First 5 predictions:")
print(sub.head(), "\n")

print("Summary statistics of predicted probabilities:")
print(sub.describe().T, "\n")

# --- 2) If you want to evaluate on the scored subset: ---
# (make sure you have loaded test_labels.csv already)
import pandas as pd
from sklearn.metrics import roc_auc_score, classification_report

test_labels = pd.read_csv("data/kaggle/test_labels.csv")      # adjust path
label_cols = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

# pick only those rows where test_labels != -1 for all labels
mask = (test_labels[label_cols] != -1).all(axis=1)
scored = test_labels[mask].set_index('id')

# align predictions to the scored IDs
preds = sub.set_index('id').loc[scored.index, label_cols]

# compute per-label ROC AUC
print("ROC AUC per label:")
for col in label_cols:
    auc = roc_auc_score(scored[col], preds[col])
    print(f"  {col:15s}: {auc:.4f}")

# classification report @ 0.5 threshold
binary_preds = (preds >= 0.5).astype(int)
print("\nClassification Report (threshold=0.5):")
print(classification_report(scored, binary_preds, target_names=label_cols))


First 5 predictions:
                 id     toxic  severe_toxic   obscene    threat    insult  \
0  00001cee341fdb12  1.000000      0.029503  1.000000  0.039739  0.999935   
1  0000247867823ef7  0.009600      0.005285  0.002576  0.001493  0.004915   
2  00013b17ad220c46  0.056865      0.005284  0.022372  0.001323  0.032402   
3  00017563c3f7919a  0.000966      0.000185  0.000952  0.000056  0.000715   
4  00017695ad8997eb  0.023909      0.007190  0.010391  0.003560  0.035509   

   identity_hate  
0       0.632566  
1       0.002123  
2       0.011014  
3       0.000020  
4       0.008936   

Summary statistics of predicted probabilities:
                  count      mean       std           min       25%       50%  \
toxic          153164.0  0.197519  0.333026  8.125400e-75  0.000996  0.021087   
severe_toxic   153164.0  0.017504  0.082349  0.000000e+00  0.000056  0.001564   
obscene        153164.0  0.116715  0.270034  0.000000e+00  0.000658  0.007829   
threat         153164.0  0.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
