In [None]:
# Install or upgrade the two third-party packages imported in the next cell.
# NOTE(review): -U with no version pin pulls whatever is latest at run time,
# so results may drift between sessions — consider pinning versions.
%pip install -U scikit-learn sentence_transformers

In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from sentence_transformers import SentenceTransformer

from joblib import dump, load

from tqdm import tqdm
# Register tqdm's pandas integration.
# NOTE(review): no progress_* call is visible in this notebook — confirm it is still needed.
tqdm.pandas()

# Directory containing train.csv / test.csv. The trailing slash is required
# because file names are appended with plain string concatenation below.
f_path = "/kaggle/input/kmaml223/"

In [None]:
# Read the training CSV from f_path and print its shape and column index.
train = pd.read_csv(f_path + "train.csv")
print(f"Train shape: {train.shape}")
print(f"Train columns: {train.columns}")
# Read the test CSV from the same directory and print the same summary.
test = pd.read_csv(f_path + "test.csv")
print(f"Test shape: {test.shape}")
print(f"Test columns: {test.columns}")

In [None]:
# Load the "llmrails/ember-v1" sentence-embedding checkpoint and place it on the GPU.
# NOTE(review): device is hard-coded to "cuda", so this presumably fails on a
# session without a GPU accelerator — confirm one is enabled before running.
model = SentenceTransformer("llmrails/ember-v1", device = "cuda")

In [None]:
# Embed every training comment with the sentence-transformer loaded above.
# encode() is documented to take a string or a list of strings, so pass a plain
# Python list rather than a pandas Series: integer indexing on a Series is
# label-based and only happens to work with the default RangeIndex.
# Missing comments are replaced by "" so the tokenizer never receives a NaN float.
train_embeddings = model.encode(
    train['comment_text'].fillna("").astype(str).tolist()
)
train_embeddings.shape

In [None]:
# Persist the training embeddings to the notebook's working directory so the
# encode step does not have to be repeated.
np.save("/kaggle/working/train_embeddings.npy", train_embeddings)

In [None]:
# Embed every test comment with the same sentence-transformer.
# encode() is documented to take a string or a list of strings, so pass a plain
# Python list rather than a pandas Series: integer indexing on a Series is
# label-based and only happens to work with the default RangeIndex.
# Missing comments are replaced by "" so the tokenizer never receives a NaN float.
test_embeddings = model.encode(
    test['comment_text'].fillna("").astype(str).tolist()
)
test_embeddings.shape

In [None]:
# NOTE(review): repeats the shape display that already ends the test-encoding
# cell; appears redundant.
test_embeddings.shape

In [None]:
# Persist the test embeddings to the notebook's working directory.
np.save("/kaggle/working/test_embeddings.npy", test_embeddings)

In [None]:
# Load training embeddings from the attached "kmaml223-emb" input dataset,
# replacing any train_embeddings already in memory.
# NOTE(review): this path is not the /kaggle/working file written by np.save
# earlier — presumably a previously uploaded copy; confirm its rows line up
# one-to-one with train.csv.
train_embeddings = np.load("/kaggle/input/kmaml223-emb/train_embeddings.npy")
train_embeddings.shape

In [None]:
# Sequential hold-out: the leading `train_val_split` fraction of rows is used
# for fitting and the remaining tail for validation (no shuffling).
train_val_split = 0.9
n_embedded_rows = train_embeddings.shape[0]
split = int(n_embedded_rows*train_val_split)

# Features: slice the embedding matrix by row position.
X_train, X_val = train_embeddings[:split], train_embeddings[split:]

# Targets: select the six label columns once, then cut at the same row position
# so features and labels stay aligned.
label_frame = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
Y_train, Y_val = label_frame.iloc[:split], label_frame.iloc[split:]

In [None]:
import matplotlib.pyplot as plt

# Column-wise sum of the training label columns (the number of positive rows
# per label, assuming the labels are 0/1 indicators).
label_counts = Y_train.sum()

# Bar chart of those sums, drawn on an explicitly created 10x6 inch axes.
fig, ax = plt.subplots(figsize=(10, 6))
label_counts.plot(kind='bar', ax=ax, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Toxicity Labels in Y_train')
ax.set_xlabel('Toxicity Labels')
ax.set_ylabel('Count')
# Slant the tick labels so the longer label names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Column-wise sum of the validation label columns (the number of positive rows
# per label, assuming the labels are 0/1 indicators).
label_counts = Y_val.sum()

# Bar chart of those sums, drawn on an explicitly created 10x6 inch axes.
fig, ax = plt.subplots(figsize=(10, 6))
label_counts.plot(kind='bar', ax=ax, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Toxicity Labels in Y_val')
ax.set_xlabel('Toxicity Labels')
ax.set_ylabel('Count')
# Slant the tick labels so the longer label names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

In [None]:
# Fit one binary logistic-regression classifier per label column, trying several
# regularisation strengths and keeping the one with the best validation F1.
#
# Outputs (both ordered like Y_train.columns):
#   log_models - list of the selected fitted estimators
#   log_f1s    - list of their validation F1 scores
log_models = []
log_f1s = []

# Inverse-regularisation strengths to try: 0.1, 1, 10, 100, 1000.
candidate_Cs = [10**i for i in range(-1, 4)]

for col in Y_train.columns:
    best_model = None
    # Start below any attainable F1 so the first candidate is always accepted.
    # Starting at 0 with a strict ">" left best_model as None whenever every
    # candidate scored F1 == 0, and that None was then stored as the "model".
    best_f1 = float("-inf")
    for C in candidate_Cs:
        # Named `clf`, not `model`, so the SentenceTransformer bound to `model`
        # earlier in the notebook is not overwritten by a classifier.
        clf = LogisticRegression(max_iter = 5000, C = C)
        clf.fit(X_train, Y_train[col])
        Y_pred = clf.predict(X_val)

        f1 = f1_score(Y_val[col], Y_pred)

        if f1 > best_f1:
            best_f1 = f1
            best_model = clf

        print(f"{col}, C: {C}")
        print(f"F1 score: {f1}")
    log_models.append(best_model)
    log_f1s.append(best_f1)

# Mean of the per-label best validation F1 scores.
print(np.mean(log_f1s))

In [None]:
import os

# Load previously saved per-label classifiers into a {label_name: estimator} dict.
# Note that this rebinds `log_models` from the list built by the grid-search
# cell to a dict keyed by label name.
#
# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
# only point this at model files you produced yourself.
models_dir = "/kaggle/input/lr-models-kmaml"

log_models = {}
# sorted() makes the load order deterministic; os.listdir order is arbitrary.
for fname in sorted(os.listdir(models_dir)):
    model_path = os.path.join(models_dir, fname)
    # Skip sub-directories and other non-file entries, which load() cannot read.
    if not os.path.isfile(model_path):
        continue
    # Key = file name up to the first ".", minus its first three characters
    # (e.g. "LR_toxic.joblib" -> "toxic").
    label_name = fname.split(".")[0][3:]
    log_models[label_name] = load(model_path)

In [None]:
# Report validation metrics for each label's classifier.
# `log_models` is indexed by label name here, so it must be the
# {label_name: estimator} mapping rather than a positional list.
for col in Y_val.columns:
    print(col)
    clf = log_models[col]
    y_true = Y_val[col]
    # Predict once and reuse the result for both threshold-based metrics.
    y_pred = clf.predict(X_val)
    # ROC AUC is a ranking metric: feed it the positive-class probability
    # (column 1 of predict_proba), not the hard 0/1 predictions.
    y_score = clf.predict_proba(X_val)[:, 1]

    # scikit-learn metrics take (y_true, y_pred); the order matters for ROC AUC.
    print(f"F1 score: {f1_score(y_true, y_pred)}")
    print(f"Accuracy score: {accuracy_score(y_true, y_pred)}")
    print(f"Roc_Auc: {roc_auc_score(y_true, y_score)}")

In [None]:
# Save each per-label classifier to LR_<label>.joblib in the current directory.
# `log_models` may be either the positional list produced by the grid-search
# cell or the {label_name: estimator} dict produced by the loading cell.
# Zipping a dict iterates its keys, which would dump label strings instead of
# estimators, so the two shapes are handled explicitly.
if isinstance(log_models, dict):
    named_models = list(log_models.items())
else:
    named_models = list(zip(Y_train.columns, log_models))

# Loop variable is `clf`, not `model`, so the SentenceTransformer bound to
# `model` earlier in the notebook is left intact.
for col, clf in named_models:
    dump(clf, f'LR_{col}.joblib')