In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

In [None]:
# Read the three competition CSV files: training data, test data and the
# sample submission, in that order.
train_df, test_df, submission_template = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/llm-classification-finetuning/train.csv",
        "/kaggle/input/llm-classification-finetuning/test.csv",
        "/kaggle/input/llm-classification-finetuning/sample_submission.csv",
    )
)

# Preview the first rows of the training frame as the cell output.
train_df.head()

In [None]:
# Bar chart of the number of positive rows in each of the three label columns.
label_totals = (
    train_df[['winner_model_a', 'winner_model_b', 'winner_tie']]
    .sum()
    .reset_index()  # columns become 'index' (label name) and 0 (its sum)
)

plt.figure(figsize=(6, 4))
sns.barplot(x='index', y=0, data=label_totals, palette='Set2')
plt.title("Distribusi Label (Jumlah Positif per Kelas)")
plt.xlabel("Label")
plt.ylabel("Jumlah")
plt.show()

# Text lengths: whitespace-separated word counts for the prompt and both
# responses, stored as new columns on train_df and plotted side by side.
length_specs = [
    # (source column, length column, histogram colour, panel title)
    ('prompt', 'prompt_len', 'skyblue', "Panjang Prompt"),
    ('response_a', 'resp_a_len', 'lightgreen', "Panjang Response A"),
    ('response_b', 'resp_b_len', 'salmon', "Panjang Response B"),
]


def _word_count(value):
    """Return the number of whitespace-separated tokens in ``str(value)``."""
    return len(str(value).split())


for source_col, length_col, _, _ in length_specs:
    train_df[length_col] = train_df[source_col].apply(_word_count)

# One histogram (with KDE overlay) per length column.
fig, axs = plt.subplots(1, 3, figsize=(18, 4))
for panel, (_, length_col, panel_color, panel_title) in zip(axs, length_specs):
    sns.histplot(train_df[length_col], kde=True, ax=panel, color=panel_color)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

# Relative frequency of every observed combination of the three label columns.
label_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
label_share = train_df[label_columns].value_counts(normalize=True)
print(label_share)


In [None]:
# Make sure the NLTK corpora used for cleaning are available. A download is
# attempted only when a corpus cannot be found locally, so environments that
# already ship the data never touch the network.
for resource_path, package_name in (
    ("corpora/stopwords", "stopwords"),
    ("corpora/wordnet", "wordnet"),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name, quiet=True)

# English stop words (as a set for O(1) membership tests) and the lemmatizer
# shared by clean_text.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalise one raw text value into a space-joined string of lemmatised tokens.

    Steps: lower-case, remove URLs, replace every character outside ``a-z`` and
    whitespace with a space, collapse whitespace, then keep only tokens that
    are longer than two characters and not in ``stop_words``, lemmatising each.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.

    Returns:
        The cleaned text. An empty string is returned for ``None``/NaN input
        or when no token survives the filters.
    """
    # Missing values would otherwise be stringified to "None"/"nan" and then
    # tokenised like real words. NaN is the only float not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", "", text)  # strip URLs
    text = re.sub(r"[^a-z\s]", " ", text)       # keep letters and whitespace only
    text = re.sub(r"\s+", " ", text).strip()    # collapse runs of whitespace
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words and len(word) > 2
    ]
    return " ".join(tokens)

# Clean the three text columns of both frames in place, then combine them
# into a single input column ('full_text') per frame.
for frame in (train_df, test_df):
    for col in ('prompt', 'response_a', 'response_b'):
        frame[col] = frame[col].apply(clean_text)

    frame['full_text'] = frame['prompt'] + " " + frame['response_a'] + " " + frame['response_b']


In [None]:
# TF-IDF features over the combined text, capped at 15,000 terms.
# NOTE(review): the vectorizer is fitted on all training rows before the
# split, so validation rows contribute to the vocabulary/IDF weights - confirm
# this is acceptable for the validation metrics below.
vectorizer = TfidfVectorizer(max_features=15000)
X = vectorizer.fit_transform(train_df['full_text'])
y = train_df[['winner_model_a', 'winner_model_b', 'winner_tie']]

# Hold out 20% of the rows for validation (fixed seed for a repeatable split).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# One calibrated binary logistic regression per label column.
# 'saga' shuffles the data while optimising, so it is seeded too; without a
# random_state the fitted coefficients (and every score below) can change
# from run to run.
base_lr = LogisticRegression(max_iter=500, solver='saga', random_state=42)
model = MultiOutputClassifier(CalibratedClassifierCV(base_lr, method='sigmoid', cv=3))
model.fit(X_train, y_train)

In [None]:
# Column i holds the positive-class probability from the i-th per-label estimator.
val_preds = np.stack([model.estimators_[i].predict_proba(X_val)[:, 1] for i in range(3)], axis=1)

# The three estimators are fitted independently, so a row of val_preds does
# not generally sum to 1. Rescale each row into a proper distribution so the
# score does not depend on how the installed scikit-learn version treats
# unnormalised rows. The tiny lower clip only guards against a zero row sum.
val_preds = np.clip(val_preds, 1e-15, None)
val_preds = val_preds / val_preds.sum(axis=1, keepdims=True)

val_loss = log_loss(y_val, val_preds)
print(f"Validation Log Loss: {val_loss:.5f}")

In [None]:
# Hard predictions on the validation split; column order follows the label
# columns the model was fitted on.
y_pred_val = model.predict(X_val)

# Binary F1 per label column, comparing each true column with its prediction column.
f1_a, f1_b, f1_t = (
    f1_score(y_val[label_name], y_pred_val[:, column_idx])
    for column_idx, label_name in enumerate(['winner_model_a', 'winner_model_b', 'winner_tie'])
)

print(f"F1 Score - Model A : {f1_a:.4f}")
print(f"F1 Score - Model B : {f1_b:.4f}")
print(f"F1 Score - Tie     : {f1_t:.4f}")

# Vectorise the test text with the already-fitted TF-IDF vocabulary.
X_test = vectorizer.transform(test_df['full_text'])

# Positive-class probability from each of the three per-label estimators,
# arranged as one column per label.
test_preds = np.stack(
    [model.estimators_[label_idx].predict_proba(X_test)[:, 1] for label_idx in range(3)],
    axis=1,
)

# Clip to avoid extreme probabilities (exact 0 or 1).
# NOTE(review): the visible submission cell builds its own probabilities from
# model.predict_proba and does not read test_preds - confirm this is still needed.
eps = 1e-6
test_preds = np.clip(test_preds, eps, 1 - eps)

In [None]:
# Vectorise the test text and get one (n_samples, 2) probability array per label.
X_test = vectorizer.transform(test_df['full_text'])
pred_proba = model.predict_proba(X_test)

# Keep the positive-class column of every label: one column per label.
submission_probs = np.column_stack([label_proba[:, 1] for label_proba in pred_proba])

# The per-label estimators are independent, so a row is not guaranteed to sum
# to 1. Clip away exact 0/1 values (an infinite log-loss penalty when wrong),
# then rescale each row so the three outcomes form a probability distribution.
submission_probs = np.clip(submission_probs, 1e-6, 1 - 1e-6)
submission_probs = submission_probs / submission_probs.sum(axis=1, keepdims=True)

submission = pd.DataFrame({
    'id': test_df['id'],
    'winner_model_a': submission_probs[:, 0],
    'winner_model_b': submission_probs[:, 1],
    'winner_tie': submission_probs[:, 2],
})

submission.to_csv("submission.csv", index=False)
submission.head()