In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# --- Load the training data ---------------------------------------------
path = '/kaggle/input/llm-classification-finetuning/'
df = pd.read_csv(path + 'train.csv')

print("Shape:", df.shape)
display(df.head())

# --- Outcome distribution -----------------------------------------------
print("\nLabel distribution (winner_model_a / b / tie):")
# Cast to plain int so the printed dict is not cluttered with numpy scalar
# reprs (e.g. np.int64(...)) on recent numpy versions.
counts = {
    'model_a_win': int(df['winner_model_a'].sum()),
    'model_b_win': int(df['winner_model_b'].sum()),
    'tie': int(df['winner_tie'].sum())
}
print(counts)

outcome_names = list(counts.keys())
plt.figure(figsize=(5,4))
# Passing `palette` without `hue` is deprecated in seaborn >= 0.13, so the
# x variable is also used as hue. dodge=False keeps one full-width bar per
# category, and any auto-generated legend is removed because it would only
# repeat the x tick labels.
ax = sns.barplot(
    x=outcome_names,
    y=list(counts.values()),
    hue=outcome_names,
    palette='mako',
    dodge=False,
)
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title("Win/Tie Distribution")
plt.show()

# --- Which models appear most often on each side ------------------------
print("\nMost frequent models in model_a:")
print(Counter(df['model_a']).most_common(5))
print("\nMost frequent models in model_b:")
print(Counter(df['model_b']).most_common(5))

# --- Word clouds --------------------------------------------------------
print("\nWordClouds for prompts and responses")
# Only the first 5000 rows feed the word clouds; slice *before* converting
# to str so the remaining rows are never converted.
text_prompt = " ".join(df['prompt'].head(5000).astype(str))
text_resp_a = " ".join(df['response_a'].head(5000).astype(str))
text_resp_b = " ".join(df['response_b'].head(5000).astype(str))

wordcloud_panels = [
    ('Prompt', text_prompt),
    ('Response A', text_resp_a),
    ('Response B', text_resp_b),
]

plt.figure(figsize=(15,6))
for panel_idx, (panel_title, panel_text) in enumerate(wordcloud_panels, start=1):
    plt.subplot(1, 3, panel_idx)
    plt.imshow(WordCloud(width=600, height=400, background_color='white').generate(panel_text))
    plt.axis('off')
    plt.title(panel_title)
plt.show()

# --- A few raw examples -------------------------------------------------
print("\nExample comparison samples:")
for i in range(3):
    # A different fixed seed per iteration gives three reproducible rows.
    row = df.sample(1, random_state=i).iloc[0]
    if row['winner_model_a'] == 1:
        winner = 'A'
    elif row['winner_model_b'] == 1:
        winner = 'B'
    else:
        winner = 'Tie'
    print(f"\nPrompt:\n{row['prompt'][:200]}...")
    print(f"\nResponse A:\n{row['response_a'][:200]}...")
    print(f"\nResponse B:\n{row['response_b'][:200]}...")
    print(f"Winner: {winner}")


### Does response length influence which model wins?

In [None]:
# Collapse the three one-hot winner columns into a single label:
# 1 = A wins, 2 = B wins, 3 = tie.
df['label'] = df['winner_model_a'] + 2 * df['winner_model_b'] + 3 * df['winner_tie']

# Character length of each response and the signed difference (A minus B).
df['len_a'] = df['response_a'].astype(str).str.len()
df['len_b'] = df['response_b'].astype(str).str.len()
df['len_diff'] = df['len_a'] - df['len_b']

# Density of the length difference, one curve per outcome.
plt.figure(figsize=(8,5))
for label_value, legend_name in ((1, 'A win'), (2, 'B win'), (3, 'Tie')):
    sns.kdeplot(df.loc[df['label'] == label_value, 'len_diff'], label=legend_name, fill=True)
plt.axvline(0, color='black', linestyle='--', linewidth=1)
plt.xlim(-2000, 2000)
plt.title("Length Difference (A - B) by Outcome")
plt.xlabel("len_diff (positive = A longer)")
plt.legend()
plt.show()

# Same comparison as box plots.
plt.figure(figsize=(6,5))
sns.boxplot(data=df, x='label', y='len_diff')
plt.title("Length Difference Distribution by Label")
plt.xlabel("Label (1=A win, 2=B win, 3=Tie)")
plt.ylabel("len_diff")
plt.axhline(0, color='black', linestyle='--', linewidth=1)
plt.show()

print("\nAverage length difference by label:")
len_diff_summary = df.groupby('label')['len_diff'].describe()
print(len_diff_summary[['mean','std','min','max']])

# Two-sided Mann-Whitney U test on len_diff: rows where A won vs. rows where B won.
a_win = df.loc[df['label'] == 1, 'len_diff']
b_win = df.loc[df['label'] == 2, 'len_diff']

stat, p_value = stats.mannwhitneyu(a_win, b_win, alternative='two-sided')
print("\nMann-Whitney U test (A win vs B win):")
print(f"Statistic = {stat:.2f}, p-value = {p_value:.4f}")

verdict = (
    "Significant difference in length when A wins vs B wins."
    if p_value < 0.05
    else "No significant difference in length when A wins vs B wins."
)
print(verdict)


### Is response length enough to predict the winner?

In [None]:
# Features: the two response lengths and their difference; target: 1/2/3 label.
length_features = ['len_a', 'len_b', 'len_diff']
X = df[length_features]
y = df['label']

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# fit() returns the estimator itself, so construction and fitting can be chained.
baseline = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = baseline.predict(X_test)

print("Baseline: Logistic Regression on length-based features")
print(classification_report(y_test, y_pred))

### Does wording itself help predict which response wins?

In [None]:
# Build one text field per row: prompt, then both responses, joined by marker
# tokens. Missing values are replaced with '' exactly as the test-time
# preprocessing in the submission cell does; without this a single NaN would
# make the whole concatenation NaN and TfidfVectorizer would reject it.
df['combined'] = (
    df['prompt'].fillna('') + " [SEP] "
    + df['response_a'].fillna('') + " [VS] "
    + df['response_b'].fillna('')
)

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df['combined'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

# Unigram + bigram TF-IDF (capped at 10,000 features) feeding a logistic regression.
text_baseline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
    ('clf', LogisticRegression(max_iter=1000))
])

text_baseline.fit(X_train, y_train)
y_pred = text_baseline.predict(X_test)

print("Baseline: TF-IDF + Logistic Regression")
print(classification_report(y_test, y_pred))

The length-based model did a bit better, reaching around 43% accuracy, mostly by picking up that longer answers tend to win.
The TF-IDF model dropped to about 38%, so bag-of-words wording alone doesn’t seem to explain much — the real difference probably comes from how well each response is written overall.

In [None]:
def _combine_text(frame):
    """Join prompt and both responses into one string per row.

    Missing values are replaced with '' so that training and test text are
    built by exactly the same rule and no NaN reaches the vectorizer.
    """
    return (
        frame['prompt'].fillna('') + " [SEP] "
        + frame['response_a'].fillna('') + " [VS] "
        + frame['response_b'].fillna('')
    )


# Refit the TF-IDF baseline on the full training set (no hold-out split).
df['combined'] = _combine_text(df)
X_train = df['combined']
y_train = df['label']

text_baseline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
    ('clf', LogisticRegression(max_iter=1000))
])

text_baseline.fit(X_train, y_train)

test_df = pd.read_csv(path + 'test.csv')
test_df['combined'] = _combine_text(test_df)

probs_test = text_baseline.predict_proba(test_df['combined'])

# predict_proba columns follow text_baseline.classes_, so map each class label
# to its submission column explicitly instead of relying on column position.
# Label encoding used for df['label']: 1 = A wins, 2 = B wins, 3 = tie.
label_to_column = {1: 'winner_model_a', 2: 'winner_model_b', 3: 'winner_tie'}
proba_by_column = {
    label_to_column[int(class_label)]: probs_test[:, col_idx]
    for col_idx, class_label in enumerate(text_baseline.classes_)
}

submission = pd.DataFrame({
    'id': test_df['id'],
    'winner_model_a': proba_by_column['winner_model_a'],
    'winner_model_b': proba_by_column['winner_model_b'],
    'winner_tie': proba_by_column['winner_tie']
})

submission.to_csv('/kaggle/working/submission.csv', index=False)
submission.head()