In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-classification-finetuning/sample_submission.csv
/kaggle/input/llm-classification-finetuning/train.csv
/kaggle/input/llm-classification-finetuning/test.csv


In [2]:
# ============================================================
# LLM Classification Finetuning — Advanced Ensemble (No textstat)
# Goal: strong baseline with rich features + LR/XGBoost/LightGBM/RF ensemble
# Produces: submission.csv
# ============================================================

import re, warnings, numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
from scipy.sparse import hstack, csr_matrix

# XGBoost / LightGBM are preinstalled in Kaggle
import xgboost as xgb
import lightgbm as lgb

warnings.filterwarnings("ignore")

DATA_DIR = Path("/kaggle/input/llm-classification-finetuning")
assert DATA_DIR.exists(), "Add competition dataset on the right panel."

# ------------------------ Load ------------------------
train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df  = pd.read_csv(DATA_DIR / "test.csv")
target_cols = ['winner_model_a', 'winner_model_b', 'winner_tie']

print("Shapes:", train_df.shape, test_df.shape)

# ------------------------ Targets ------------------------
def get_winner_label(row):
    if row['winner_model_a'] == 1: return 0
    if row['winner_model_b'] == 1: return 1
    return 2

train_df['winner_label'] = train_df.apply(get_winner_label, axis=1)
y = train_df['winner_label'].values
y_multiclass = train_df[target_cols].values

# ------------------------ Utilities ------------------------
def safe_len_words(s):
    if not isinstance(s, str) or not s: return 0
    return len(s.split())

def safe_len_chars(s):
    if not isinstance(s, str): return 0
    return len(s)

def split_sentences(s):
    if not isinstance(s, str) or not s: return []
    # split on ., !, ? while keeping it simple
    parts = re.split(r'[.!?]+', s)
    return [p.strip() for p in parts if p.strip()]

def estimate_syllables_word(word: str) -> int:
    """
    Very rough English syllable estimator (vowel groups).
    For non-Latin scripts, fall back to 1.
    """
    if not word:
        return 0
    if not re.search(r'[a-zA-Z]', word):
        return 1
    w = word.lower()
    w = re.sub(r'[^a-z]', '', w)
    if not w:
        return 1
    groups = re.findall(r'[aeiouy]+', w)
    count = len(groups)
    # silent 'e'
    if w.endswith('e') and len(groups) > 1:
        count -= 1
    return max(1, count)

def estimate_syllables_text(text: str) -> int:
    if not isinstance(text, str) or not text:
        return 0
    words = text.split()
    return int(sum(estimate_syllables_word(w) for w in words))

def flesch_reading_ease_proxy(text: str) -> float:
    """
    Flesch Reading Ease (proxy using our syllable estimator):
    206.835 - 1.015*(words/sentences) - 84.6*(syllables/words)
    Handles edge cases safely. Returns value clipped to [-50, 120].
    """
    if not isinstance(text, str) or not text.strip():
        return 0.0
    words = max(1, safe_len_words(text))
    sents = max(1, len(split_sentences(text)))
    sylls = max(1, estimate_syllables_text(text))
    fre = 206.835 - 1.015*(words/sents) - 84.6*(sylls/words)
    return float(np.clip(fre, -50, 120))

def fk_grade_proxy(text: str) -> float:
    """
    Flesch-Kincaid Grade Level (proxy):
    0.39*(words/sentences) + 11.8*(syllables/words) - 15.59
    Clipped to [0, 20]
    """
    if not isinstance(text, str) or not text.strip():
        return 0.0
    words = max(1, safe_len_words(text))
    sents = max(1, len(split_sentences(text)))
    sylls = max(1, estimate_syllables_text(text))
    grade = 0.39*(words/sents) + 11.8*(sylls/words) - 15.59
    return float(np.clip(grade, 0, 20))

# ------------------------ Feature Engineering ------------------------
def advanced_text_features(text):
    if not isinstance(text, str) or text == '':
        return {
            'char_count': 0, 'word_count': 0, 'sentence_count': 0,
            'avg_word_length': 0, 'question_count': 0, 'exclamation_count': 0,
            'uppercase_ratio': 0, 'digit_count': 0, 'special_char_count': 0,
            'readability_score': 0, 'grade_level': 0
        }
    char_count = len(text)
    words = text.split()
    word_count = len(words)
    sentences = split_sentences(text)
    sentence_count = len(sentences) if sentences else 1
    avg_word_length = float(np.mean([len(w) for w in words])) if words else 0.0
    question_count = text.count('?')
    exclamation_count = text.count('!')
    uppercase_ratio = sum(1 for c in text if c.isupper()) / max(1, len(text))
    digit_count = sum(1 for c in text if c.isdigit())
    special_char_count = len(re.findall(r'[^a-zA-Z0-9\s]', text))

    # proxies (no textstat)
    readability_score = flesch_reading_ease_proxy(text)
    grade_level = fk_grade_proxy(text)

    return {
        'char_count': char_count,
        'word_count': word_count,
        'sentence_count': sentence_count,
        'avg_word_length': avg_word_length,
        'question_count': question_count,
        'exclamation_count': exclamation_count,
        'uppercase_ratio': uppercase_ratio,
        'digit_count': digit_count,
        'special_char_count': special_char_count,
        'readability_score': readability_score,
        'grade_level': grade_level
    }

def create_advanced_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # Prompt / A / B individual features
    prompt_feat = df['prompt'].apply(advanced_text_features)
    for k in prompt_feat.iloc[0].keys():
        df[f'prompt_{k}'] = [d[k] for d in prompt_feat]

    a_feat = df['response_a'].apply(advanced_text_features)
    for k in a_feat.iloc[0].keys():
        df[f'response_a_{k}'] = [d[k] for d in a_feat]

    b_feat = df['response_b'].apply(advanced_text_features)
    for k in b_feat.iloc[0].keys():
        df[f'response_b_{k}'] = [d[k] for d in b_feat]

    # Comparative features
    df['length_diff'] = df['response_a_char_count'] - df['response_b_char_count']
    df['length_ratio'] = df['response_a_char_count'] / (df['response_b_char_count'] + 1)
    df['word_diff'] = df['response_a_word_count'] - df['response_b_word_count']
    df['word_ratio'] = df['response_a_word_count'] / (df['response_b_word_count'] + 1)
    df['sentence_diff'] = df['response_a_sentence_count'] - df['response_b_sentence_count']
    df['readability_diff'] = df['response_a_readability_score'] - df['response_b_readability_score']
    df['grade_diff'] = df['response_a_grade_level'] - df['response_b_grade_level']

    # Simple quality indicators
    df['a_more_detailed'] = (df['response_a_char_count'] > df['response_b_char_count']).astype(int)
    df['a_more_questions'] = (df['response_a_question_count'] > df['response_b_question_count']).astype(int)
    df['a_more_readable'] = (df['response_a_readability_score'] > df['response_b_readability_score']).astype(int)
    df['a_better_grade'] = (df['response_a_grade_level'] < df['response_b_grade_level']).astype(int)

    # Interaction proxies
    df['prompt_response_a_similarity'] = df['prompt_word_count'] / (df['response_a_word_count'] + 1)
    df['prompt_response_b_similarity'] = df['prompt_word_count'] / (df['response_b_word_count'] + 1)

    # Texts for vectorization
    df['combined_text'] = df['prompt'].astype(str) + " [SEP] " + df['response_a'].astype(str) + " [SEP] " + df['response_b'].astype(str)
    df['response_comparison'] = df['response_a'].astype(str) + " [CMP] " + df['response_b'].astype(str)

    return df

print("Engineering features (train)…")
train_df = create_advanced_features(train_df)
print("Engineering features (test)…")
test_df  = create_advanced_features(test_df)

# ------------------------ Numeric feature lists ------------------------
numerical_features = [c for c in train_df.columns if any(x in c for x in [
    'char_count', 'word_count', 'sentence_count', 'avg_word_length',
    'question_count', 'exclamation_count', 'uppercase_ratio', 'digit_count',
    'special_char_count', 'readability_score', 'grade_level', '_diff', '_ratio',
    'more_detailed', 'more_questions', 'more_readable', 'better_grade', 'similarity'
])]

X_num = train_df[numerical_features].fillna(0.0).astype(np.float32)
X_num_test = test_df[numerical_features].fillna(0.0).astype(np.float32)

# Standardize numeric (dense)
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)
X_num_test_scaled = scaler.transform(X_num_test)

# ------------------------ TF-IDF blocks ------------------------
def make_tfidf(train_texts, test_texts, max_features, ngram=(1,2), analyzer='word', min_df=2, max_df=0.95):
    vec = TfidfVectorizer(max_features=max_features, ngram_range=ngram,
                          analyzer=analyzer, min_df=min_df, max_df=max_df, sublinear_tf=True,
                          stop_words='english')
    Xtr = vec.fit_transform(train_texts)
    Xte = vec.transform(test_texts)
    return Xtr, Xte

# Combined text
X_tfidf_combined, X_tfidf_combined_test = make_tfidf(
    train_df['combined_text'], test_df['combined_text'], max_features=3000, ngram=(1,3)
)

# Response comparison
X_tfidf_cmp, X_tfidf_cmp_test = make_tfidf(
    train_df['response_comparison'], test_df['response_comparison'], max_features=2000, ngram=(1,2)
)

# Char n-grams
char_vec_tr, char_vec_te = make_tfidf(
    train_df['combined_text'], test_df['combined_text'], max_features=1000, ngram=(2,4), analyzer='char', min_df=5, max_df=0.9
)

# Combine
X_combined = hstack([
    csr_matrix(X_num_scaled),
    X_tfidf_combined,
    X_tfidf_cmp,
    char_vec_tr
])
X_combined_test = hstack([
    csr_matrix(X_num_test_scaled),
    X_tfidf_combined_test,
    X_tfidf_cmp_test,
    char_vec_te
])

print("Combined features:", X_combined.shape, X_combined_test.shape)

# ------------------------ Train/Val split ------------------------
X_tr, X_val, y_tr, y_val = train_test_split(
    X_combined, y, test_size=0.2, random_state=42, stratify=y
)
_, _, y_tr_multi, y_val_multi = train_test_split(
    X_combined, y_multiclass, test_size=0.2, random_state=42, stratify=y
)

# ------------------------ Models ------------------------
# 1) Logistic Regression
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs',
                        max_iter=2000, random_state=42, C=0.5, class_weight='balanced')
lr.fit(X_tr, y_tr)
lr_pred = lr.predict_proba(X_val)
lr_loss = log_loss(y_val_multi, lr_pred)
print(f"LR logloss: {lr_loss:.4f}")

# 2) XGBoost
xgb_clf = xgb.XGBClassifier(
    n_estimators=220, max_depth=6, learning_rate=0.09,
    subsample=0.9, colsample_bytree=0.9, random_state=42,
    eval_metric='mlogloss', tree_method='hist'
)
xgb_clf.fit(X_tr, y_tr)
xgb_pred = xgb_clf.predict_proba(X_val)
xgb_loss = log_loss(y_val_multi, xgb_pred)
print(f"XGB logloss: {xgb_loss:.4f}")

# 3) LightGBM
lgb_clf = lgb.LGBMClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.08,
    subsample=0.9, colsample_bytree=0.9, random_state=42,
    objective='multiclass', num_class=3, metric='multi_logloss', verbose=-1
)
lgb_clf.fit(X_tr, y_tr)
lgb_pred = lgb_clf.predict_proba(X_val)
lgb_loss = log_loss(y_val_multi, lgb_pred)
print(f"LGB logloss: {lgb_loss:.4f}")

# 4) Random Forest
rf = RandomForestClassifier(
    n_estimators=180, max_depth=12, min_samples_split=5, min_samples_leaf=2,
    class_weight='balanced', n_jobs=-1, random_state=42
)
rf.fit(X_tr, y_tr)
rf_pred = rf.predict_proba(X_val)
rf_loss = log_loss(y_val_multi, rf_pred)
print(f"RF logloss: {rf_loss:.4f}")

# Weighted ensemble (inverse-loss weights)
losses = np.array([lr_loss, xgb_loss, lgb_loss, rf_loss])
weights = 1.0 / np.maximum(1e-6, losses)
weights = weights / weights.sum()
print("Model weights (LR, XGB, LGB, RF):", np.round(weights, 3))

ens_val = np.average([lr_pred, xgb_pred, lgb_pred, rf_pred], axis=0, weights=weights)
ens_loss = log_loss(y_val_multi, ens_val)
print(f"Ensemble logloss: {ens_loss:.4f}")

# ------------------------ 5-fold CV on whole training ------------------------
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
for fold, (tr_idx, vl_idx) in enumerate(skf.split(X_combined, y), 1):
    Xtr, Xvl = X_combined[tr_idx], X_combined[vl_idx]
    ytr, yvl = y[tr_idx], y[vl_idx]
    yvl_multi = y_multiclass[vl_idx]

    m1 = LogisticRegression(multi_class='multinomial', solver='lbfgs',
                            max_iter=2000, random_state=42, C=0.5, class_weight='balanced').fit(Xtr, ytr)
    m2 = xgb.XGBClassifier(
        n_estimators=220, max_depth=6, learning_rate=0.09,
        subsample=0.9, colsample_bytree=0.9, random_state=42,
        eval_metric='mlogloss', tree_method='hist'
    ).fit(Xtr, ytr)
    m3 = lgb.LGBMClassifier(
        n_estimators=300, max_depth=6, learning_rate=0.08,
        subsample=0.9, colsample_bytree=0.9, random_state=42,
        objective='multiclass', num_class=3, verbose=-1
    ).fit(Xtr, ytr)
    m4 = RandomForestClassifier(
        n_estimators=180, max_depth=12, min_samples_split=5, min_samples_leaf=2,
        class_weight='balanced', n_jobs=-1, random_state=42
    ).fit(Xtr, ytr)

    preds = [m1.predict_proba(Xvl), m2.predict_proba(Xvl), m3.predict_proba(Xvl), m4.predict_proba(Xvl)]
    fold_pred = np.average(preds, axis=0, weights=weights)
    fold_loss = log_loss(yvl_multi, fold_pred)
    cv_scores.append(fold_loss)
    print(f"Fold {fold} logloss: {fold_loss:.4f}")

print("CV mean ± 2*std:", np.mean(cv_scores), "±", 2*np.std(cv_scores))

# ------------------------ Final fit on full data ------------------------
final_lr  = LogisticRegression(multi_class='multinomial', solver='lbfgs',
                              max_iter=2000, random_state=42, C=0.5, class_weight='balanced').fit(X_combined, y)
final_xgb = xgb.XGBClassifier(
    n_estimators=220, max_depth=6, learning_rate=0.09,
    subsample=0.9, colsample_bytree=0.9, random_state=42,
    eval_metric='mlogloss', tree_method='hist'
).fit(X_combined, y)
final_lgb = lgb.LGBMClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.08,
    subsample=0.9, colsample_bytree=0.9, random_state=42,
    objective='multiclass', num_class=3, verbose=-1
).fit(X_combined, y)
final_rf  = RandomForestClassifier(
    n_estimators=180, max_depth=12, min_samples_split=5, min_samples_leaf=2,
    class_weight='balanced', n_jobs=-1, random_state=42
).fit(X_combined, y)

# ------------------------ Predict test & save ------------------------
preds_test = np.average(
    [
        final_lr.predict_proba(X_combined_test),
        final_xgb.predict_proba(X_combined_test),
        final_lgb.predict_proba(X_combined_test),
        final_rf.predict_proba(X_combined_test),
    ],
    axis=0, weights=weights
)

submission = pd.DataFrame({
    'id': test_df['id'],
    'winner_model_a': preds_test[:, 0],
    'winner_model_b': preds_test[:, 1],
    'winner_tie':     preds_test[:, 2],
})

# small renorm (just in case)
row_sum = submission[['winner_model_a','winner_model_b','winner_tie']].sum(axis=1)
for c in ['winner_model_a','winner_model_b','winner_tie']:
    submission[c] = submission[c] / row_sum

submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")

Shapes: (57477, 9) (3, 4)
Engineering features (train)…
Engineering features (test)…
Combined features: (57477, 6046) (3, 6046)
LR logloss: 1.0895
XGB logloss: 1.0371
LGB logloss: 1.0379
RF logloss: 1.0516
Model weights (LR, XGB, LGB, RF): [0.242 0.254 0.254 0.25 ]
Ensemble logloss: 1.0383
Fold 1 logloss: 1.0421
Fold 2 logloss: 1.0423
Fold 3 logloss: 1.0389
Fold 4 logloss: 1.0418
Fold 5 logloss: 1.0446
CV mean ± 2*std: 1.0419416205513516 ± 0.0036310502623490225
Saved submission.csv
