In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from scipy.sparse import hstack
from catboost import CatBoostClassifier
import optuna

# Kaggle data paths
train_path = '/kaggle/input/llm-classification-finetuning/train.csv'
test_path = '/kaggle/input/llm-classification-finetuning/test.csv'
submission_path = '/kaggle/input/llm-classification-finetuning/sample_submission.csv'

# Text columns that are joined (in this order) into one document per row
TEXT_COLUMNS = ['prompt', 'response_a', 'response_b']


def build_text(frame):
    """Join the prompt and both responses of each row into a single string.

    Missing values are replaced with an empty string before joining. A plain
    ``+`` concatenation would turn the whole row into NaN as soon as one of
    the three cells is missing, and TfidfVectorizer refuses NaN documents.

    Args:
        frame: DataFrame containing every column listed in TEXT_COLUMNS.

    Returns:
        A pandas Series of strings aligned with ``frame.index``.
    """
    parts = frame[TEXT_COLUMNS].fillna("").astype(str)
    return parts['prompt'] + " " + parts['response_a'] + " " + parts['response_b']


# Load the data
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
submission = pd.read_csv(submission_path)

# Build the integer target: idxmax selects the winner_* column holding the
# row maximum, which is then mapped to 0 (model A), 1 (model B) or 2 (tie)
train['winner'] = train[['winner_model_a', 'winner_model_b', 'winner_tie']].idxmax(axis=1).map({
    'winner_model_a': 0, 'winner_model_b': 1, 'winner_tie': 2
})

# Separate the input text from the target
X = build_text(train)
y = train['winner']

# Training and validation split (80 / 20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF feature extraction: the vectorizer is fitted on the training split
# only and then reused unchanged for the validation and test documents
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# TF-IDF features for the test set, built with the same text recipe
test_X = build_text(test)
test_tfidf = tfidf_vectorizer.transform(test_X)

# Hyperparameter optimisation with Optuna
def objective(trial):
    """Train one CatBoost candidate and report its validation accuracy.

    The hyperparameters are sampled from ``trial``; the model is fitted on the
    TF-IDF training matrix with the validation matrix as ``eval_set`` and
    ``early_stopping_rounds=50``.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.

    Returns:
        The value stored under ``best_score_["validation"]["Accuracy"]`` of
        the fitted model.
    """
    # Values explored by the search
    sampled = dict(
        iterations=trial.suggest_int("iterations", 100, 1000, step=100),
        depth=trial.suggest_int("depth", 4, 10),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-3, 10, log=True),
        random_strength=trial.suggest_float("random_strength", 0.1, 2),
        bagging_temperature=trial.suggest_float("bagging_temperature", 0, 1),
    )

    # Settings shared by every trial
    fixed = dict(
        task_type="GPU",
        devices="0",
        loss_function="MultiClass",
        eval_metric="Accuracy",
        verbose=0,
    )

    # Fit the candidate model silently
    candidate = CatBoostClassifier(**sampled, **fixed)
    candidate.fit(
        X_train_tfidf,
        y_train,
        eval_set=(X_val_tfidf, y_val),
        early_stopping_rounds=50,
        verbose=0,
    )

    return candidate.best_score_["validation"]["Accuracy"]

# Run the Optuna search. The sampler is seeded so the sequence of suggested
# hyperparameters can be repeated (the default sampler is unseeded).
# NOTE(review): GPU training itself may still not be bit-for-bit reproducible.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

# Best hyperparameters found by the search
best_params = study.best_params
print("Best Parameters:", best_params)

# Refit a model with the best hyperparameters.
# NOTE(review): unlike the tuning runs, no eval_metric is passed here, so
# early stopping uses CatBoost's default metric — confirm this is intended.
final_model = CatBoostClassifier(**best_params, task_type="GPU", devices="0", loss_function="MultiClass")
final_model.fit(
    X_train_tfidf,
    y_train,
    eval_set=(X_val_tfidf, y_val),
    early_stopping_rounds=50,
    verbose=100,  # log every 100th iteration instead of flooding the cell output
)

# Validation predictions, flattened to 1-D before building the report
y_pred = final_model.predict(X_val_tfidf).ravel()
print(classification_report(y_val, y_pred))

# Predicted class per test row (0 = model A, 1 = model B, 2 = tie)
test['winner'] = final_model.predict(test_tfidf).ravel()

# Submission columns hold class probabilities rather than hard 0/1 labels:
# the competition is scored with log loss, which punishes a confident wrong
# one-hot prediction far more than a calibrated probability.
test_proba = final_model.predict_proba(test_tfidf)
# Look up each probability column through classes_ instead of assuming order
class_order = list(final_model.classes_)
label_to_column = {0: 'winner_model_a', 1: 'winner_model_b', 2: 'winner_tie'}
for label, column in label_to_column.items():
    test[column] = test_proba[:, class_order.index(label)]

# Write the predictions in the competition's submission format
submission = test[['id', 'winner_model_a', 'winner_model_b', 'winner_tie']].copy()
submission.to_csv('submission.csv', index=False)
