<h1>Cloning GitHub Repo</h1>

In [None]:
#----------Cloning GitHub Repo------------
#!git clone https://github.com/SimoneRoma21/PKMN_VictoryBinaryClassification.git
!git pull

In [None]:
#---------Appending the right path----------
import sys

# Folders of the cloned repository that must be on the import path.
# Make sure to replace the repository folder with the actual folder name
repo_path = [
    '/kaggle/working/PKMN_VictoryBinaryClassification/py',
    '/kaggle/working/PKMN_VictoryBinaryClassification/data',
]

# sys.path entries have to be strings, so each folder is added on its own
# (appending the list itself would add one entry the import system ignores).
# Folders already present are skipped so re-running the cell adds no duplicates.
sys.path.extend(folder for folder in repo_path if folder not in sys.path)
print(sys.path)

# Make the repository's py/ folder the working directory: the relative paths
# used by later cells ('predict_csv/...', '../data/...') are resolved from here.
%cd /kaggle/working/PKMN_VictoryBinaryClassification/py

<H1>Imports</H1>

In [None]:
import json
from pathlib import Path
import pandas as pd
# The original import order is kept on purpose: the wildcard imports below run
# before the sklearn imports, so they cannot shadow the explicit names that follow.
from dataset.dataset_construction import Feature, FeaturePipeline
from dataset.csv_utilities import *
from dataset.extract_utilities import *
from ModelTrainer import ModelTrainer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns

<H1>Selected Features</H1>

In [None]:
# Features handed to FeaturePipeline, grouped by kind. Paired P1/P2 features
# share a line; the order of the members is significant and left untouched.
selected_features = [
    # --- Stat-based features ---
    Feature.P1_FINAL_TEAM_HP, Feature.P2_FINAL_TEAM_HP,
    Feature.MEAN_SPE_LAST,
    Feature.MEAN_HP_LAST,
    Feature.MEAN_ATK_LAST,
    Feature.MEAN_SPA_LAST,
    Feature.MEAN_STATS_LAST,
    Feature.MEAN_CRIT,

    # --- Ratios computed on the stats ---
    Feature.HP_BULK_RATIO,
    Feature.SPE_ATK_RATIO,
    Feature.OFF_DEF_RATIO,
    Feature.OFF_SPAD_RATIO,

    # --- Differential (trend) features on the stats ---
    Feature.HP_TREND_DIFF,
    Feature.ATK_TREND_DIFF,
    Feature.SPA_TREND_DIFF,
    Feature.SPE_TREND_DIFF,

    # --- Information collected during the battle ---
    Feature.P1_ALIVE_PKMN, Feature.P2_ALIVE_PKMN,
    Feature.P1_SWITCHES_COUNT, Feature.P2_SWITCHES_COUNT,
    Feature.P1_AVG_HP_WHEN_SWITCHING, Feature.P2_AVG_HP_WHEN_SWITCHING,
    Feature.P1_MAX_DEBUFF_RECEIVED, Feature.P2_MAX_DEBUFF_RECEIVED,
    Feature.P1_AVG_MOVE_POWER, Feature.P2_AVG_MOVE_POWER,
    Feature.AVG_MOVE_POWER_DIFFERENCE,
    Feature.P1_OFFENSIVE_RATIO, Feature.P2_OFFENSIVE_RATIO,
    Feature.OFFENSIVE_RATIO_DIFFERENCE,
    Feature.P1_MOVED_FIRST_COUNT, Feature.P2_MOVED_FIRST_COUNT,
    Feature.SPEED_ADVANTAGE_RATIO,

    # --- Status of the Pokemon (frozen, paralyzed, asleep, poisoned, burned) ---
    Feature.P1_FROZEN_PKMN, Feature.P2_FROZEN_PKMN,
    Feature.P1_PARALIZED_PKMN, Feature.P2_PARALIZED_PKMN,
    Feature.P1_SLEEP_PKMN, Feature.P2_SLEEP_PKMN,
    Feature.P1_POISON_PKMN, Feature.P2_POISON_PKMN,
    Feature.P1_BURNED_PKMN, Feature.P2_BURNED_PKMN,

    # --- Pokemon move features ---
    Feature.P1_PKMN_REFLECT, Feature.P2_PKMN_REFLECT,
    Feature.P1_PKMN_REST, Feature.P2_PKMN_REST,
    Feature.P1_PKMN_EXPLOSION, Feature.P2_PKMN_EXPLOSION,
    Feature.P1_PKMN_THUNDERWAVE, Feature.P2_PKMN_THUNDERWAVE,
    Feature.P1_PKMN_RECOVER, Feature.P2_PKMN_RECOVER,
    Feature.P1_PKMN_TOXIC, Feature.P2_PKMN_TOXIC,
    Feature.P1_PKMN_FIRESPIN, Feature.P2_PKMN_FIRESPIN,
]
    

<h1>Main code for training and evaluation</h1>

In [None]:
def evaluate_test_set(trainer: ModelTrainer, feature_list: list, test_file_path: str,
                      output_path: str = 'predict_csv/predictions_gridCV2.csv'):
    """Predict the test set with a trained model and write the submission CSV.

    Args:
        trainer: ModelTrainer whose ``predict`` method is called on the
            extracted test features (it is expected to be trained already).
        feature_list: Features to extract; passed to ``FeaturePipeline``.
        test_file_path: Path of the JSON Lines file with one test battle per line.
        output_path: Destination of the submission CSV. Defaults to the path
            that used to be hard-coded, so existing calls behave as before.

    Returns:
        None. The result is the CSV written to ``output_path`` with the
        columns ``battle_id`` and ``player_won``.
    """
    feature_pipeline = FeaturePipeline(feature_list, cache_dir="../data/test_feature_cache")

    print("\nLoading test data...")
    with open(test_file_path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing empty line) are not valid JSON documents
        # and would make json.loads raise, so they are skipped.
        test_data = [json.loads(line) for line in f if line.strip()]

    # Extract features from test set
    print("\nExtracting features from test data...")
    test_df = feature_pipeline.extract_features(test_data, show_progress=True)

    # battle_id is an identifier, not a model input
    X_test = test_df.drop(['battle_id'], axis=1, errors='ignore')

    # Predict on test set
    predictions = trainer.predict(X_test)

    submission = pd.DataFrame({
        'battle_id': test_df['battle_id'],
        'player_won': predictions
    })

    # DataFrame.to_csv does not create missing folders, so make sure the
    # destination directory exists before writing.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    submission.to_csv(destination, index=False)


feature_pipeline = FeaturePipeline(selected_features)

# Competition input files and the CSV where the extracted training features are stored
train_file_path = '/kaggle/input/fds-pokemon-battles-prediction-2025/train.jsonl'
test_file_path = '/kaggle/input/fds-pokemon-battles-prediction-2025/test.jsonl'
train_out_path="predict_csv/train_features_extracted.csv"

print("Loading training data...")
with open(train_file_path, 'r', encoding='utf-8') as f:
    # One JSON document per line; blank lines are skipped because json.loads
    # raises on them.
    train_data = [json.loads(line) for line in f if line.strip()]

# Extract the features for train_set
print("\nExtracting features from training data...")
train_df = feature_pipeline.extract_features(train_data)
print("\nTraining features preview:")
print(train_df.head())

# Save dataset in a CSV file. DataFrame.to_csv does not create missing
# folders, so the output directory is created first if needed.
Path(train_out_path).parent.mkdir(parents=True, exist_ok=True)
train_df.to_csv(train_out_path, index=False)

#---------------Model Training and Evaluation Code------------------------

# Discard the training row whose index label is 4877
# (NOTE(review): the reason for excluding it is not recorded in this notebook).
train_df = train_df.drop(labels=4877, axis=0)

# Target column, and the model inputs without identifier and target
y_train = train_df['player_won']
X_train = train_df.drop(columns=['battle_id', 'player_won'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Two-step estimator: robust scaling followed by logistic regression
print("\nCreating pipeline with RobustScaler and LogisticRegression...")
pipeline = Pipeline(steps=[
    ('scaler', RobustScaler()),
    ('classifier', LogisticRegression(max_iter=2000, random_state=42)),
])

# Hyper-parameter grid for the logistic regression; the "classifier__" prefix
# routes each entry to the pipeline step named 'classifier'.
param_grid = dict(
    classifier__C=[0.01, 0.1, 1, 10, 100],
    classifier__penalty=['l1', 'l2'],
    classifier__solver=['liblinear', 'saga'],
    classifier__max_iter=[1000, 2000],
)

grid_logreg = GridSearchCV(
    pipeline,
    param_grid,
    scoring='roc_auc',        # 'accuracy' is the alternative noted by the author
    cv=5,                     # 5-fold cross-validation
    refit=True,               # refit the best configuration on all data given to fit
    return_train_score=True,
    n_jobs=-1,                # use every available core
)


# Hand the grid search to the project's ModelTrainer helper, train it on the
# training split and evaluate it on the held-out validation split.
# NOTE(review): train() presumably fits grid_logreg in place, since
# best_score_ / best_params_ are read from it right below — confirm in ModelTrainer.
trainer = ModelTrainer(grid_logreg)
trainer.train(X_tr, y_tr)
trainer.evaluate(X_val, y_val)

# Cross-validated score (roc_auc, per the scoring setting) and winning parameter combination
print("Best CV score:", grid_logreg.best_score_)
print("Best params:", grid_logreg.best_params_)


# ------------------ Evaluate on Test Set -----------------

# Extract the test features, predict them with the trained model and write the submission CSV
evaluate_test_set(trainer, selected_features, test_file_path)

# Parameters noted by the author (presumably the output of an earlier run):
# Best params: {'classifier__C': 1, 'classifier__max_iter': 2000, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
