In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the sample submission shipped with the competition data and preview it.
sample_submission = pd.read_csv(
    "../input/petfinder-pawpularity-score/sample_submission.csv"
)
sample_submission.head()

In [None]:
# Load the training table (it supplies the Id and Pawpularity columns used below).
train_df = pd.read_csv(
    "../input/petfinder-pawpularity-score/train.csv"
)
train_df.head()

In [None]:
# Load the test table; its Id column is reused when the submission is built.
test_df = pd.read_csv(
    "../input/petfinder-pawpularity-score/test.csv"
)
test_df.head()

In [None]:
def kfold_split(df:pd.DataFrame, n_splits:int, shuffle:bool, random_state:int):
    """Return a shuffled copy of ``df`` with a ``kfold`` validation-fold column.

    Rows are shuffled, the index is reset, and each row is assigned the index
    of the fold in which it serves as validation data. Folds are stratified on
    the ``Pawpularity`` column.

    Args:
        df: Frame containing a ``Pawpularity`` column. It is not modified.
        n_splits: Number of folds.
        shuffle: Forwarded to ``StratifiedKFold`` (shuffle within each class).
        random_state: Seed used for the row shuffle and, when ``shuffle`` is
            true, for ``StratifiedKFold``.

    Returns:
        A new DataFrame with a fresh ``RangeIndex`` and an integer ``kfold``
        column holding values in ``0 .. n_splits - 1``.
    """
    from sklearn.model_selection import StratifiedKFold

    # ``sample`` returns a new frame, so the caller's frame is left untouched.
    # Seeding it keeps the fold assignment reproducible across kernel restarts.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    df['kfold'] = -1

    # scikit-learn rejects a random_state when shuffle is False, so only
    # forward the seed when it is actually used.
    kf = StratifiedKFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )

    for fold, (_, valid_idx) in enumerate(kf.split(X=df, y=df.Pawpularity.values)):
        # valid_idx is positional; it lines up with labels because the index was reset.
        df.loc[valid_idx, 'kfold'] = fold

    return df

In [None]:
# Tag every training row with one of five folds; seed 42 is passed to the splitter.
train_df = kfold_split(
    df=train_df,
    n_splits=5,
    shuffle=True,
    random_state=42,
)
train_df.head()

In [None]:
import optuna
import catboost as ctb
from xgboost import XGBRegressor
import lightgbm as lgb

from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

def objective(trial,model,df):
    """Optuna objective: fit one candidate model and score it on fold 0.

    Rows with ``kfold == 0`` form the hold-out set and every other row is used
    for training. ``Id``, ``Pawpularity`` and ``kfold`` are dropped from the
    features; ``Pawpularity`` is the regression target.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        model: One of ``"catboost"``, ``"xgboost"`` or ``"lightgbm"``.
        df: Training frame containing ``Id``, ``Pawpularity`` and ``kfold``.

    Returns:
        Root-mean-squared error of the fitted model on the fold-0 hold-out.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    supported = ("catboost", "xgboost", "lightgbm")
    if model not in supported:
        # Fail loudly instead of falling through and handing None to Optuna.
        raise ValueError(f"Unsupported model {model!r}; expected one of {supported}")

    train_part = df[df.kfold != 0].reset_index(drop=True)
    valid_part = df[df.kfold == 0].reset_index(drop=True)

    y_train = train_part.Pawpularity
    y_test = valid_part.Pawpularity

    non_features = ['Id', 'Pawpularity', 'kfold']
    x_train = train_part.drop(columns=non_features)
    x_test = valid_part.drop(columns=non_features)

    if model == "catboost":

        param = {
            'iterations': trial.suggest_int("iterations", 100, 2000),
            'learning_rate': trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
            'subsample': trial.suggest_float("subsample", 0.1, 1.0),
            'depth': trial.suggest_int("depth", 5, 9),
            'bagging_temperature': trial.suggest_int("bagging_temperature", 0, 5),
        }

        regressor = ctb.CatBoostRegressor(**param, random_state=42, verbose=0)
        regressor.fit(x_train, y_train, eval_set=[(x_test, y_test)])
        preds = regressor.predict(x_test)

    elif model == "xgboost":

        param = {
            'reg_lambda': trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True),
            'reg_alpha': trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True),
            'learning_rate': trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
            'subsample': trial.suggest_float("subsample", 0.1, 1.0),
            'max_depth': trial.suggest_int("max_depth", 1, 7),
            'colsample_bytree': trial.suggest_float("colsample_bytree", 0.1, 1.0),
        }

        # NOTE(review): gpu_id=1 addresses the second CUDA device; confirm the
        # runtime actually exposes more than one GPU.
        regressor = XGBRegressor(
            **param,
            random_state=42,
            tree_method="gpu_hist",
            gpu_id=1,
            predictor="gpu_predictor",
        )
        # verbose=False keeps the per-round eval log out of the notebook output.
        regressor.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=False)
        preds = regressor.predict(x_test)

    else:  # lightgbm

        # Sampled under the name "n_estimators" so it still appears in
        # study.best_params, but passed explicitly as num_boost_round rather
        # than competing with a second, hard-coded round count.
        n_rounds = trial.suggest_int("n_estimators", 64, 8192)

        param = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': {'rmse'},
            'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.25, log=True),
            'num_leaves': trial.suggest_int("num_leaves", 4, 16),
            'max_depth': trial.suggest_int("max_depth", 4, 16),
            'feature_fraction': trial.suggest_float("feature_fraction", 0.1, 1.0),
            'lambda_l1': trial.suggest_float("lambda_l1", 1e-8, 100.0, log=True),
            'lambda_l2': trial.suggest_float("lambda_l2", 1e-8, 100.0, log=True),
            'seed': 42,
            'deterministic': True,
            'verbose': -1,
        }

        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_val = lgb.Dataset(x_test, y_test)

        # Bound to a new name so the ``model`` argument is not overwritten.
        booster = lgb.train(
            param,
            lgb_train,
            num_boost_round=n_rounds,
            valid_sets=[lgb_train, lgb_val],
            early_stopping_rounds=100,
            verbose_eval=False,
        )
        preds = booster.predict(x_test)

    # Report RMSE (not MSE) so the value matches the per-fold scores printed
    # by fit_fold_with_best_params; the minimiser is the same either way.
    rmse = float(np.sqrt(metrics.mean_squared_error(y_test, preds)))

    return rmse

In [None]:
from functools import partial


def optimize(model,n_trials,df=None,seed=42):
    """Run an Optuna study for ``model`` and return the best hyperparameters.

    Args:
        model: Model name forwarded to ``objective``.
        n_trials: Number of trials to run.
        df: Training frame forwarded to ``objective``. Defaults to the
            notebook-level ``train_df`` when omitted.
        seed: Seed for the TPE sampler so repeated runs explore the same
            sequence of candidates. Pass ``None`` for an unseeded search.

    Returns:
        ``study.best_params``: the parameter dict of the lowest-scoring trial.
    """
    if df is None:
        df = train_df

    optimizer_func = partial(objective, model=model, df=df)

    # TPE is Optuna's default single-objective sampler; constructing it
    # explicitly only adds the seed.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="minimize", sampler=sampler)
    study.optimize(optimizer_func, n_trials=n_trials)

    return study.best_params

In [None]:
# Tune LightGBM (100 trials) and keep the best parameter set.
lgb_best_params = optimize(model="lightgbm", n_trials=100)

In [None]:
# Tune CatBoost (100 trials) and keep the best parameter set.
catboost_best_params = optimize(model="catboost", n_trials=100)

In [None]:
# Tune XGBoost (30 trials) and keep the best parameter set.
xgboost_best_params = optimize(model="xgboost", n_trials=30)

In [None]:
def fit_fold_with_best_params(df,test_df,fold,model,params):
    """Train ``model`` once per fold and average the test predictions.

    For each ``fold_idx`` in ``range(fold)``, rows with ``kfold == fold_idx``
    are held out for validation and the rest are used for training. The
    validation RMSE is printed for every fold.

    Args:
        df: Training frame with ``Id``, ``Pawpularity`` and ``kfold`` columns.
        test_df: Test frame with an ``Id`` column plus the feature columns.
        fold: Number of folds to iterate over (fold ids ``0 .. fold - 1``).
        model: One of ``"catboost"``, ``"xgboost"`` or ``"lightgbm"``.
        params: Hyperparameters for the chosen model. For LightGBM an
            ``n_estimators`` entry is used as the number of boosting rounds.

    Returns:
        1-D array with one prediction per test row, averaged over the folds.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    supported = ("catboost", "xgboost", "lightgbm")
    if model not in supported:
        raise ValueError(f"Unsupported model {model!r}; expected one of {supported}")

    non_features = ['Id', 'Pawpularity', 'kfold']

    # The test features do not depend on the fold, so build them once.
    x_test = test_df.drop(columns=['Id'])

    fold_predictions = []

    for fold_idx in range(fold):

        train_part = df[df.kfold != fold_idx].reset_index(drop=True)
        valid_part = df[df.kfold == fold_idx].reset_index(drop=True)

        y_train = train_part.Pawpularity
        y_valid = valid_part.Pawpularity

        x_train = train_part.drop(columns=non_features)
        x_valid = valid_part.drop(columns=non_features)

        if model == "catboost":

            estimator = ctb.CatBoostRegressor(**params, random_state=42, verbose=0)
            estimator.fit(x_train, y_train, eval_set=[(x_valid, y_valid)])

        elif model == "xgboost":

            # NOTE(review): gpu_id=1 addresses the second CUDA device; confirm
            # the runtime actually exposes more than one GPU.
            estimator = XGBRegressor(
                **params,
                random_state=42,
                tree_method="gpu_hist",
                gpu_id=1,
                predictor="gpu_predictor",
            )
            # verbose=False keeps the per-round eval log out of the notebook output.
            estimator.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)

        else:  # lightgbm

            # Start from the fixed settings used while tuning so the final
            # models are seeded and quiet; caller-supplied params win.
            lgb_params = {
                'boosting_type': 'gbdt',
                'objective': 'regression',
                'seed': 42,
                'deterministic': True,
                'verbose': -1,
                **params,
            }
            # Pass the tuned round count explicitly instead of leaving it in
            # the params next to a hard-coded num_boost_round.
            n_rounds = lgb_params.pop('n_estimators', 5000)

            # NOTE(review): unlike the tuning objective, no validation set or
            # early stopping is used here; every round is trained.
            # The booster is bound to ``estimator``, never to ``model``:
            # rebinding ``model`` made every later fold skip all branches.
            estimator = lgb.train(
                lgb_params,
                lgb.Dataset(x_train, y_train),
                num_boost_round=n_rounds,
                verbose_eval=False,
            )

        valid_preds = estimator.predict(x_valid)
        rmse = np.sqrt(metrics.mean_squared_error(y_valid, valid_preds))

        print(f"RMSE of {model} for the fold {fold_idx} : {rmse}")

        fold_predictions.append(estimator.predict(x_test))

    # One column per fold; average across folds for each test row.
    final_predictions = np.mean(np.column_stack(fold_predictions), axis=1)

    return final_predictions

In [None]:
# Fit each tuned model on all five folds and collect one fold-averaged
# test-prediction vector per model (catboost, xgboost, lightgbm, in that order).
tuned_params = {
    'catboost': catboost_best_params,
    'xgboost': xgboost_best_params,
    'lightgbm': lgb_best_params,
}

model_predictions = [
    fit_fold_with_best_params(df=train_df, test_df=test_df, fold=5, model=name, params=best)
    for name, best in tuned_params.items()
]

In [None]:
# Blend the models: one column per model, averaged row-wise.
last = np.column_stack(model_predictions).mean(axis=1)
 

In [None]:
# Pair every test Id with its blended prediction.
ids = test_df['Id'].to_numpy()

submission = pd.DataFrame({'Id': ids}).assign(Pawpularity=last)

In [None]:
# Write the submission file to the working directory without the index column.
submission.to_csv("submission.csv", index=False)