# Exploring Models 3rd

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import re

import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping
from catboost import CatBoostRegressor, Pool

from sklearn.svm import SVR
from xgboost import XGBRegressor

import random

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import optuna

USE_OPTUNA = False

In [None]:
!python /kaggle/input/xgb-lgbm-cat-5/xgb_lgbm_cat.py

In [None]:
!python /kaggle/input/lgbm-cat-2/lgbm_cat.py

In [None]:
# original_data_path = '/kaggle/input/used-car-price-prediction-dataset/used_cars.csv'
# data_path = '/kaggle/input/playground-series-s4e9/'

# original = pd.read_csv(original_data_path)
# train = pd.read_csv(data_path + 'train.csv')
# test = pd.read_csv(data_path + 'test.csv')
# submission = pd.read_csv(data_path + 'sample_submission.csv')

## XGBoost

In [None]:
train = pd.read_csv('/kaggle/input/playground-series-s4e9/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e9/test.csv')
Original = pd.read_csv('/kaggle/input/used-car-price-prediction-dataset/used_cars.csv')


Original[['milage', 'price']] = Original[['milage', 'price']].map(
    lambda x: int(''.join(re.findall(r'\d+', x))))

train = pd.concat([train, Original], ignore_index=True)

In [None]:
cat_cols = train.select_dtypes(include=['object', 'category']).columns.tolist()

In [None]:
cat_features = test.select_dtypes('object').columns
# Reference: https://www.kaggle.com/code/ambrosm/pss4e8-eda-which-makes-sense
for feature in cat_features:          
    categories = sorted(list(set(train[feature].dropna())))
    dtype = pd.CategoricalDtype(categories=categories, ordered=False)
    
    train.loc[~train[feature].isin(categories), feature] = np.nan
    test.loc[~test[feature].isin(categories), feature] = np.nan
    
    train[feature] = train[feature].astype(dtype)
    test[feature] = test[feature].astype(dtype)

In [None]:
X_train = train.drop(columns=["id", "price"])
y_train = train["price"]

X_test = test.drop(columns=["id"])

In [None]:
def cross_validate_model_x(model, X_train, y_train, X_test, params, n_splits=5):
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    val_scores = []
    test_preds = np.zeros((len(X_test), n_splits), dtype=np.float32)

    # Cross-validation loop
    for fold, (train_ind, valid_ind) in enumerate(cv.split(X_train)):
        X_fold_train = X_train.iloc[train_ind]
        y_fold_train = y_train.iloc[train_ind]
        X_val = X_train.iloc[valid_ind]
        y_val = y_train.iloc[valid_ind]

        # Model initialization and training
        clf = model(**params, enable_categorical=True)
        clf.fit(X_fold_train, y_fold_train, eval_set=[(X_val, y_val)], verbose=0, early_stopping_rounds=50)

        # Predict and evaluate
        val_pred = clf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, val_pred))
        val_scores.append(rmse)
        test_preds[:, fold] = clf.predict(X_test)

    test_preds = np.mean(test_preds, axis=1)
    return clf, test_preds, np.mean(val_scores)  # Returning the mean RMSE

In [None]:
xgb_params={
    
'lambda': 0.17694956261235095,
 'alpha': 1.3889763984339085,
 'colsample_bytree': 0.7,
 'subsample': 1.0,
 'learning_rate': 0.008,
 'max_depth': 17,
 'random_state': 2020,
 'min_child_weight': 59,
 'n_estimators': 10000,
 'tree_method': 'gpu_hist'
 
}


print('XGBoost Cross-Validation Results:\n')
xgb_model, test_predsx, avg_rmse_xgb = cross_validate_model_x(XGBRegressor, X_train, y_train, X_test, xgb_params)

In [None]:
avg_rmse_xgb

In [None]:
xgb_result =  pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')
xgb_result['price'] = test_predsx.astype(np.float32)
xgb_result

## LGBM

In [None]:
train = pd.read_csv('/kaggle/input/playground-series-s4e9/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e9/test.csv')
Original = pd.read_csv('/kaggle/input/used-car-price-prediction-dataset/used_cars.csv')


Original[['milage', 'price']] = Original[['milage', 'price']].map(
    lambda x: int(''.join(re.findall(r'\d+', x))))

train = pd.concat([train, Original], ignore_index=True)

In [None]:
def update(df):
    
    t = 100
    
    cat_c = ['brand','model','fuel_type','engine','transmission','ext_col','int_col','accident','clean_title']
    re_ = ['model','engine','transmission','ext_col','int_col']
    
    for col in re_:
        df.loc[df[col].value_counts(dropna=False)[df[col]].values < t, col] = "noise"
        
    for col in cat_c:
        df[col] = df[col].fillna('missing')
        df[col] = df[col].astype('category')
        
    return df

train  = update(train)
test   = update(test)

In [None]:
X_train = train.drop(columns=["id", "price"])
y_train = train["price"]

X_test = test.drop(columns=["id"])

In [None]:
def cross_validate_model_l(model, X_train, y_train, X_test, params, n_splits=10):
    callbacks = [log_evaluation(period=150), early_stopping(stopping_rounds=200)]
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    val_scores = []
    test_preds = np.zeros((len(X_test), n_splits), dtype=np.float32)

    for fold, (train_ind, valid_ind) in enumerate(cv.split(X_train)):
        X_fold_train = X_train.iloc[train_ind]
        y_fold_train = y_train.iloc[train_ind]
        X_val = X_train.iloc[valid_ind]
        y_val = y_train.iloc[valid_ind]

        clf = model(**params)
        clf.fit(X_fold_train, y_fold_train, eval_set=[(X_val, y_val)], callbacks=callbacks)

        val_pred = clf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, val_pred))
        val_scores.append(rmse)
        test_preds[:, fold] = clf.predict(X_test)

    test_preds = np.mean(test_preds, axis=1)
    return clf, test_preds, np.mean(val_scores)

In [None]:
from lightgbm import LGBMRegressor
# params are taken from this notebook: https://www.kaggle.com/code/noodl35/optuna-lgbm-tuning-used-cars
lgb_params = {
                'num_leaves': 426,
                 'max_depth': 20,
                 'learning_rate': 0.011353178352988012,
                 'n_estimators': 10000,
                 'metric': 'rmse',
                 'subsample': 0.5772552201954328,
                 'colsample_bytree': 0.9164865430101521,
                 'reg_alpha': 1.48699088003429e-06,
                 'reg_lambda': 0.41539458543414265,
                 'min_data_in_leaf': 73,
                 'feature_fraction': 0.751673655170548,
                 'bagging_fraction': 0.5120415391590843,
                 'bagging_freq': 2,
                 'random_state': 42,
                 'min_child_weight': 0.017236362383443497,
                 'cat_smooth': 54.81317407769262,
                 'verbose' : -1             # Set to -1 for silent mode, no process information printed
}

print('LightGBM Cross-Validation Results:\n')
lgb_model,test_predsl, avg_rmse_lgb= cross_validate_model_l(LGBMRegressor, X_train, y_train, X_test, lgb_params)

In [None]:
avg_rmse_lgb

In [None]:
lgb_result =  pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')
lgb_result['price'] = test_predsl.astype(np.float32)
lgb_result

## CatBoost

In [None]:
def cross_validate_model_c(model, X_train, y_train, X_test, params, n_splits=10):
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    val_scores = []
    test_preds = np.zeros((len(X_test), n_splits), dtype=np.float32)

    for fold, (train_ind, valid_ind) in enumerate(cv.split(X_train)):
        X_fold_train = X_train.iloc[train_ind]
        y_fold_train = y_train.iloc[train_ind]
        X_val = X_train.iloc[valid_ind]
        y_val = y_train.iloc[valid_ind]

        clf = model(**params)
        clf.fit(X_fold_train, y_fold_train, eval_set=[(X_val, y_val)], verbose=0)

        val_pred = clf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, val_pred))
        val_scores.append(rmse)
        test_preds[:, fold] = clf.predict(X_test)

    test_preds = np.mean(test_preds, axis=1)
    return clf, test_preds, np.mean(val_scores)

In [None]:
from catboost import CatBoostClassifier

cat_params = {
    'cat_features':cat_cols,  
    'learning_rate': 0.075,
    'iterations': 5000,
    'depth': 9,
    'random_strength': 0,
    'l2_leaf_reg': 0.5,
    'task_type': 'GPU',
    'max_leaves': 512,
    'fold_permutation_block': 64,
    'random_seed': 42,
    'verbose': False                      
}

print('CatBoost Cross-Validation Results:\n')
cat_model, test_predsc, avg_rmse_cat= cross_validate_model_c(CatBoostRegressor, X_train, y_train, X_test, cat_params)

In [None]:
avg_rmse_cat

In [None]:
cat_result =  pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')
cat_result['price'] = test_predsc.astype(np.float32)
cat_result

- avg_rmse_xgb: 72609.76512922066
- avg_rmse_lgb: 72342.5950606447
- avg_rmse_cat: 72590.30626530075

In [None]:
Pred = pd.concat([lgb_result,cat_result,xgb_result], axis=1)['price']
print(Pred)
test_preds = test_predsl * 0.7 + test_predsc * 0.1 + test_predsx*0.2

In [None]:
ensemble_sub =  pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')
ensemble_sub['price'] = test_preds

ensemble_sub.to_csv('submission_ensemble.csv', index=False)

## Top 5

In [None]:
def read_csv(number: int) -> pd.DataFrame:
    file = pd.read_csv(f'/kaggle/input/top-4-car-prices-used/{number}.csv')  # Use an f-string for cleaner formatting
    return file

csv0 = read_csv(72016)
csv1 = read_csv(72018)
csv2 = read_csv(72020)
csv3 = read_csv(72021)
csv4 = read_csv(72025)

In [None]:
csv = csv0
csv.rename(columns={'price':'72016'}, inplace=True)
csv['72018'] = csv1['price']
csv['72020'] = csv2['price']
csv['72021'] = csv3['price']
csv['72025'] = csv4['price']

csv.head(10).round(0)

In [None]:
csv['price'] = 0.999*csv['72016'] + 0.1*csv['72018'] + 0.0001*csv['72020']+ 0.0001*csv['72021']+ 0.0001*csv['72025']
csv

In [None]:
top5_sub = pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')
top5_sub['price'] = csv['price']
top5_sub.to_csv('./submission_top5.csv', index=False)
top5_sub.head()

In [None]:
final_sub = pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')
final_sub['price'] = (ensemble_sub['price']) * 0.4 + 0.6 * top5_sub['price']
final_sub.to_csv("submission.csv", index=False)
final_sub.head()

In [64]:
final2_sub = pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')
ensemble2_sub = pd.read_csv('/kaggle/working/submission_ensemble.csv')
top52_sub = pd.read_csv('/kaggle/working/submission_top5.csv')

final2_sub['price'] = (ensemble2_sub['price']) * 0.3 + 0.7 * top52_sub['price']
final2_sub.to_csv("submission_final2.csv", index=False)
final2_sub.head()

Unnamed: 0,id,price
0,188533,18700.530431
1,188534,84286.923762
2,188535,58978.337041
3,188536,31631.433046
4,188537,31792.025625
