In [14]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, PredefinedSplit, KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
from sklearn.impute import KNNImputer
from sklearn.metrics import classification_report, roc_auc_score
from lightgbm import LGBMClassifier

In [20]:
def KNN_Imputer(df_train, df_test):
    df_tr_imp = df_train.copy()
    df_ts_imp = df_test.copy()
    
    encoders = {}

    for df in [df_tr_imp, df_ts_imp]:
        df[['CabinDeck', 'CabinNum', 'CabinSide']] = df['Cabin'].str.split('/', expand=True)
        df['CabinNum'] = pd.to_numeric(df['CabinNum'], errors='coerce')
        df[['GroupId', 'PersId']] = df['PassengerId'].str.split('_', expand=True)

    numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'CabinNum']
    categorical_features = ['HomePlanet', 'Destination', 'CryoSleep', 'VIP', 'CabinDeck', 'CabinSide']
    
    df_tr_imp.drop(['Name', 'Cabin', 'PassengerId', 'Transported'], axis=1, inplace=True)
    df_ts_imp.drop(['Name', 'Cabin', 'PassengerId'], axis=1, inplace=True)

    # Encode categorical variables, replacing NaN with -1 before encoding
    for feature in categorical_features:
        encoder = LabelEncoder()
        
        # Replace NaNs with -1 temporarily (LightGBM & KNNImputer work better this way)
        df_tr_imp[feature] = df_tr_imp[feature].astype(str).replace('nan', np.nan).fillna('-1')
        df_ts_imp[feature] = df_ts_imp[feature].astype(str).replace('nan', np.nan).fillna('-1')

        # Fit encoder only on training data and transform both sets
        df_tr_imp[feature] = encoder.fit_transform(df_tr_imp[feature])  
        df_ts_imp[feature] = encoder.transform(df_ts_imp[feature])
        
        encoders[feature] = encoder  

    num_imputer = KNNImputer(n_neighbors=10, weights='uniform', metric='nan_euclidean')
    cat_imputer = KNNImputer(n_neighbors=10, weights='uniform', metric='nan_euclidean')

    df_tr_imp[numerical_features] = num_imputer.fit_transform(df_tr_imp[numerical_features])
    df_ts_imp[numerical_features] = num_imputer.transform(df_ts_imp[numerical_features])

    df_tr_imp[categorical_features] = cat_imputer.fit_transform(df_tr_imp[categorical_features])
    df_ts_imp[categorical_features] = cat_imputer.transform(df_ts_imp[categorical_features])

    # Convert categorical values back to integers
    for feature in categorical_features:
        df_tr_imp[feature] = df_tr_imp[feature].round().astype(int)
        df_ts_imp[feature] = df_ts_imp[feature].round().astype(int)

        # Ensure values are within valid range before inverse_transform()
        max_label_value = len(encoders[feature].classes_) - 1
        df_tr_imp[feature] = df_tr_imp[feature].clip(0, max_label_value)
        df_ts_imp[feature] = df_ts_imp[feature].clip(0, max_label_value)

        # Convert back to original category names
        df_tr_imp[feature] = encoders[feature].inverse_transform(df_tr_imp[feature])
        df_ts_imp[feature] = encoders[feature].inverse_transform(df_ts_imp[feature])

    return df_tr_imp, df_ts_imp

In [55]:
def make_submission(df_test,df_test_raw, model):
    y_hat_test = model.predict(df_test)
    df_final = pd.DataFrame({
        df_test_raw.columns[0]: df_test_raw.iloc[:, 0],  
        "Transported": y_hat_test.astype(bool)   
    })
    return df_final

In [22]:
train_raw, test_raw = pd.read_csv('train.csv'), pd.read_csv('test.csv')
train_imp, test_imp = KNN_Imputer(train_raw, test_raw)

In [77]:
param_grid_lgb = {
    'n_estimators': [100, 300, 500, 700, 1000],     
    'learning_rate': [0.01, 0.05, 0.1],      
    'max_depth': [-1, 6, 8, 10],               
    'num_leaves': [20, 31, 50, 100],            
    'min_child_samples': [10, 20, 50],      
    'colsample_bytree': [0.7, 0.9, 1.0],     
    'subsample': [0.7, 0.9, 1.0],            
    'reg_alpha': [0, 0.1, 0.5],             
    'reg_lambda': [0, 0.1, 0.5],            
}


### LGBMClassifier

In [16]:
kf = KFold(n_splits=5, shuffle=True, random_state=42) 
model_lgb = LGBMClassifier(boosting_type='gbdt', random_state=42)

In [72]:
lgb_cv = RandomizedSearchCV(
    estimator=model_lgb,
    param_distributions=param_grid_lgb,
    n_iter=20,  
    cv=kf,
    verbose=2,
    random_state=42,
    n_jobs=-1 
)

In [42]:
numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
categorical_features = ['HomePlanet', 'Destination', 'CryoSleep', 'CabinDeck', 'CabinSide']

In [23]:
train_imp_lgb = train_imp.copy()
test_imp_lgb = test_imp.copy()
train_imp_lgb[categorical_features] = train_imp_lgb[categorical_features].astype('category')
test_imp_lgb[categorical_features] = test_imp_lgb[categorical_features].astype('category')

In [48]:
train_imp_lgb['GroupId'] = train_imp_lgb['GroupId'].astype('float64')
test_imp_lgb['GroupId'] = test_imp_lgb['GroupId'].astype('float64')
X_lgb = train_imp_lgb.drop(['VIP', 'PersId'], axis=1)
test_lgb = test_imp_lgb.drop(['VIP', 'PersId'], axis=1)
y_lgb = train_raw['Transported']
y_lgb = y_lgb.astype('category')

In [49]:
X_norm_lgb = X_lgb.copy()
X_norm_lgb[numerical_features] = preprocessing.StandardScaler().fit(X_norm_lgb[numerical_features]).transform(X_norm_lgb[numerical_features])
test_norm_lgb = test_lgb.copy()
test_norm_lgb[numerical_features] = preprocessing.StandardScaler().fit(test_norm_lgb[numerical_features]).transform(test_norm_lgb[numerical_features])

In [66]:
X_train_lgb, X_test_lgb, y_train_lgb, y_test_lgb = train_test_split(X_norm_lgb, y_lgb, test_size=0.15, random_state=42)

In [73]:
lgb_cv.fit(X_train_lgb, y_train_lgb, categorical_feature=categorical_features)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[LightGBM] [Info] Number of positive: 3731, number of negative: 3658
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000354 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1922
[LightGBM] [Info] Number of data points in the train set: 7389, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504940 -> initscore=0.019760
[LightGBM] [Info] Start training from score 0.019760


In [74]:
print("tuned hyperparameters :(best parameters) ",lgb_cv.best_params_)
print("accuracy :",lgb_cv.best_score_)

tuned hyperparameters :(best parameters)  {'subsample': 0.9, 'reg_lambda': 0, 'reg_alpha': 0, 'num_leaves': 20, 'n_estimators': 700, 'min_child_samples': 50, 'max_depth': 10, 'learning_rate': 0.01, 'colsample_bytree': 0.7}
accuracy : 0.8110707895443255


In [75]:
pred_subm_lgb = make_submission(test_norm_lgb, test_raw, lgb_cv)

In [76]:
pred_subm_lgb.to_csv("prediction-03.csv", index=False)

### This model has 0.81155 score on Kaggle, 23rd place on Leaderboard (31.03.2025)