In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb
from sklearn.metrics import roc_auc_score,accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore")
import os

## Parameters

In [None]:
# Global knobs for the whole notebook.
RAND_VAL = 42     # seed passed to LightGBM (random_state) and to the StratifiedKFold shuffle
num_folds = 7     # number of cross-validation folds
n_est = 12_000    # upper bound on boosting rounds (n_estimators)

## Read Source Files

In [None]:
# Load both competition files and tag each row with its origin
# (typ == 0 -> train, typ == 1 -> test) so the frames can be stacked for
# shared feature engineering and separated again afterwards.
df_train = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv").assign(typ=0)
df_test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv").assign(typ=1)

# Stack train on top of test with a fresh 0..n-1 index.
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# Row counts: train, test, combined (one per line).
print(len(df_train), len(df_test), len(df_all), sep="\n")
df_all.head()

## Feature Engineering

In [None]:
def getFeats(df):
    """Engineer model features on ``df`` in place and return the same frame.

    Steps, in order:
      1. Split ``Name`` into ``firstName`` / ``lastName`` and ``Cabin`` into
         its three "/"-separated parts ``cab0`` / ``cab1`` / ``cab2``.
      2. Fill missing categorical values with ``'Unknown'`` and label-encode
         them to integer codes.
      3. Mean-impute the numeric columns, then min-max scale them.
      4. Cast ``CryoSleep``, ``VIP`` and ``cab1`` to int (missing -> 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Name``, ``Cabin``, ``HomePlanet``,
        ``Destination``, ``CryoSleep``, ``VIP``, ``RoomService``,
        ``FoodCourt``, ``ShoppingMall``, ``Spa``, ``VRDeck`` and ``Age``.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in (it is mutated, not copied).

    Notes
    -----
    The encoders, the imputation means and the scaler are all fitted on
    whatever rows ``df`` holds at call time.
    """

    ## Categorical Features
    # Split on the LAST space only, so a name with more than two tokens still
    # yields exactly two parts (an unbounded split would produce extra columns
    # and fail on assignment). ``reindex`` guarantees both columns exist even
    # when no name contains a space.
    name_parts = df['Name'].str.rsplit(" ", n=1, expand=True).reindex(columns=[0, 1])
    df['firstName'] = name_parts[0]
    df['lastName'] = name_parts[1]

    # Same guard for the cabin: at most three parts, always three columns.
    cabin_parts = df['Cabin'].str.split("/", n=2, expand=True).reindex(columns=[0, 1, 2])
    df['cab0'] = cabin_parts[0]
    df['cab1'] = cabin_parts[1]
    df['cab2'] = cabin_parts[2]

    cat_cols = ['HomePlanet', 'Destination', 'lastName', 'cab0', 'cab2']
    for c in cat_cols:
        # Missing values become their own 'Unknown' category before encoding.
        df[c] = LabelEncoder().fit_transform(df[c].fillna('Unknown'))

    ## Numerical Features
    numCols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Age']
    # Impute each column with its own mean, then scale all columns with a
    # single scaler (one instance is enough; it scales per column).
    df[numCols] = df[numCols].fillna(df[numCols].mean())
    df[numCols] = MinMaxScaler().fit_transform(df[numCols])

    ## Integer Features
    for c in ['CryoSleep', 'VIP']:
        df[c] = df[c].fillna(0).astype('int')
    # cab1 holds text after the split; coerce so that a non-numeric cabin
    # number falls back to 0 instead of raising during the int cast.
    df['cab1'] = pd.to_numeric(df['cab1'], errors='coerce').fillna(0).astype('int')

    return df

In [None]:
# Run feature engineering once on the stacked frame, then separate it again
# using the origin flag (typ == 0 -> train, typ == 1 -> test).
df_all = getFeats(df_all)
df_train = df_all.loc[df_all['typ'] == 0].reset_index(drop=True)
df_test = df_all.loc[df_all['typ'] == 1].reset_index(drop=True)

# Model inputs: every column except the identifier, the target, the origin
# flag, and the raw text columns that were already split into parts.
feat_cols = df_train.columns.drop([
    'PassengerId',
    'Transported',
    'typ',
    'Cabin',
    'Name',
    'firstName',
])
print(feat_cols)
df_train.head()

In [None]:
# Training design matrix and integer-cast target.
X = df_train[feat_cols]
y = df_train['Transported'].astype('int')

# Keyword arguments shared by every LGBMClassifier built in this notebook.
lgbParams = dict(
    n_estimators=n_est,
    learning_rate=0.021,
    random_state=RAND_VAL,
)

In [None]:
# Fit one model on all training rows, used here only to inspect which
# features contribute the most gain.
LGB = lgb.LGBMClassifier(**lgbParams)
LGB.fit(X, y)

lgb.plot_importance(
    LGB,
    importance_type="gain",
    max_num_features=12,
    figsize=(12, 10),
    title="LightGBM Feature Importance (Gain)",
)
plt.show()

## Training

In [None]:
# Stratified K-fold cross-validation. Each fold trains a fresh model with
# early stopping on its validation split, records validation accuracy, and
# stores that fold's class predictions for the test set.
folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=RAND_VAL)
test_preds = np.empty((num_folds, len(df_test)))  # one row of 0/1 predictions per fold
acc_vals = []

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):

    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[valid_idx], y.iloc[valid_idx]

    LGB = lgb.LGBMClassifier(**lgbParams)
    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds=` / `verbose=` keyword arguments of fit() were
    # removed in LightGBM 4.0 and raise TypeError there.
    LGB.fit(
        X_train, y_train,
        eval_metric='auc',
        eval_set=[(X_val, y_val)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
        ],
    )

    # X_val already contains exactly the feat_cols columns.
    y_pred_val = LGB.predict(X_val)
    acc_val = accuracy_score(y_val, y_pred_val)
    print("Accuracy for fold ",n_fold,": ",acc_val)
    acc_vals.append(acc_val)

    # Hard class labels per fold; averaging them later and thresholding at 0.5
    # amounts to a majority vote across folds.
    y_pred_test = LGB.predict(df_test[feat_cols])
    test_preds[n_fold, :] = y_pred_test
    print("----------------")

In [None]:
"Mean Accuracy: ",np.mean(acc_vals)

## Submission

In [None]:
# Fraction of folds that predicted "transported" for each test passenger.
y_pred = test_preds.mean(axis=0)

# Take an explicit copy so that adding a column below writes to an
# independent frame rather than to a selection of df_test (the pattern that
# triggers pandas' SettingWithCopyWarning).
df_sub = df_test[['PassengerId']].copy()
df_sub['Transported'] = y_pred > 0.5  # True when more than half of the folds voted 1
df_sub.head()

In [None]:
# Write the submission file without the index column.
df_sub.to_csv(
    "submission.csv",
    index=False,
)