# Import Libraries

In [None]:
import numpy as np
import pandas as pd 
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.impute import SimpleImputer

# Load Data

In [None]:
# Read the three competition CSV files (test, train, sample submission) in one pass.
test, train, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/spaceship-titanic/test.csv",
        "/kaggle/input/spaceship-titanic/train.csv",
        "/kaggle/input/spaceship-titanic/sample_submission.csv",
    )
)

In [None]:
# Stack train and test row-wise (fresh 0..n-1 index) so that feature engineering
# is applied to both at once; they are separated again later via 'Transported'.
train_test = pd.concat([train, test], ignore_index=True)

In [None]:
# Preview the first three rows of the combined frame.
train_test.head(3)

# Data Analysis and Feature Engineering

* PassengerId

In [None]:
# Use the first four characters of PassengerId as a travel-group identifier.
passenger_ids = train_test['PassengerId'].astype(str)
train_test['Group'] = passenger_ids.str.slice(stop=4)

In [None]:
# Count how many rows share each group identifier, then overwrite 'Group'
# with that count, i.e. 'Group' now holds the group size.
counts = train_test['Group'].value_counts()
train_test['Group'] = train_test['Group'].map(counts)

In [None]:
# Mean of 'Transported' for each group size.
train_test.groupby('Group')['Transported'].mean()

There seem to be differences between the groups. This can help predict the target variable.

* Cabin

In [None]:
# Split 'Cabin' on '/' and store the three resulting parts as Deck, Number and Side.
train_test[['Deck', 'Number', 'Side']] = train_test['Cabin'].str.split('/', expand=True)

In [None]:
# Check that the new Deck / Number / Side columns were added.
train_test.head(3)

*   RoomService , FoodCourt , ShoppingMall , Spa, VRDeck

In [None]:
Expenses_columns = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

# Average spend per amenity for each 'Transported' value, using the raw training frame.
expenses_grouped = train.groupby('Transported')[Expenses_columns].mean()

# DataFrame.plot opens its own figure unless it is handed an Axes, which would
# leave a separately created 12x6 figure blank. Create the Axes first and pass
# it in so the requested size is actually applied to the bar chart.
fig, ax = plt.subplots(figsize=(12, 6))
expenses_grouped.plot(kind='bar', ax=ax)
ax.set_xlabel('Transported')
ax.set_ylabel('Average Expenses')
ax.set_title('Average Expenses by Transported')
ax.legend(Expenses_columns)
plt.show()

'ShoppingMall' doesn't look like a useful feature for predicting the target variable.

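Let's create a new 'Expenses' feature from the 'RoomService', 'FoodCourt', 'Spa' and 'VRDeck' features. Note that the code below sums all five spending columns, including 'ShoppingMall'.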

In [None]:
# Total spend per passenger across all five amenity columns.
Expenses_columns = ['RoomService','FoodCourt','Spa','VRDeck','ShoppingMall']
amenity_spend = train_test[Expenses_columns]
train_test['Expenses'] = amenity_spend.sum(axis=1)

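Passengers in cryosleep cannot spend anything, so if "CryoSleep" is True then "Expenses" is 0. We use this in reverse: when "Expenses" is 0 and "CryoSleep" is missing, we fill "CryoSleep" with True.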

In [None]:
# Fill a missing CryoSleep with True when the passenger spent nothing.
# The previous row-wise lambda returned the entire row (`x`) instead of the
# existing CryoSleep value in its else-branch; a boolean mask touches only the
# rows that need filling and leaves every other value unchanged.
missing_cryo_no_spend = train_test['CryoSleep'].isna() & (train_test['Expenses'] == 0)
train_test.loc[missing_cryo_no_spend, 'CryoSleep'] = True

In [None]:
# Mean total 'Expenses' for each 'Transported' value, drawn as a bar chart.
expenses_grouped = train_test.groupby('Transported')["Expenses"].mean()
plt.figure(figsize=(8, 5))
expenses_grouped.plot(kind='bar')

There is a clear difference in "Expenses" between the two "Transported" groups.

* HomePlanet

In [None]:
# Mean of 'Transported' for each home planet.
train_test.groupby('HomePlanet')['Transported'].mean()

* Destination

In [None]:
# Mean of 'Transported' for each destination.
train_test.groupby('Destination')['Transported'].mean()

* CryoSleep

In [None]:
# Mean of 'Transported' for each CryoSleep value.
train_test.groupby('CryoSleep')['Transported'].mean()

* Age

In [None]:
# Preview the frame with all engineered columns before selecting features.
train_test.head(3)

In [None]:
# Columns handled as numeric features.
num_cols = ['ShoppingMall','FoodCourt','RoomService','Spa','VRDeck','Expenses','Age']
# Columns handled as categorical features ('Group' holds the group size computed earlier).
cat_cols = ['CryoSleep','Deck','Side','VIP','HomePlanet','Destination',"Group"]
# Target column, kept as a list so it can be concatenated with the lists above.
transported=['Transported']

In [None]:
# Keep only the selected feature columns plus the target; all other columns are dropped.
train_test = train_test[num_cols+cat_cols+transported].copy()

In [None]:
# Preview the reduced frame.
train_test.head()

# Missing Values

In [None]:
# Numeric gaps are filled with the column mean, categorical gaps with the most frequent value.
num_imp = SimpleImputer(strategy='mean')
cat_imp = SimpleImputer(strategy='most_frequent')

In [None]:
# Fit each imputer on the combined frame and write the filled values back
# column-by-column (numeric block first, then categorical block).
numeric_filled = num_imp.fit_transform(train_test[num_cols])
train_test[num_cols] = pd.DataFrame(numeric_filled, columns=num_cols)

categorical_filled = cat_imp.fit_transform(train_test[cat_cols])
train_test[cat_cols] = pd.DataFrame(categorical_filled, columns=cat_cols)

* Age

In [None]:
# Age distribution of the training passengers, split by the 'Transported' target.
sns.histplot(data=train, x='Age', hue='Transported', element='step', kde=True,binwidth=8)
# The hue is 'Transported'; the previous title referred to a 'Survived' column
# that this plot does not use.
plt.title('Histogram of Age by Transported')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()

In [None]:
# Bucket the Age variable into ordinal groups 1..8.
# Intervals are right-closed: (-inf, 5] -> 1, (5, 10] -> 2, (10, 20] -> 3,
# (20, 30] -> 4, (30, 50] -> 5, (50, 60] -> 6, (60, 70] -> 7, (70, inf) -> 8.
age_bin_edges = [-np.inf, 5, 10, 20, 30, 50, 60, 70, np.inf]
age_bins = pd.cut(train_test['Age'], bins=age_bin_edges)
# Category codes start at 0 and are -1 for a missing Age, so adding 1 yields
# groups 1..8 and keeps 0 as the value for rows that match no interval.
train_test['Age_group'] = (age_bins.cat.codes + 1).astype('int64')

In [None]:
# Mean of 'Transported' for each age group.
train_test.groupby('Age_group')['Transported'].mean()

In [None]:
# Same column lists as before, with the new 'Age_group' added to the categorical set.
num_cols = ['ShoppingMall','FoodCourt','RoomService','Spa','VRDeck','Expenses','Age']
cat_cols = ['CryoSleep','Deck','Side','VIP','HomePlanet','Destination',"Group","Age_group"]
transported=['Transported']

# Re-order / restrict the frame to numeric features, categorical features, then target.
selected_columns = num_cols + cat_cols + transported
train_test = train_test[selected_columns].copy()

## One-Hot Encoding: Categorical Features

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot encoding of every categorical column; categories not seen
# during fit are encoded as all zeros.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_values = ohe.fit_transform(train_test[cat_cols])
temp_train = pd.DataFrame(encoded_values, columns=ohe.get_feature_names_out())

# Swap the original categorical columns for their one-hot counterparts.
train_test = pd.concat([train_test.drop(columns=cat_cols), temp_train], axis=1)

### Getting Test and Train Data

In [None]:
# Rows with a known target form the training set; the rest are the test set.
has_target = train_test['Transported'].notnull()

train = train_test[has_target].copy()
train['Transported'] = train['Transported'].astype('int')

test = train_test[~has_target].drop("Transported", axis=1)

In [None]:
# Feature matrix and target vector for modelling.
y = train['Transported']
X = train.drop(columns='Transported')

### Most correlated features

In [None]:
def get_redundant_pairs(X):
    """Return the (column, column) label pairs that are redundant in a correlation matrix.

    For every column the result contains the pair with itself (the diagonal)
    and the pairs with all columns that come before it (the lower triangle),
    each written as ``(later_column, earlier_column)``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose column labels are paired up.

    Returns
    -------
    set of tuple
        Label pairs to drop from an unstacked correlation matrix.
    """
    labels = X.columns
    return {
        (later, labels[pos])
        for idx, later in enumerate(labels)
        for pos in range(idx + 1)
    }
def get_top_abs_correlations(df, n=1):
    """Return the ``n`` largest absolute pairwise correlations between columns of ``df``.

    The function now works on the frame it is given; previously the ``df``
    argument was ignored and the global ``X`` was used instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame whose columns are correlated with each other.
    n : int, default 1
        Number of top pairs to return.

    Returns
    -------
    pandas.Series
        Absolute correlations indexed by (column, column) pairs, sorted in
        descending order, with self-pairs and mirrored duplicates removed.
    """
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr.iloc[:n]
print("Top Absolute Correlations !")
# Pass the feature matrix explicitly. After one-hot encoding the features are
# floats, so an int-only dtype selection of `train` would not contain them.
print(get_top_abs_correlations(X, 10))

* Dropping one variable from each highly correlated pair helps prevent multicollinearity.

    These features can be dropped:
    
    CryoSleep_True , VIP_False , Side_P , Destination_TRAPPIST-1e   , FoodCourt  ,HomePlanet_Earth
    
* We saw earlier that 'ShoppingMall' is unlikely to contribute to the model, so it is dropped as well.

* The "Age" variable is now represented by "Age_group", so the raw "Age" column is dropped too.


In [None]:
# Columns to remove from the feature matrix before modelling.
drop_list=['ShoppingMall',"Age",'CryoSleep_True','HomePlanet_Earth',
'VIP_False','FoodCourt','Destination_TRAPPIST-1e', "Side_P"]

In [None]:
# Remove the same columns from the training features and the test set so both keep matching columns.
X = X.drop(columns=drop_list)
test = test.drop(columns=drop_list)

In [None]:
# Remaining feature columns.
X.columns

# Model Selection

In [None]:
kfold = StratifiedKFold(n_splits=10)

# Modeling step: compare several algorithms under the same CV scheme.
# `random_state` is now actually passed to every estimator that accepts a seed,
# so the comparison is reproducible between runs.
random_state = 2

# Keep each display name next to its estimator so the "Algorithm" column of the
# results table cannot drift out of sync with the list of models.
named_classifiers = {
    "SVC": SVC(random_state=random_state),
    "RandomForest": RandomForestClassifier(random_state=random_state),
    "ExtraTrees": ExtraTreesClassifier(random_state=random_state),
    "GradientBoosting": GradientBoostingClassifier(random_state=random_state),
    "KNeighboors": KNeighborsClassifier(),  # takes no seed
    "CatBoostClassifier": CatBoostClassifier(verbose=False, random_seed=random_state),
    "XGBClassifier": XGBClassifier(random_state=random_state),
    "LGBMClassifier": LGBMClassifier(random_state=random_state),
}
classifiers = list(named_classifiers.values())

# One array of 10 fold accuracies per classifier.
cv_results = [
    cross_val_score(classifier, X, y=y, scoring="accuracy", cv=kfold, n_jobs=4)
    for classifier in classifiers
]

cv_means = [cv_result.mean() for cv_result in cv_results]
cv_std = [cv_result.std() for cv_result in cv_results]

cv_res = pd.DataFrame({
    "CrossValMeans": cv_means,
    "CrossValerrors": cv_std,
    "Algorithm": list(named_classifiers),
})

In [None]:
# Cross-validation mean accuracy and standard deviation per algorithm.
cv_res 

I will select XGBClassifier and proceed.

# Feature Selection

In [None]:
import lightgbm as lgb
from sklearn.inspection import permutation_importance

# Number of highest-ranked features to keep.
N_TOP_FEATURES = 15

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the LightGBM model on the training split only. Fitting on the full X
# would let the model see the validation rows that the permutation importance
# is scored on, which biases the importances.
model = lgb.LGBMClassifier(random_state=42)
model.fit(X_train, y_train)

# Permutation importance: drop in validation accuracy when a feature is shuffled.
result = permutation_importance(model, X_val, y_val, scoring="accuracy", n_repeats=100, random_state=42)
sorted_indices = np.argsort(result.importances_mean)[::-1]

# Show the most important features and their respective importance values.
top_features = X.columns[sorted_indices[:N_TOP_FEATURES]]
top_importances = result.importances_mean[sorted_indices[:N_TOP_FEATURES]]
for feature, importance in zip(top_features, top_importances):
    print(f"{feature}: {importance}")

In [None]:
# Names of the selected top features.
top_features

# Model Tuning with Optuna

In [None]:
# Let's move on with the top features selected above (15 features, not 20).
# Train features and test set are restricted to the same columns.
X = X[top_features]
test = test[top_features]

In [None]:
#import necessary libraries
import optuna
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier



#define objective function for hyperparameter optimization using optuna
def objective(trial):
    """Optuna objective: mean 5-fold cross-validation score of an XGBClassifier.

    Hyperparameters are sampled from the trial, an ``XGBClassifier`` is built
    from them, and it is evaluated on the global ``X`` / ``y`` with
    ``cross_val_score``.

    Float parameters use ``trial.suggest_float`` (with ``log=True`` for the
    log-scaled ones), which replaces the deprecated ``suggest_uniform`` and
    ``suggest_loguniform`` while sampling from the same ranges.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample hyperparameters.

    Returns
    -------
    float
        Mean cross-validation score; the study maximizes this value.
    """
    #define hyperparameters to optimize for
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 1, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'alpha': trial.suggest_float('alpha', 2, 5, log=True),
        'lambda': trial.suggest_float('lambda', 2, 5, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }

    #create XGBClassifier model with the sampled hyperparameters
    model = XGBClassifier(**params, random_state=0)

    #evaluate model using cross-validation
    score = cross_val_score(model, X, y, cv=5).mean()

    return score

#run hyperparameter optimization with optuna
# The objective returns a cross-validation score, so the study maximizes it; 50 trials are run here.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

In [None]:
#get best hyperparameters found by the study above
best_params = study.best_params
print(f'Best hyperparameters: {best_params}')

I found the best parameters by running the study with `n_trials=500` on my local computer. It would take too much time to do that here.

As follows:

In [None]:
# Hard-coded hyperparameters; this assignment replaces the `best_params`
# obtained from the shorter study run in this notebook.
best_params= {'n_estimators': 465, 
              'max_depth': 4, 
              'learning_rate': 0.13566308331651933,
              'subsample': 0.6217729332313746,
              'colsample_bytree': 0.9406231273240503,
              'alpha': 3.8470299829756747,
              'lambda': 2.6061951769367186,
              'min_child_weight': 6}

In [None]:
#create XGBClassifier model with best hyperparameters (fixed seed for repeatable training)
model = XGBClassifier(**best_params, random_state=0)

In [None]:
#fit on the full training features, then predict for the test rows
model.fit(X, y)
predictions = model.predict(test)

In [None]:
# Convert the numeric predictions to True/False values and store them in the
# submission frame, then write it out without the index column.
is_transported = predictions > 0.5
sample['Transported'] = is_transported
sample.to_csv('final.csv', index=False)

### Next

Hyperparameter optimization with a high number of trials can also be run for CatBoostClassifier and LGBMClassifier to try to achieve better results.