# 1.0 IMPORTS



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
pd.set_option('display.float_format', lambda x: '%.3f' % x)
plt.rcParams["figure.figsize"] = (10,6)

# 2.0 READ DATASET

In [None]:
# Load the training split and store the boolean target as 0/1 integers.
train_data0 = pd.read_csv("spaceship_train.csv").astype({'Transported': int})

In [None]:
train_data0.head(5)

In [None]:
test_data = pd.read_csv("spaceship_test.csv")

In [None]:
test_data.head()

# EDA

In [None]:
train_data = train_data0.copy() #to save the first version of my train_data

In [None]:
# Stack train and test rows so every feature-engineering step below is
# applied to both splits at once.
df = pd.concat((train_data, test_data))
df.head(3)

# We will now carry out the EDA.

In [None]:
def summary(df):
    """Print the size of *df* and return a per-column overview.

    The returned frame is indexed by the column names of *df* and has four
    columns: ``Unique`` (distinct non-null values), ``Missing`` (null count),
    ``Duplicated`` (number of fully duplicated rows, repeated on every line)
    and ``Types`` (column dtype).
    """
    n_rows, n_cols = df.shape
    print(f"Dataset has {n_cols} features and {n_rows} examples.")
    report = pd.DataFrame(index=df.columns)
    report["Unique"] = df.nunique().to_numpy()
    report["Missing"] = df.isna().sum().to_numpy()
    # Scalar: the same whole-frame duplicate count is broadcast to each row.
    report["Duplicated"] = df.duplicated().sum()
    report["Types"] = df.dtypes
    return report

summary(df)

# FEATURE ENGINEERING

### Group size - PassengerId
A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

- So we will create a `Group_size` feature from the `PassengerId` feature.

In [None]:
# Map each group id (the part of PassengerId before '_') to its member count.
group = df['PassengerId'].map(lambda pid: pid.partition('_')[0]).value_counts().to_dict()

In [None]:
# Look up, for every passenger, the size of the group they travel with.
df['Group_size'] = df['PassengerId'].map(lambda pid: group[pid.partition('_')[0]])

In [None]:
df.set_index('PassengerId', inplace=True)

In [None]:
df["Group_size"].head(5)

### HomePlanet
The planet the passenger departed from, typically their planet of permanent residence.

In [None]:
df['HomePlanet'].value_counts(normalize=True)

In [None]:
tmp = df['HomePlanet'].value_counts()
tmp 

In [None]:
# Turn the HomePlanet counts into an empirical probability distribution.
v = tmp.index  # planet labels, in the order produced by value_counts()

counts = tmp.values
p = counts / counts.sum()
p

In [None]:
# Sample a replacement planet for every missing HomePlanet, using the
# observed planet frequencies `p` as sampling weights.
home_missing = df['HomePlanet'].isna()
df.loc[home_missing, 'HomePlanet'] = np.random.choice(v, home_missing.sum(), p=p)

In [None]:
df['HomePlanet'].isnull().sum().sum()

In [None]:
# Filling with the mode, the median, nearest neighbours, etc. may introduce bias that misleads the model.
# To improve the model's ability to generalize, we filled the null values by random sampling instead.


# CRYOSLEEP
Here we assume that NaN values are 0 (not in cryosleep): if a passenger were in cryosleep, it would have been recorded as 1.

In [None]:
# Passengers with no recorded CryoSleep status are treated as not being in
# cryosleep (0). The filled column is assigned back instead of calling
# `fillna(..., inplace=True)` on `df['CryoSleep']`: that chained in-place call
# is not guaranteed to modify `df` (pandas Copy-on-Write). Filling with an
# explicit False also avoids taking the median of a boolean/object column.
df['CryoSleep'] = df['CryoSleep'].fillna(False).astype(int)

In [None]:
df['CryoSleep'].isnull().sum().sum()

# Cabin
The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

In [None]:
df['Cabin'].head(4)

In [None]:
# Cabin packs several pieces of information into one string, so split it on
# '/' into three parts. Missing cabins (stored as float NaN) become the
# placeholder triple '-1', which the following cells replace per column.
tmp = np.array([
    ['-1', '-1', '-1'] if type(cabin) == float else cabin.split('/')
    for cabin in df['Cabin']
])

In [None]:
# Store the three cabin parts as separate columns, then drop the raw string.
for position, part in enumerate(('deck', 'num', 'side')):
    df[f'Cabin_{part}'] = tmp[:, position]
df.drop(columns=['Cabin'], inplace=True)

# CABIN_DECK

In [None]:
# Replace the '-1' placeholder deck with 'F' or 'G', each drawn with
# probability 0.5.
# NOTE(review): the original note says F and G are the two most common decks
# and that this keeps the original distribution; a 50/50 draw between two
# decks only approximates that - confirm against value_counts() below.
deck_missing = df['Cabin_deck'] == '-1'
df.loc[deck_missing, 'Cabin_deck'] = np.random.choice(
    ['F', 'G'], deck_missing.sum(), p=[0.5, 0.5]
)

In [None]:
df['Cabin_deck'].value_counts()

In [None]:
df['Cabin_deck'].value_counts(normalize=True)

# Cabin_number

In [None]:
df['Cabin_num'].nunique()


In [None]:
#We have lots of unique numbers, this can help our model

In [None]:
# Cabin_num holds the strings split out of Cabin ('-1' marks a missing
# cabin), so convert it to integers first.
df['Cabin_num'] = df['Cabin_num'].astype(int)
# Compare against the integer sentinel: after the cast, a comparison with the
# string '-1' never matches, so no placeholder would ever be replaced.
missing_cabin_num = df['Cabin_num'] == -1
# Fill the few missing values with the mean of the *known* cabin numbers only,
# so the -1 placeholders do not pull the average down.
df.loc[missing_cabin_num, 'Cabin_num'] = int(df.loc[~missing_cabin_num, 'Cabin_num'].mean())

# Cabin_side

In [None]:
# As with Cabin_deck, we want to avoid bias in order to get higher test scores.

In [None]:
# Give every placeholder side a fair coin flip between 'S' and 'P', then
# encode the side numerically (S -> 0, P -> 1) and show the resulting counts.
side_missing = df['Cabin_side'] == '-1'
df.loc[side_missing, 'Cabin_side'] = np.random.choice(
    ['S', 'P'], side_missing.sum(), p=[0.5, 0.5]
)
side_codes = {'S': 0, 'P': 1}
df['Cabin_side'] = df['Cabin_side'].map(side_codes)
df['Cabin_side'].value_counts()

# Destination
To avoid introducing a bias towards a specific planet, we again handle the missing values by randomly assigning one of the three destinations.

In [None]:
df['Destination'].value_counts()

In [None]:
# Draw a destination for each missing entry using fixed weights
# (0.5 / 0.3 / 0.2) over the three destinations.
destination_missing = df['Destination'].isna()
df.loc[destination_missing, 'Destination'] = np.random.choice(
    ['TRAPPIST-1e', '55 Cancri e', 'PSO J318.5-22'],
    destination_missing.sum(),
    p=[0.5, 0.3, 0.2],
)

# AGE

In [None]:
summary(df)[3:4]

In [None]:
# We have 270 missing values. For Age we want to maintain the overall distribution of the feature so that we are not biased towards a specific age.

In [None]:
# Fill missing ages with values drawn uniformly from one standard deviation
# either side of the mean age.
age_missing = df['Age'].isna()
mean_age, std_age = df["Age"].mean(), df["Age"].std()
is_null = age_missing.sum()
rand_sample = np.random.uniform(mean_age - std_age, mean_age + std_age, size=is_null)
df.loc[age_missing, 'Age'] = rand_sample

# VIP

In [None]:
df['VIP'].value_counts()


In [None]:
# As with CryoSleep, a missing VIP flag is treated as "not VIP": a passenger
# who paid for VIP service would have been recorded as such.
# The filled column is assigned back instead of using the chained
# `df['VIP'].fillna(..., inplace=True)`, which is not guaranteed to modify
# `df` (pandas Copy-on-Write). The result is stored as 0/1 rather than
# True/False.
df['VIP'] = df['VIP'].fillna(False).astype(int)

# RoomService, FoodCourt, ShoppingMall, Spa, VRDeck

In [None]:
#For missing values in each column we will fill them with median

#This strategy is more robust to outliers and skewed data than filling with the mean !!!

In [None]:
cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
# Fill every spending column with its own median in a single step.
# `DataFrame.fillna` matches the Series of medians to columns by name, and
# assigning the result back avoids the chained `df[col].fillna(...,
# inplace=True)` pattern, which is not guaranteed to modify `df`
# (pandas Copy-on-Write).
df[cols] = df[cols].fillna(df[cols].median())

In [None]:
# Let's create a total-spending feature

In [None]:
# Total amount billed across the five spending columns.
df['total_spending'] = (
    df['RoomService']
    .add(df['FoodCourt'])
    .add(df['ShoppingMall'])
    .add(df['Spa'])
    .add(df['VRDeck'])
)

In [None]:
# Include the new feature in the list of spending columns used by the plots
# and the log transform below. The membership check keeps the cell safe to
# re-run: without it each run appends another duplicate entry.
if 'total_spending' not in cols:
    cols.append('total_spending')

In [None]:
# Plot colours given as 0-255 RGB triples and scaled to the 0-1 range.
_rgb_255 = [
    (100, 108, 116),  # nevada
    (228, 12, 33),    # red-ribbon
    (68, 68, 76),     # abbey
    (172, 28, 44),    # roof-terracotta
]
custom_colors = [tuple(channel / 255 for channel in rgb) for rgb in _rgb_255]
custom_palette = sns.color_palette(custom_colors)

In [None]:
# One row per spending column: raw values on the left, natural log on the
# right.
fig, axes = plt.subplots(len(cols), 2, figsize=(12, 14))
for (raw_ax, log_ax), col in zip(axes, cols):
    sns.histplot(data=df, x=col, ax=raw_ax, bins=20, color=custom_colors[0])
    sns.histplot(data=np.log(df[[col]]), x=col, ax=log_ax, color=custom_colors[1])
    raw_ax.set_title('Normal Distribution')
    log_ax.set_title('Logarithmic Distribution')
plt.tight_layout()

In [None]:
# When we use a logarithmic transformation we must pay attention to values that are 0 or negative: the logarithm is not defined for these values.

In [None]:
# log(0) is undefined, so zeros are swapped for 0.367 (whose natural log is
# approximately -1) before taking the log of each spending column.
for col in cols:
    zero_free = df[col].mask(df[col] == 0, 0.367)
    df[col] = np.log(zero_free)

# NAME

In [None]:
# Give passengers without a name a two-word placeholder so the split below
# still yields two parts. The filled column is assigned back instead of
# using the chained `df['Name'].fillna(..., inplace=True)`, which is not
# guaranteed to modify `df` (pandas Copy-on-Write); any NaN left behind would
# make the array below ragged.
df['Name'] = df['Name'].fillna('Unkown Unkown')
# Every entry is a string now, so split each name on the space character.
tmp = np.array(df['Name'].apply(lambda x: x.split(' ')).to_list())

In [None]:
# First column of the split array is the first name, second the last name.
df['Name_first'], df['Name_last'] = tmp[:, 0], tmp[:, 1]

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode first and last names as integer labels, fitting a fresh encoder for
# each column so every distinct name gets its own code.
for name_col in ("Name_first", "Name_last"):
    label_encoder = LabelEncoder()
    df[name_col] = label_encoder.fit_transform(df[name_col])

In [None]:
df.drop(columns="Name", inplace=True)


In [None]:
summary(df)

In [None]:
#As you can see we completed missing values part

In [None]:
# One-hot encode the remaining categorical columns as 0/1 integer columns;
# the dummy columns are appended and the original columns removed.
categorical_features = ['HomePlanet', 'Destination', 'Cabin_deck']
df = pd.get_dummies(df, columns=categorical_features, dtype=int)

In [None]:
#Now split back the train-test data

In [None]:
# The training rows were concatenated first, so split by position.
n_train_rows = train_data.shape[0]
train_df = df.iloc[:n_train_rows]
test_df = df.iloc[n_train_rows:]

# MODELLING

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: every training column except the target.
X = train_df.drop(columns='Transported')


In [None]:
# Target vector, cast to integers.
y = train_df.loc[:, 'Transported'].astype(int)


In [None]:
# Hold out 15% of the labelled rows for evaluation; the seed is fixed so the
# split is repeatable.
holdout_split = train_test_split(X, y, random_state=42, test_size=0.15)
X_train, X_test, y_train, y_test = holdout_split

In [None]:
import lazypredict
from lazypredict.Supervised import LazyClassifier

In [None]:
# Fit LazyClassifier on the training split, score on the hold-out split and
# print the resulting table of models.
clf = LazyClassifier(custom_metric=None, ignore_warnings=True, verbose=0)
lazy_results = clf.fit(X_train, X_test, y_train, y_test)
models, predictions = lazy_results
print(models)

In [None]:
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier,RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [None]:
# Candidate estimators, keyed by the short name used for the matching
# hyper-parameter grid.
_estimators = {
    'adaboost': AdaBoostClassifier(random_state=1234),
    'xgboost': XGBClassifier(random_state=1234),
    'catboost': CatBoostClassifier(random_state=1234),
    'gradientboost': GradientBoostingClassifier(random_state=1234),
    'ligthgbm': LGBMClassifier(random_state=1234),
    'randomforest': RandomForestClassifier(random_state=1234),
    'logistic': LogisticRegression(random_state=1234),
    'knn': KNeighborsClassifier(),
}
# Each estimator is wrapped in its own pipeline with a fresh StandardScaler.
pipelines = {
    name: make_pipeline(StandardScaler(), estimator)
    for name, estimator in _estimators.items()
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grids for GridSearchCV, keyed like `pipelines`. Parameter
# names use the "<pipeline step>__<parameter>" convention, where the step name
# is the lower-cased estimator class name generated by make_pipeline.
grid = {
    'adaboost': {
        'adaboostclassifier__n_estimators': [50, 100, 150],
        'adaboostclassifier__learning_rate': [0.01, 0.05, 0.1],
    },
    'xgboost': {
        'xgbclassifier__n_estimators': [50, 100, 150],
        'xgbclassifier__learning_rate': [0.01, 0.05],
        'xgbclassifier__max_depth': [3, 4],
        'xgbclassifier__gamma': [0.1, 0.2],
        'xgbclassifier__subsample': [0.6, 0.8],
    },
    'catboost': {
        'catboostclassifier__learning_rate': [0.01, 0.05, 0.1, 0.5],
        'catboostclassifier__depth': [2, 3, 4],
        'catboostclassifier__l2_leaf_reg': [1, 2, 3],
    },
    'gradientboost': {
        'gradientboostingclassifier__n_estimators': [50, 100],
        'gradientboostingclassifier__learning_rate': [0.01, 0.05, 0.1],
        'gradientboostingclassifier__max_depth': [3, 4],
        'gradientboostingclassifier__min_samples_split': [2, 5],
    },
    'ligthgbm': {
        'lgbmclassifier__n_estimators': [50, 100, 150],
        'lgbmclassifier__learning_rate': [0.01, 0.05, 0.1],
        'lgbmclassifier__max_depth': [3, 4],
        'lgbmclassifier__min_child_samples': [5, 10, 20],
    },
    'randomforest': {
        'randomforestclassifier__n_estimators': [100, 200],
        'randomforestclassifier__max_depth': [None, 4, 5],
        'randomforestclassifier__min_samples_split': [2, 5, 10],
    },
    'logistic': {
        'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'logisticregression__penalty': ['l1', 'l2'],
        # The default 'lbfgs' solver does not support the 'l1' penalty, so
        # every l1 candidate would fail to fit. 'liblinear' handles both
        # penalties, which makes the whole grid usable.
        'logisticregression__solver': ['liblinear'],
    },
    'knn': {
        'kneighborsclassifier__n_neighbors': [3, 5, 7, 9],
        'kneighborsclassifier__weights': ['uniform', 'distance'],
    },
}


In [None]:

# Run a 5-fold, accuracy-scored grid search for every pipeline and keep the
# fitted search objects keyed by algorithm name.
fit_models = {}
for algo, pipeline in pipelines.items():
    print(f'Training the {algo} model.')
    search = GridSearchCV(pipeline, grid[algo], n_jobs=-1, cv=5, scoring="accuracy")
    # fit() returns the fitted search object itself.
    fit_models[algo] = search.fit(X_train, y_train)

In [None]:
# Remove the target column from the test features before predicting.
# errors="ignore" keeps the cell safe to re-run: once the column is gone, a
# second run would otherwise raise a KeyError.
test_df = test_df.drop(columns="Transported", errors="ignore")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score


In [None]:
# Score every fitted search on the hold-out split.
for algo, model in fit_models.items():
    yhat = model.predict(X_test)
    accuracy, precision, recall = (
        metric(y_test, yhat)
        for metric in (accuracy_score, precision_score, recall_score)
    )
    print(f'Metrics for {algo}: accuracy- {accuracy}, recall- {recall}, precision- {precision}')

# FINAL MODEL SELECTION

In [None]:
yhat_test = fit_models['catboost'].predict(test_df)


In [None]:
# Build the submission from named columns rather than transposing a list of
# rows, which produced object-typed columns. The id column is written as
# 'PassengerId', the spelling used by the input data, not 'PassengerID'.
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'].to_numpy(),
    # ravel() flattens the predictions to one value per passenger.
    'Transported': np.asarray(yhat_test).ravel(),
})

In [None]:
submission.head()

In [None]:
# Convert the 0/1 predictions to True/False.
submission["Transported"] = submission["Transported"].eq(1)


In [None]:
submission.head()

In [None]:
# Write the predictions to disk without the row index.
submission_path = 'submission_SpaceShip-catboost-v1.csv'
submission.to_csv(submission_path, index=False)
print("Your submission was successfully saved!")

In [None]:
### CatBoost gave the highest score, but we still need to look for better parameters. A grid search with CatBoost would take a long time,
# so I will try to get a better score with XGBoost.

In [None]:
# Wider XGBoost search space for the final tuning run. Keys follow the
# "<pipeline step>__<parameter>" naming of the 'xgboost' pipeline.
param_grid = {
    'xgbclassifier__n_estimators': [200, 300, 400],
    # The last entry was written as "0,2" (decimal comma), which added the two
    # unintended learning rates 0 and 2; the intended value is 0.2.
    'xgbclassifier__learning_rate': [0.01, 0.045, 0.05, 0.055, 0.1, 0.2],
    'xgbclassifier__max_depth': [3, 4],
    'xgbclassifier__gamma': [0.05, 0.1, 0.15],
    'xgbclassifier__subsample': [0.4, 0.6, 0.8],
}

In [None]:
# 5-fold, accuracy-scored grid search over the XGBoost pipeline.
xgb_pipeline = pipelines["xgboost"]
final_model = GridSearchCV(
    xgb_pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

In [None]:
final_model.fit(X,y)

In [None]:
final_model.best_score_

In [None]:
final_model.best_params_

In [None]:
# XGBoost model with hand-set hyper-parameters.
xgb_settings = dict(
    gamma=0.05,
    learning_rate=0.05,
    max_depth=3,
    n_estimators=300,
    subsample=0.6,
)
final_model = XGBClassifier(**xgb_settings)

In [None]:
final_model.fit(X,y)

In [None]:
# I tried many values on a Colab TPU in another copy of the notebook; these are the values that gave me the best score.

In [None]:
# CatBoost hyper-parameters found in a separate tuning run; verbose=False
# silences the per-iteration training log.
best_params = dict(
    learning_rate=0.019549356549743555,
    depth=4,
    l2_leaf_reg=6.238880563296214,
    border_count=113,
    verbose=False,
)

final_model = CatBoostClassifier(**best_params)

In [None]:
final_model.fit(X,y)

In [None]:
yhat_test = final_model.predict(test_df)

In [None]:
# Build the submission from named columns rather than transposing a list of
# rows, which produced object-typed columns. The id column is written as
# 'PassengerId', the spelling used by the input data, not 'PassengerID'.
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'].to_numpy(),
    # ravel() flattens the predictions to one value per passenger.
    'Transported': np.asarray(yhat_test).ravel(),
})

In [None]:
# Convert the 0/1 predictions to True/False and preview the result.
submission["Transported"] = submission["Transported"].eq(1)
submission.head()

In [None]:
# Write the final predictions to disk without the row index.
submission_path = 'submission_SpaceShip-catboost-v6.csv'
submission.to_csv(submission_path, index=False)
print("Your submission was successfully saved!")