# **LIB**

In [None]:
pip install feature-engine

In [None]:
pip install catboost

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from feature_engine import transformation as vt 
from feature_engine.imputation import MeanMedianImputer
from catboost import CatBoostClassifier, Pool
from catboost import CatBoostClassifier

# **DATA TRAIN**

## **Data Preprocessing and Feature Engineering**

In [None]:
# URL of the training CSV; `train` holds the frame as read from that file.
train_data = (
    'https://gist.githubusercontent.com/YogaCr/69cdde98515af866e26139712f891969'
    '/raw/6ea9682e2d5175b05575a6eddbbafa371ae6a702/train.csv'
)
train = pd.read_csv(filepath_or_buffer=train_data)

In [None]:
# Check missing values: per-column null count and its share of all rows (in %),
# displayed transposed so each feature is one column of the table.
null_counts = train.isnull().sum()
null_summary = pd.DataFrame({
    'missing_count': null_counts,
    'missing_pct': null_counts / train.shape[0] * 100,
})
null_summary.T


Persentase missing values sekitar ±2% pada setiap variabel, kecuali PassengerId dan Transported.

### **EDA**

In [None]:
# Working copy of the training data for EDA / feature engineering.
# It is taken from the already-loaded `train` frame instead of calling
# pd.read_csv(train_data): that call replaced the URL string stored in
# `train_data` with a DataFrame, so re-running the cell failed, and it
# downloaded the same CSV a second time.
train_data = train.copy()

In [None]:
# Split PassengerId on '_' into two integer columns.
id_parts = train_data["PassengerId"].str.split('_', expand=True).astype('int')
train_data[["G_Passenger", 'N_Passenger']] = id_parts

# Use PassengerId as the row index from here on.
train_data = train_data.set_index('PassengerId')

# Missing cabins get the placeholder 'T/0/P' so the split below always works.
train_data['Cabin'] = train_data['Cabin'].fillna('T/0/P')

# Split Cabin on '/' into its three parts: Deck, Num and Side.
cabin_parts = train_data['Cabin'].str.split('/', expand=True)
train_data[['Deck', 'Num', 'Side']] = cabin_parts

# Cabin (now split) and Name are not used as features.
train_data = train_data.drop(columns=['Cabin', 'Name'])

### **Feature Engineering**

In [None]:
# Fill missing values in the categorical columns with each column's most
# frequent value. One imputer is fitted per column; the additional
# fit_transform calls whose results were thrown away have been removed.
for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']:
    imp = SimpleImputer(strategy="most_frequent")
    # fit_transform returns an (n_rows, 1) array; flatten it into one column.
    train_data[col] = imp.fit_transform(train_data[[col]]).ravel()

In [None]:
# Central-tendency statistics of Age, used to choose an imputation strategy.
age_column = train_data['Age']
Age_Stat = {
    'Mean': age_column.mean(),
    'Median': age_column.median(),
    'Mode': age_column.mode()[0],  # first value if several modes tie
}
Age_Stat

In [None]:
# Fill missing ages with the median age of this frame.
# The imputer is fitted once; the earlier duplicate fit_transform call whose
# result was discarded has been removed.
imp = SimpleImputer(strategy="median")
train_data['Age'] = imp.fit_transform(train_data[['Age']]).ravel()

In [None]:
# Fill missing values in the five spend columns with 0.
for spend_col in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
    imp = SimpleImputer(strategy='constant', fill_value=0)
    train_data[spend_col] = imp.fit_transform(train_data[[spend_col]])

In [None]:
# Total spend per passenger: the five spend columns added up in their original order.
total_spent = train_data['RoomService']
for amenity in ('FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
    total_spent = total_spent + train_data[amenity]
train_data['total_spent'] = total_spent

In [None]:
# Bucket passengers into age groups.
# include_lowest=True places Age == 0 in the first bin ('Baby'). Without it the
# right-closed bin (0, 5] excludes 0, those rows come back as NaN and were only
# rescued by a later constant fill.
age_group = pd.cut(
    train_data['Age'],
    bins=[0, 5, 12, 18, 50, 150],
    labels=['Baby', 'Child', 'Teen', 'Adult', 'Elderly'],
    include_lowest=True,
)
# Store plain string labels. Any value still unbinned (missing or outside
# 0-150) keeps the previous fallback label 'Baby'.
train_data['AgeGroup'] = age_group.astype(object).where(age_group.notna(), 'Baby')

# Remaining nulls per column after all imputations.
train_data.isnull().sum()

In [None]:
categorical_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side', 'Num', 'AgeGroup']

# Integer-encode every categorical column via its string representation.
# Each fitted encoder is kept in `label_encoders` (column name -> LabelEncoder)
# so the category -> code mapping learned on the training data can be inspected
# or applied to other data; previously the encoder was discarded on every
# iteration and only the last one survived in `le`.
# NOTE(review): 'Num' comes from a string split, so its codes follow the sorted
# order of the strings ('10' before '2'), not numeric order - confirm intended.
label_encoders = {}
for i in categorical_cols:
    print(i)
    le = LabelEncoder()
    train_data[i] = le.fit_transform(train_data[i].astype(str))
    label_encoders[i] = le

In [None]:
# Convert the boolean target to 0/1 integers.
# An explicit cast states the target dtype directly instead of depending on
# replace({True: 1, False: 0}) to end up with an integer column.
train_data['Transported'] = train_data['Transported'].astype('int64')

train_data.head(5)

## **CatBoost**

In [None]:
# Target vector is 'Transported'; the feature matrix is every other column.
y = train_data['Transported']
x = train_data.drop(columns=['Transported'])

x.columns

In [None]:
# CatBoost classifier: 3000 iterations, Accuracy as evaluation metric, verbose=0.
# CatBoostClassifier is already imported in the notebook's import cell.
catboost_params = {
    'iterations': 3000,
    'eval_metric': 'Accuracy',
    'verbose': 0,
}
model = CatBoostClassifier(**catboost_params)

In [None]:
# Fit the classifier on all training rows (no hold-out split is made here).
model.fit(X=x, y=y)


In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on the same rows the model was fitted on, i.e. training accuracy,
# not an estimate of performance on unseen data.
pred = model.predict(x)
train_accuracy = accuracy_score(y_true=y.values, y_pred=pred)
print(train_accuracy)

# **DATA TEST**

## **Data Preprocessing and Feature Engineering**

In [None]:
# URL of the test CSV; `x_test` holds the frame read from that file.
test = (
    'https://gist.githubusercontent.com/YogaCr/69cdde98515af866e26139712f891969'
    '/raw/6ea9682e2d5175b05575a6eddbbafa371ae6a702/test.csv'
)
x_test = pd.read_csv(filepath_or_buffer=test)

In [None]:
# Check missing values: per-column null count and its share of all rows (in %),
# displayed transposed so each feature is one column of the table.
test_null_counts = x_test.isnull().sum()
test_null_summary = pd.DataFrame({
    'missing_count': test_null_counts,
    'missing_pct': test_null_counts / x_test.shape[0] * 100,
})
test_null_summary.T

Persentase missing values sekitar ±2% pada setiap variabel data test, kecuali PassengerId.

### **EDA**

In [None]:
# Preview the first five rows of the test frame.
x_test.head(n=5)

In [None]:
# Split PassengerId on '_' into two integer columns.
test_id_parts = x_test["PassengerId"].str.split('_', expand=True).astype('int')
x_test[["G_Passenger", 'N_Passenger']] = test_id_parts

# Use PassengerId as the row index from here on.
x_test = x_test.set_index('PassengerId')

# Missing cabins get the placeholder 'T/0/P' so the split below always works.
x_test['Cabin'] = x_test['Cabin'].fillna('T/0/P')

# Split Cabin on '/' into its three parts: Deck, Num and Side.
test_cabin_parts = x_test['Cabin'].str.split('/', expand=True)
x_test[['Deck', 'Num', 'Side']] = test_cabin_parts

# Cabin (now split) and Name are not used as features.
x_test = x_test.drop(columns=['Cabin', 'Name'])

### **Feature Engineering**

In [None]:
# Fill missing values in the categorical columns with each column's most
# frequent value. One imputer is fitted per column; the additional
# fit_transform calls whose results were thrown away have been removed.
# NOTE(review): the imputers are fitted on the test frame itself, so the fill
# values come from test data rather than from the training data - confirm intended.
for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']:
    imp = SimpleImputer(strategy="most_frequent")
    # fit_transform returns an (n_rows, 1) array; flatten it into one column.
    x_test[col] = imp.fit_transform(x_test[[col]]).ravel()

In [None]:
# Central-tendency statistics of Age in the test frame.
test_age_column = x_test['Age']
Age_Stat = {
    'Mean': test_age_column.mean(),
    'Median': test_age_column.median(),
    'Mode': test_age_column.mode()[0],  # first value if several modes tie
}
Age_Stat

In [None]:
# Fill missing ages with the median age of this frame.
# The imputer is fitted once; the earlier duplicate fit_transform call whose
# result was discarded has been removed.
imp = SimpleImputer(strategy="median")
x_test['Age'] = imp.fit_transform(x_test[['Age']]).ravel()

In [None]:
# Fill missing values in the five spend columns with 0.
for spend_col in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
    imp = SimpleImputer(strategy='constant', fill_value=0)
    x_test[spend_col] = imp.fit_transform(x_test[[spend_col]])

In [None]:
# Total spend per passenger: the five spend columns added up in their original order.
test_total_spent = x_test['RoomService']
for amenity in ('FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
    test_total_spent = test_total_spent + x_test[amenity]
x_test['total_spent'] = test_total_spent

In [None]:
# Bucket passengers into age groups.
# include_lowest=True places Age == 0 in the first bin ('Baby'). Without it the
# right-closed bin (0, 5] excludes 0, those rows come back as NaN and were only
# rescued by a later constant fill. The stray mid-cell `x_test['AgeGroup']`
# expression, which displayed nothing, has been removed.
test_age_group = pd.cut(
    x_test['Age'],
    bins=[0, 5, 12, 18, 50, 150],
    labels=['Baby', 'Child', 'Teen', 'Adult', 'Elderly'],
    include_lowest=True,
)
# Store plain string labels. Any value still unbinned (missing or outside
# 0-150) keeps the previous fallback label 'Baby'.
x_test['AgeGroup'] = test_age_group.astype(object).where(test_age_group.notna(), 'Baby')

# Remaining nulls per column after all imputations.
x_test.isnull().sum()

In [None]:
categorical_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side', 'Num', 'AgeGroup']

# Integer-encode every categorical column via its string representation.
# NOTE(review): a fresh LabelEncoder is fitted on the test frame for each column,
# so the codes depend only on the values present in the test data and are not
# guaranteed to match the codes the model saw during training (e.g. 'Num').
# Consider applying encoders fitted on the training data instead.
for i in categorical_cols:
    print(i)
    arr = np.asarray(x_test[i]).astype(str)
    le = LabelEncoder().fit(arr)
    x_test[i] = le.transform(x_test[i].astype(str))

x_test.head(5)

## **CatBoost**

In [None]:
# Predict on the prepared test features and store the result as booleans,
# using the same index as x_test.
y_pred = model.predict(x_test)
transported_flags = y_pred.astype(bool)
sub = pd.DataFrame({'Transported': transported_flags}, index=x_test.index)
sub.head()

# **PREDICTION**

In [None]:
# `sub` is indexed by PassengerId (the index of x_test), so the ids are read
# straight from it. Downloading test.csv again just to recover that column was
# a redundant network call and paired ids with predictions purely by row order.
sub = sub.reset_index()

# Series of passenger ids in the same row order as the predictions.
sub1 = sub['PassengerId']

In [None]:
# One-column frames for the id and the prediction, ready to be placed side by side.
PID = sub1.to_frame(name='PassengerId')
Trans = sub.loc[:, ['Transported']]

In [None]:
# Put ids and predictions side by side and renumber the rows from 0.
submission = pd.concat([PID, Trans], axis=1).reset_index(drop=True)

submission

In [None]:
# Write the submission file without the row index.
submission.to_csv('submission.csv', index=False)