In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the two competition CSV files: train.csv first, then test.csv.
train_set, test_set = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/spaceship-titanic/train.csv',
        '/kaggle/input/spaceship-titanic/test.csv',
    )
)

In [3]:
# Column dtypes and non-null counts of the training data.
train_set.info()

All features have missing values, except `PassengerId`.

In [4]:
# Column dtypes and non-null counts of the test data.
test_set.info()

In [5]:
# Peek at the first rows of the training data.
train_set.head()

Before diving deeper into the dataset, I will:

- create `Deck` and `Side` from `Cabin`
- create `GroupSize` from `PassengerId`
- remove `Name`, `PassengerId`, `Cabin`

In [6]:
# Create 'Deck' and 'Side' from 'Cabin'.
# 'Cabin' is split on '/'; the first piece becomes 'Deck' and the third piece
# becomes 'Side'. Rows with a missing cabin stay missing in both new columns.
cabin_parts = train_set["Cabin"].str.split("/", expand=True)

# Guarantee that positions 0..2 exist (filled with NaN) even if no cabin string
# contains both separators, so the two lookups below cannot raise a KeyError.
cabin_parts = cabin_parts.reindex(columns=[0, 1, 2])

train_set["Deck"] = cabin_parts[0]
train_set["Side"] = cabin_parts[2]

In [7]:
# Create 'GroupSize' from 'PassengerId'.
# NOTE(review): assumes ids have the form 'gggg_pp' (group id, then the
# passenger's number inside that group) -- confirm against the data description.
# Under that assumption the text after '_' is a position, not a size, so the
# size is obtained by counting how many passengers share the same group id.
# Only passengers present in train_set are counted.
group_id = train_set['PassengerId'].str.split('_').str[0]
train_set['GroupSize'] = group_id.map(group_id.value_counts())

In [8]:
# Remove 'Name', 'PassengerId' and 'Cabin'; they are not used as model inputs.
train_set = train_set.drop(columns=['Name', 'PassengerId', 'Cabin'])

# **Brief Look at train and test sets**

In [9]:
# Re-check dtypes and non-null counts now that columns have been added and dropped.
train_set.info()

**Observations:**

- Categorical variables:

     `HomePlanet`,`CryoSleep`, `Destination`, `VIP`,  `Deck`, `Side`, `GroupSize`

- Numerical variables:

     `Age`, `RoomService`, `FoodCourt`, `ShoppingMall`, `Spa`, `VRDeck`

In [10]:
# Column groups used by the exploration cells: features treated as categories
# and features treated as numbers.
cat_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side', 'GroupSize']
num_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [11]:
# Summary statistics of the training data.
train_set.describe()

# **Visualization and Exploration**

1. Histogram for labels 

2. Histogram for each categorical variable

3. Histogram for each numerical variable

4. Correlation for each numerical variable

In [12]:
# Histogram for labels
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns


# Encode the target as 1.0 (transported) / 0.0 (not transported) and plot how
# often each value occurs.
Transported_int = np.where(train_set['Transported'] == True, 1.0, 0.0)
sns.displot(Transported_int)

**Observation:**

The two labels are roughly balanced: each class has about the same number of passengers.

In [13]:
# Count plot for each categorical variable, split by 'Transported'.
# The 4x2 grid is filled in row-major order; panels without a feature stay empty.
fig, axes = plt.subplots(4, 2, figsize=(40,20))

for feature, panel in zip(cat_features, axes.flat):
    sns.countplot(x=feature, hue='Transported', data=train_set, ax=panel)

In [14]:
# For every categorical feature, print the share of each 'Transported' value
# within each category (the shares inside one category sum to 1).
for feature in cat_features:
    transported_share = (
        train_set
        .groupby([feature])['Transported']
        .value_counts(normalize=True)
    )
    print(transported_share)

In [15]:
# Histogram for each numerical variable, stacked by 'Transported'

fig, axes = plt.subplots(3, 2, figsize=(20,10))

# The theme is the same for every panel, so it is set once before drawing.
sns.set(style="darkgrid")
for feature, panel in zip(num_features, axes.flat):
    sns.histplot(
        data=train_set,
        x=feature,
        hue="Transported",
        bins=50,
        multiple='stack',
        ax=panel,
    )

In [16]:
# Correlation between the numeric and boolean columns.
# Columns of any other dtype are excluded explicitly: recent pandas versions
# raise instead of silently skipping them when corr() meets non-numeric data.
# NOTE(review): 'Transported' is presumably a bool column and therefore kept -- confirm.
corr_matrix = train_set.select_dtypes(include=['number', 'bool']).corr()
corr_matrix

In [17]:
# Correlation of every column in the matrix with the target.
corr_matrix["Transported"]

**Observations:**

- `RoomService`, `Spa`, `VRDeck` are correlated with `Transported`.
-  Notice the correlation between `VRDeck`, `FoodCourt`, `Spa`.

# **Split the train_set into a smaller training set and a validation set**

Split the set into two groups such that both have the same distribution of `Transported`.

In [18]:
# Split the train_set into a smaller train_set and a test set
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 shuffle split; the fixed seed keeps it reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, val_index = next(split.split(train_set, train_set['Transported']))

# split() yields positional row numbers, so they are looked up with iloc; this
# stays correct even if train_set's index is not the default 0..n-1 range.
strain_set = train_set.iloc[train_index]
val_set = train_set.iloc[val_index]

# Separate the target from the features in both parts.
strain_y = strain_set['Transported']
strain_x = strain_set.drop('Transported', axis=1)

val_y = val_set['Transported']
val_x = val_set.drop('Transported', axis=1)

# **Pipeline: Data cleaning + Addition of new features**

Implement the following tasks:

1.  categorical features: 
        
       - group `GroupSize` into 5 groups 
       
       - group `Age` into 6 groups
       
       - replace missing values with respective most frequent categories
        
       - apply one-hot encoding for categorical features
2.  numerical features:

       - replace missing values with respective medians
      
       - apply standardization
    

**Note: the following code requires simplification!**

In [52]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# Column groups used by the preprocessing pipeline. Unlike the lists used for
# exploration, 'Age' is listed with the categorical features here because it is
# converted into age buckets before encoding.
cat_features = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'Deck', 'Side', 'GroupSize']
num_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

class GroupSizeSplits(BaseEstimator, TransformerMixin):
    """Replace the ``GroupSize`` column with a coarse bucket label.

    After casting the column to int the buckets are:
    below 2 -> 0, 2 to 4 -> 1, 5 to 6 -> 2, exactly 7 -> 3, 8 and above -> 4.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``GroupSize`` column holds the bucket label."""
        sizes = np.array(X["GroupSize"], dtype='int')
        # Lower edges of buckets 1..4; digitize returns 0 for values below the
        # first edge and k for values in [edge[k-1], edge[k]).
        lower_edges = [2, 5, 7, 8]
        buckets = np.digitize(sizes, lower_edges).astype('object')
        X_out = X.copy()
        X_out["GroupSize"] = buckets
        return X_out

class AgeSplits(BaseEstimator, TransformerMixin):
    """Replace the ``Age`` column with an age-bucket label.

    Buckets: up to 18 -> 0, (18, 28] -> 1, (28, 38] -> 2, (38, 48] -> 3,
    (48, 58] -> 4, above 58 -> 5.

    A missing age stays missing (NaN) instead of being counted in bucket 0, so
    that a later imputation step can fill it in.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``Age`` column holds the bucket label or NaN."""
        age = np.asarray(X["Age"], dtype=float)
        # Inclusive upper edges of buckets 0..4; with right=True digitize returns
        # k for values in (edge[k-1], edge[k]] and 5 for values above the last edge.
        upper_edges = [18, 28, 38, 48, 58]
        labels = np.digitize(age, upper_edges, right=True).astype('object')
        # digitize sorts NaN after every edge (bucket 5); restore it as missing.
        labels[np.isnan(age)] = np.nan
        X_new = X.copy()
        X_new["Age"] = labels
        return X_new

class ImputerOneHot_df(BaseEstimator, TransformerMixin):
    """Impute missing categories with the most frequent one, then one-hot encode.

    The imputer and the encoder are learned in ``fit`` and only applied in
    ``transform``, so data transformed later (e.g. a validation split) is filled
    with the same values and gets exactly the same output columns as the data
    used for fitting.

    Attributes set by ``fit``:
        imputer_: the fitted ``SimpleImputer``.
        onehot_encoder_: the fitted ``OneHotEncoder``.
        column_names: output column names in the form ``'<feature>:<category>'``.
    """

    def fit(self, X, y=None):
        """Learn the fill values and the category set of every column of ``X``."""
        self.imputer_ = SimpleImputer(strategy="most_frequent")
        # Categories not seen during fit are encoded as all zeros rather than
        # raising, so transforming new data cannot fail on a new category.
        self.onehot_encoder_ = OneHotEncoder(handle_unknown="ignore")
        X_impute = self.imputer_.fit_transform(X)
        self.onehot_encoder_.fit(X_impute)
        self.column_names = [
            f'{name}:{cat}'
            for name, cats in zip(X.columns, self.onehot_encoder_.categories_)
            for cat in cats
        ]
        return self

    def transform(self, X):
        """Return the imputed, one-hot encoded ``X`` as a DataFrame.

        The result has a fresh 0..n-1 index; the index of ``X`` is not kept.
        """
        X_impute = self.imputer_.transform(X)
        X_onehot = self.onehot_encoder_.transform(X_impute).toarray()
        X_new = pd.DataFrame(X_onehot, columns=self.column_names)
        return X_new

class NumTransformer(BaseEstimator, TransformerMixin):
    """Impute missing numbers with the column median, then standardize.

    Medians, means and standard deviations are learned in ``fit`` and only
    applied in ``transform``, so data transformed later (e.g. a validation split)
    is scaled with the statistics of the data used for fitting.

    Attributes set by ``fit``:
        imputer_: the fitted ``SimpleImputer``.
        std_scaler_: the fitted ``StandardScaler``.
        column_names: names of the columns seen during ``fit``.
    """

    def fit(self, X, y=None):
        """Learn the per-column median, mean and standard deviation of ``X``."""
        self.imputer_ = SimpleImputer(strategy="median")
        self.std_scaler_ = StandardScaler()
        X_impute = self.imputer_.fit_transform(X)
        self.std_scaler_.fit(X_impute)
        self.column_names = list(X.columns)
        return self

    def transform(self, X):
        """Return the imputed, standardized ``X`` as a DataFrame.

        The result has a fresh 0..n-1 index; the index of ``X`` is not kept.
        """
        X_impute = self.imputer_.transform(X)
        X_scaled = self.std_scaler_.transform(X_impute)
        X_new = pd.DataFrame(X_scaled, columns=X.columns)
        return X_new
    
class TotalTransformer(BaseEstimator, TransformerMixin):
    """Full preprocessing: numerical and categorical features side by side.

    Parameters:
        num_features: names of the columns handled by ``NumTransformer``.
        cat_features: names of the columns handled by the categorical pipeline
            (``GroupSizeSplits`` -> ``AgeSplits`` -> ``ImputerOneHot_df``).

    The sub-transformers are created and fitted in ``fit``; ``transform`` only
    applies them and never calls their ``fit`` again.

    Attributes set by ``fit``:
        cat_pipeline_: the fitted categorical pipeline.
        num_transform_: the fitted ``NumTransformer``.
    Attribute set by ``transform``:
        column_names: columns of the most recently returned frame.
    """

    def __init__(self, num_features, cat_features):
        self.num_features = num_features
        self.cat_features = cat_features

    def fit(self, X, y=None):
        """Build both sub-transformers and fit them on the matching columns of ``X``."""
        self.cat_pipeline_ = Pipeline([
            ('groupsize_encoder', GroupSizeSplits()),
            ('age_encoder', AgeSplits()),
            ('imputeronehot_encoder', ImputerOneHot_df()),
        ])
        self.num_transform_ = NumTransformer()
        self.cat_pipeline_.fit(X[self.cat_features])
        self.num_transform_.fit(X[self.num_features])
        return self

    def transform(self, X):
        """Return the numerical block followed by the categorical block."""
        X_cat_transform = self.cat_pipeline_.transform(X[self.cat_features])
        X_num_transform = self.num_transform_.transform(X[self.num_features])
        # Both blocks come back with a fresh 0..n-1 index, so the column-wise
        # concat lines the rows up by position.
        X_new = pd.concat([X_num_transform, X_cat_transform], axis=1)
        self.column_names = X_new.columns
        return X_new


# Run the preprocessing on the training split (fit + transform) and on the
# validation split (transform only).
transformer = TotalTransformer(num_features=num_features, cat_features=cat_features)
strain_prepared = transformer.fit(strain_x).transform(strain_x)
val_prepared = transformer.transform(val_x)

# **Feature Combination / Addition of New Feature(s)**

1. Create `AmountBilled`

2. Create boolean features for `RoomService`, `FoodCourt`, `ShoppingMall`, `Spa`, `VRDeck`  (Excluded)


In [60]:
# Create `AmountBilled` and remove `RoomService`, `FoodCourt`, `ShoppingMall`, `Spa`, `VRDeck`
lux = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# NOTE(review): these columns were already standardized by the preprocessing
# step, so this is a sum of scaled values rather than a raw amount -- confirm
# that this is intended.
strain_prepared = (
    strain_prepared
    .assign(AmountBilled=strain_prepared[lux].sum(axis=1))
    .drop(columns=lux)
)

In [61]:
# Same feature for the validation split: add the row-wise sum of the `lux`
# columns as `AmountBilled`, then drop the individual columns.
val_prepared = (
    val_prepared
    .assign(AmountBilled=val_prepared[lux].sum(axis=1))
    .drop(columns=lux)
)

# **Data Visualization using Different Dimensionality Reduction Techniques**

    - PCA
    - RBF kernel PCA
    - Locally linear embedding (LLE)

In [62]:
from sklearn.decomposition import PCA

# Project the prepared training features onto two principal components so they
# can be drawn in a 2-D scatter plot.
pca = PCA(n_components=2)
X2D = pca.fit_transform(strain_prepared)
# Share of the total variance captured by each of the two components.
pca.explained_variance_ratio_

In [63]:
# Colour every training point by its label: red where 'Transported' is True,
# blue otherwise. `cols` is reused by the later projection plots.
cols = np.where(strain_y == True, 'red', 'blue')
plt.scatter(x=X2D[:,0], y=X2D[:,1], color = cols)

In [64]:
# Try kernel PCA
from sklearn.decomposition import KernelPCA

# Two-component kernel PCA with an RBF kernel (gamma=0.1) on the prepared
# training features, drawn with the same label colours as the PCA plot.
rbf_pca = KernelPCA(n_components=2, kernel="rbf", gamma=0.1)
X2D_kernel = rbf_pca.fit_transform(strain_prepared)
plt.scatter(x=X2D_kernel[:,0], y=X2D_kernel[:,1], color = cols)

In [65]:
# Try Locally Linear Embedding
from sklearn.manifold import LocallyLinearEmbedding

# Two-component locally linear embedding using 10 neighbours per point.
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10)
X2D_lle = lle.fit_transform(strain_prepared)

# Same label colours as the PCA plot.
plt.scatter(x=X2D_lle[:,0], y=X2D_lle[:,1], color = cols)

# **Train the model**

1. Logistic Regression
2. Random Forest
3. Neural Network

In [66]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the prepared training split (the iteration cap
# is raised to 300) and predict the validation split.
log_reg = LogisticRegression(max_iter=300)
log_reg.fit(strain_prepared, strain_y)
y_pred = log_reg.predict(val_prepared)

Accuracy is used as the metric because the two classes are almost equally frequent.

In [67]:
from sklearn.metrics import accuracy_score

# Share of validation passengers whose label was predicted correctly.
accuracy_score(val_y, y_pred)

In [68]:
# Random Forest
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Only the tree depth is tuned; every other forest setting is fixed.
param_grid = [
    {
    'max_depth':[4,6,8,10,12]
    }
]
# random_state pins the bootstrap samples and the feature sub-sampling so the
# cross-validation, out-of-bag and validation scores are reproducible across
# runs (42 matches the seed used for the train/validation split).
rnd_clf = RandomForestClassifier(bootstrap=True, n_estimators=500, n_jobs=-1,
                                oob_score=True, random_state=42)
# 5-fold cross-validated accuracy per depth; refit=True retrains the best
# setting on the whole training split afterwards.
grid_search = GridSearchCV(rnd_clf, param_grid, cv=5, scoring='accuracy',
                          return_train_score=True, refit=True)
grid_search.fit(strain_prepared, strain_y)

In [69]:
# Best mean cross-validated accuracy found by the grid search.
grid_search.best_score_

In [70]:
# The forest with the best max_depth, refitted by the grid search (refit=True).
rnd_best = grid_search.best_estimator_

In [71]:
# Out-of-bag score of the refitted forest (available because oob_score=True).
rnd_best.oob_score_

In [72]:
# Validation accuracy of the tuned random forest.
y_pred = rnd_best.predict(val_prepared)
accuracy_score(val_y, y_pred)

In [73]:
# Checking Feature Importance
# Pair every prepared column with the importance the tuned forest assigned to
# it and list the pairs from most to least important.
feature_imp = pd.DataFrame({
    'Variable': strain_prepared.columns,
    'Imp': rnd_best.feature_importances_,
})
feature_imp.sort_values(by='Imp', ascending=False)

In [82]:
from tensorflow import keras
from tensorflow.keras import layers as layers

# Feed-forward binary classifier: three ReLU hidden layers (60 -> 30 -> 10
# units), each followed by 50% dropout, and a single sigmoid output unit.
model = keras.Sequential()
model.add(layers.Dense(60, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(30, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid'))
# Binary cross-entropy matches the sigmoid output; accuracy is tracked as well.
model.compile(optimizer="RMSprop", loss="binary_crossentropy", metrics=["accuracy"])
# Train for 30 epochs in batches of 100; the validation split is passed in so
# its loss and accuracy are recorded for every epoch too.
history = model.fit(strain_prepared, strain_y, epochs=30, batch_size=100, 
                   validation_data=(val_prepared, val_y))

In [83]:
# Names of the metrics recorded during training.
history.history.keys()

In [84]:
# Pull the per-epoch curves out of the training history.
# The object returned by model.fit keeps the name `history`; the tabular copy
# gets its own name so that this cell can be re-run without failing on
# `history.epoch`.
epochs = history.epoch
history_df = pd.DataFrame(history.history)
accuracy = history_df['accuracy']
val_accuracy = history_df['val_accuracy']
loss = history_df['loss']
val_loss = history_df['val_loss']

In [85]:
# One figure per metric: training curve in blue, validation curve in red.
curves = (
    ('Accuracy', accuracy, 'Training accuracy', val_accuracy, 'Validation accuracy'),
    ('Loss', loss, 'Training loss', val_loss, 'Validation loss'),
)
for title, train_curve, train_label, val_curve, val_label in curves:
    plt.plot(epochs, train_curve, 'b', label=train_label)
    plt.plot(epochs, val_curve, 'r', label=val_label)
    plt.title(title)
    plt.legend()
    plt.show()

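Something looks off here: the validation accuracy (loss) is higher (lower) than the training accuracy (loss).

A likely cause is the `Dropout(0.5)` layers: dropout is active while the training metrics are computed but switched off during validation, so the training numbers are measured on a handicapped network.

Things to check:

- Check the stratified split.

- Check how the other models performed on the training set and the validation set.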

# 