In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Problem Description

The Spaceship Titanic was an interstellar passenger liner launched a month ago. With almost 13,000 passengers on board, the vessel set out on its maiden voyage transporting emigrants from our solar system to three newly habitable exoplanets orbiting nearby stars.

While rounding Alpha Centauri en route to its first destination—the torrid 55 Cancri E—the unwary Spaceship Titanic collided with a spacetime anomaly hidden within a dust cloud. Sadly, it met a similar fate as its namesake from 1000 years before. Though the ship stayed intact, almost half of the passengers were transported to an alternate dimension!

To help rescue crews and retrieve the lost passengers, you are challenged to **predict which passengers were transported by the anomaly using records recovered from the spaceship’s damaged computer system.**

# Sources 

I would like to thank [Tracy Porter](http://www.kaggle.com/code/tracyporter/spaceship-t?kernelSessionId=88889403) for her notebook and [video](http://www.youtube.com/watch?v=ParD52xRNsQ) walkthrough of this competition.

The article [How to Improve Logistic Regression?](https://medium.com/analytics-vidhya/how-to-improve-logistic-regression-b956e72f4492) explains how to tune ML models.




# Procedure 

- Exploratory data analysis
- Feature engineering 
- Processing the data
- Building the model

In [None]:
# Import libraries

import matplotlib.pyplot as plt
from matplotlib import patches

import seaborn as sns

import warnings

import statsmodels.api as sm #Cross-sectional models and methods.
import statsmodels.formula.api as smf #A convenience interface for specifying models using formula strings and DataFrames

import sklearn

# Function to deal with missing values via imputation
from sklearn.impute import SimpleImputer

# Function that converts categorical values into numerical values via ordinal encoding or one-hot encoding
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Function to split data into different groups
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold

# Statistics functions
from scipy.stats import norm
from scipy import stats
from scipy.stats import chi2_contingency
from scipy.stats import chi2

# Suppressing a warning 
warnings.filterwarnings("ignore") 

# It is a magic function that renders the figure in the notebook
%matplotlib inline 

# Changing the figure size of a seaborn axes 
sns.set(rc={"figure.figsize": (20, 15)})

# The style parameters control properties like the color of the background and whether a grid is enabled by default.
sns.set_style("whitegrid")

print("Import complete")

In [None]:
# Load the data: training set, test set and the sample submission file.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/spaceship-titanic/train.csv',
        '../input/spaceship-titanic/test.csv',
        '../input/spaceship-titanic/sample_submission.csv',
    )
)
# Both data sets kept together in one list.
combine = [train_df, test_df]

# Exploratory Data Analysis

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Column names, dtypes and non-null counts of the training data.
train_df.info()

In [None]:
# Summary statistics of the training data, transposed so each column is shown as a row.
train_df.describe().T

In [None]:
# Summary of the object-dtype columns only (include='O'), transposed.
train_df.describe(include='O').T

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
# Column names, dtypes and non-null counts of the test data.
test_df.info()

In [None]:
# Summary statistics of the test data.
test_df.describe()

In [None]:
# Preview the first rows of the sample submission file.
submission_df.head()

In [None]:
# Column names, dtypes and non-null counts of the sample submission file.
submission_df.info()

In [None]:
# Summary statistics of the sample submission file.
submission_df.describe()

## Data cleaning

In [None]:
# Checking for missing data
# Toy example: a small Series with two missing entries, and a mask that is
# True wherever a value is present.
missing = np.nan
s = pd.Series([1, 2, 3, missing, missing, 4])

s.notna()

In [None]:
# Number of missing values in each column of the training data.
train_df.isnull().sum()

In [None]:
# Keep only the training rows that have a value in every column.
train1 = train_df.dropna(axis=0, how='any')
train1

In [None]:
# Analyse target (Transported)
# Encode the target as integers (False -> 0, True -> 1).
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selected from the frame is a chained in-place update, which is
# not guaranteed to write through to train1.
train1['Transported'] = train1['Transported'].astype(int)
train1['Transported']

In the encoded target, 0 means the passenger was not transported (False) and 1 means the passenger was transported (True).

In [None]:
# Plot the distribution of the encoded target values.
sns.displot(train1['Transported'])

In [None]:
# Number of rows for each distinct value of the target.
trans_count = train1['Transported'].value_counts()
trans_count

In [None]:
# Share of the rows in train1 that falls on each target value.
trans_percent = trans_count.div(len(train1))
trans_percent

In [None]:
# Scatter of Age against FoodCourt; colour encodes the target value and the
# marker size is taken from VRDeck.
# Each subset is selected once from train1, so the x, y and size values are
# guaranteed to come from the same rows (no mask built from another DataFrame).
plt.figure(figsize=(25, 7))
ax = plt.subplot()
for flag, colour, label in ((1, 'green', 'Transported'), (0, 'red', 'Not transported')):
    subset = train1[train1['Transported'] == flag]
    ax.scatter(subset['Age'], subset['FoodCourt'], c=colour, s=subset['VRDeck'], label=label)
ax.set(xlabel='Age', ylabel='FoodCourt')
ax.legend();

In [None]:
# Drop the target
# Keep the target in its own Series and remove it from the feature frame.
# The guard makes the cell safe to re-run: once the column has been removed,
# the existing `target` is left untouched instead of raising a KeyError.
if 'Transported' in train1.columns:
    target = train1['Transported']
    # Re-bind rather than drop in place, so the frame is never half-modified.
    train1 = train1.drop(columns=['Transported'])
train1

In [None]:
# combine train and test
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way. Index labels of both frames are kept unchanged, as append did.
combi = pd.concat([train1, test_df])
combi

In [None]:
# Column names, dtypes and non-null counts of the combined frame.
combi.info()

In [None]:
# Summary statistics of the combined frame.
combi.describe()

In [None]:
# Check for null values: number of missing entries in each column of combi
combi.isnull().sum()

In [None]:
# Impute null values
# The first import only has a side effect: it enables IterativeImputer.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

imp = IterativeImputer(random_state=42)

# Placeholder written into missing datetime values.
date = pd.Timestamp('2200-01-01')

# Fill each column according to its dtype. Every result is assigned back to the
# frame: fillna(..., inplace=True) on a selected column is a chained in-place
# update that is not guaranteed to reach combi.
for col in combi.columns:
    col_dtype = combi[col].dtype
    if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
        # Text-like columns: mark missing entries explicitly.
        combi[col] = combi[col].fillna("not listed")
    elif pd.api.types.is_integer_dtype(col_dtype):
        combi[col] = combi[col].fillna(combi[col].mean())
    elif pd.api.types.is_float_dtype(col_dtype):
        # NOTE(review): the imputer is fitted on one column at a time, so it has
        # no other features to model from; fit it once on all numeric columns
        # if multivariate imputation is what is intended.
        imputed = imp.fit_transform(combi[col].to_numpy().reshape(-1, 1))
        # fit_transform returns an (n, 1) array; flatten it back to one column.
        combi[col] = imputed.ravel()
    elif pd.api.types.is_datetime64_dtype(col_dtype):
        combi[col] = combi[col].fillna(date)
combi

In [None]:
# Visual check for missing values that remain after imputation.
# The combined frame is plotted: train1 was produced by dropna(), so plotting
# it could never show a missing cell.
sns.heatmap(combi.isnull(),yticklabels=False,cbar=False)

## Data visualization 

In [None]:
# Home Planet: plot the distribution of the HomePlanet values
sns.displot(combi['HomePlanet'])

In [None]:
# Number of rows for each distinct HomePlanet value.
home_count = combi['HomePlanet'].value_counts()
home_count

In [None]:
# Share of all rows in combi that falls on each HomePlanet value.
home_percent = home_count.div(len(combi))
home_percent

In [None]:
# Pie chart of the HomePlanet shares.
# Labels are read from the Series index, so every wedge is paired with its own
# category regardless of the order in which value_counts() returned them.
mylabels = list(home_percent.index)
plt.pie(home_percent, labels=mylabels)
plt.show() 

In [None]:
# Encode HomePlanet as integer codes.
# The result is assigned back to the column: replace(..., inplace=True) on a
# selected column is a chained in-place update that is not guaranteed to reach
# combi. astype(int) makes the resulting dtype explicit.
combi['HomePlanet'] = combi['HomePlanet'].replace({"Earth": 1, "Europa": 2, "Mars": 3, "not listed": 4}).astype(int)
combi['HomePlanet']

In [None]:
# Treat passengers whose CryoSleep value is "not listed" as not in cryosleep.
# .loc performs the write in a single indexing step; the chained form
# combi['CryoSleep'][mask] = ... may assign to a temporary copy instead.
combi.loc[combi['CryoSleep'] == 'not listed', 'CryoSleep'] = False

In [None]:
# Print the row at integer position 6674 of combi.
# NOTE(review): the position is hard-coded; presumably a spot check of one row of interest - confirm or remove.
print(combi.iloc[6674])

In [None]:
# Encode CryoSleep as integers (False -> 0, True -> 1).
# The original expression discarded the result of replace(), leaving the column
# unchanged; the encoded values are now stored back in combi.
combi['CryoSleep'] = combi['CryoSleep'].replace({False: 0, True: 1}).astype(int)
combi['CryoSleep']

In [None]:
# Distribution of the CryoSleep values.
# sns.distplot is deprecated in seaborn; displot is its replacement and is the
# function already used for the other distribution plots in this notebook.
sns.displot(combi['CryoSleep'])

In [None]:
# Number of rows for each distinct CryoSleep value.
sleep_count = combi['CryoSleep'].value_counts()
sleep_count

In [None]:
# Share of all rows in combi that falls on each CryoSleep value.
sleep_percent = sleep_count.div(len(combi))
sleep_percent

In [None]:
# Cast CryoSleep to an integer column (boolean values become 0 / 1).
combi['CryoSleep'] = combi['CryoSleep'].astype(int)
combi['CryoSleep']

In [None]:
# Destination: plot the distribution of the Destination values
sns.displot(combi['Destination'])

In [None]:
# Number of rows for each distinct Destination value.
dest_count = combi['Destination'].value_counts()
dest_count

In [None]:
# Share of all rows in combi that falls on each Destination value.
dest_percent = dest_count.div(len(combi))
dest_percent

In [None]:
# Pie chart of the Destination shares.
# Labels are read from the Series index, so every wedge is paired with its own
# category regardless of the order in which value_counts() returned them.
mylabels = list(dest_percent.index)
plt.pie(dest_percent, labels=mylabels)
plt.show() 

In [None]:
# Encode Destination as integer codes.
# The result is assigned back to the column: replace(..., inplace=True) on a
# selected column is a chained in-place update that is not guaranteed to reach
# combi. astype(int) makes the resulting dtype explicit.
combi['Destination'] = combi['Destination'].replace({"TRAPPIST-1e": 1, "55 Cancri e": 2, "PSO J318.5-22": 3, "not listed": 4}).astype(int)
combi['Destination']

In [None]:
# Age
# Bucket Age into four labelled bands; each band includes its upper edge.
age_bin_edges = [-1, 18, 40, 65, 100]
age_band_names = ['child', 'young adult', 'middle age', 'pensioner']
combi['Age_group'] = pd.cut(combi['Age'], bins=age_bin_edges, labels=age_band_names)
combi['Age_group']

In [None]:
# Plot the distribution of the Age_group bands.
sns.displot(combi['Age_group'])

In [None]:
# Number of rows in each Age_group band.
age_count = combi['Age_group'].value_counts()
age_count

In [None]:
# Share of all rows in combi that falls in each Age_group band.
age_percent = age_count.div(len(combi))
age_percent

In [None]:
# Pie chart of the Age_group shares.
# Labels are read from the Series index, so every wedge is paired with its own
# band regardless of the order in which value_counts() returned them.
mylabels = list(age_percent.index)
plt.pie(age_percent, labels=mylabels)
plt.show()

In [None]:
# Encode the Age_group bands as integer codes by renaming the categories.
# The renamed column is assigned back to combi: replace(..., inplace=True) on
# a selected column is a chained in-place update that is not guaranteed to
# reach the frame. Categories missing from the mapping are passed through, so
# re-running the cell is harmless.
# NOTE(review): the codes do not follow age order ("young adult" is 1, "child" is 2) - confirm this is intended.
combi['Age_group'] = combi['Age_group'].cat.rename_categories({"young adult": 1, "child": 2, "middle age": 3, "pensioner": 4})
combi['Age_group']

In [None]:
# Cast Age_group to a plain integer column.
combi['Age_group'] = combi['Age_group'].astype(int)

In [None]:
# VIP
# Treat passengers whose VIP value is "not listed" as non-VIP.
# .loc performs the write in a single indexing step; the chained form
# combi['VIP'][mask] = ... may assign to a temporary copy instead.
combi.loc[combi['VIP'] == 'not listed', 'VIP'] = False

In [None]:
# Encode VIP as integers (False -> 0, True -> 1).
# The original expression discarded the result of replace(), leaving the column
# unchanged; the encoded values are now stored back in combi.
combi['VIP'] = combi['VIP'].replace({False: 0, True: 1}).astype(int)
combi['VIP']

In [None]:
# Distribution of the VIP values.
# sns.distplot is deprecated in seaborn; displot is its replacement and is the
# function already used for the other distribution plots in this notebook.
sns.displot(combi['VIP'])

In [None]:
# Number of rows for each distinct VIP value.
vip_count = combi['VIP'].value_counts()
vip_count

In [None]:
# Share of all rows in combi that falls on each VIP value.
vip_percent = vip_count.div(len(combi))
vip_percent

In [None]:
# Cast VIP to an integer column (boolean values become 0 / 1).
combi['VIP'] = combi['VIP'].astype(int)
combi['VIP']

In [None]:
# Room Service
# Violin plot of the RoomService amounts. The column is passed explicitly as x:
# a bare positional argument is not interpreted consistently across seaborn
# versions.
sns.violinplot(x=combi['RoomService'])

In [None]:
# Largest RoomService amount in combi.
rm_service_high = combi['RoomService'].max()
print(rm_service_high)

In [None]:
# Bin the RoomService amounts into three labelled bands.
# The top band is open-ended (np.inf) instead of stopping at a hard-coded
# maximum, so amounts above the old cap are labelled 'high' rather than
# silently becoming NaN.
combi['Room_Service_group'] = pd.cut(x=combi['RoomService'], bins=[-1, 2000, 8000, np.inf], labels=['low', 'med', 'high'])
combi['Room_Service_group']

In [None]:
# Plot the distribution of the Room_Service_group bands.
sns.displot(combi['Room_Service_group'])

In [None]:
# Number of rows in each Room_Service_group band.
rm_service_count = combi['Room_Service_group'].value_counts()
rm_service_count

In [None]:
# Share of all rows in combi that falls in each Room_Service_group band.
rm_service_percent = rm_service_count.div(len(combi))
rm_service_percent

In [None]:
# Pie chart of the Room_Service_group shares.
# Labels are read from the Series index, so every wedge is paired with its own
# band regardless of the order in which value_counts() returned them.
mylabels = list(rm_service_percent.index)
plt.pie(rm_service_percent, labels=mylabels)
plt.show()

In [None]:
# Encode the Room_Service_group bands as the codes 1-3 by renaming the
# categories. The renamed column is assigned back to combi:
# replace(..., inplace=True) on a selected column is a chained in-place update
# that is not guaranteed to reach the frame. Categories missing from the
# mapping are passed through, so re-running the cell is harmless.
combi['Room_Service_group'] = combi['Room_Service_group'].cat.rename_categories({"low": 1, "med": 2, "high": 3})
combi['Room_Service_group']

In [None]:
# Food
# Violin plot of the FoodCourt amounts. The column is passed explicitly as x:
# a bare positional argument is not interpreted consistently across seaborn
# versions.
sns.violinplot(x=combi['FoodCourt'])

In [None]:
# Largest FoodCourt amount in combi.
food_high = combi['FoodCourt'].max()
print(food_high)

In [None]:
# Bin the FoodCourt amounts into three labelled bands.
# The top band is open-ended (np.inf) instead of stopping at a hard-coded
# maximum, so amounts above the old cap are labelled 'high' rather than
# silently becoming NaN.
combi['Food_Court_group'] = pd.cut(x=combi['FoodCourt'], bins=[-1, 5000, 20000, np.inf], labels=['low', 'med', 'high'])
combi['Food_Court_group']

In [None]:
# Plot the distribution of the Food_Court_group bands.
sns.displot(combi['Food_Court_group'])

In [None]:
# Number of rows in each Food_Court_group band.
fd_court_count = combi['Food_Court_group'].value_counts()
fd_court_count

In [None]:
# Share of all rows in combi that falls in each Food_Court_group band.
fd_court_percent = fd_court_count.div(len(combi))
fd_court_percent

In [None]:
# Pie chart of the Food_Court_group shares.
# Labels are read from the Series index, so every wedge is paired with its own
# band regardless of the order in which value_counts() returned them.
mylabels = list(fd_court_percent.index)
plt.pie(fd_court_percent, labels=mylabels)
plt.show()

In [None]:
# Encode the Food_Court_group bands as the codes 1-3 by renaming the
# categories. The renamed column is assigned back to combi:
# replace(..., inplace=True) on a selected column is a chained in-place update
# that is not guaranteed to reach the frame. Categories missing from the
# mapping are passed through, so re-running the cell is harmless.
combi['Food_Court_group'] = combi['Food_Court_group'].cat.rename_categories({"low": 1, "med": 2, "high": 3})
combi['Food_Court_group']

In [None]:
# Shopping mall
# Violin plot of the ShoppingMall amounts. The column is passed explicitly as
# x: a bare positional argument is not interpreted consistently across seaborn
# versions.
sns.violinplot(x=combi['ShoppingMall'])

In [None]:
# Largest ShoppingMall amount in combi.
shop_high = combi['ShoppingMall'].max()
print(shop_high)

In [None]:
# Bin the ShoppingMall amounts into three labelled bands.
# The top band is open-ended (np.inf) instead of stopping at a hard-coded
# maximum, so amounts above the old cap are labelled 'high' rather than
# silently becoming NaN.
combi['Shopping_group'] = pd.cut(x=combi['ShoppingMall'], bins=[-1, 2000, 8000, np.inf], labels=['low', 'med', 'high'])
combi['Shopping_group']

In [None]:
# Plot the distribution of the Shopping_group bands.
sns.displot(combi['Shopping_group'])

In [None]:
# Number of rows in each Shopping_group band.
shopping_count = combi['Shopping_group'].value_counts()
shopping_count

In [None]:
# Share of all rows in combi that falls in each Shopping_group band.
shopping_percent = shopping_count.div(len(combi))
shopping_percent

In [None]:
# Pie chart of the Shopping_group shares.
# Labels are read from the Series index, so every wedge is paired with its own
# band regardless of the order in which value_counts() returned them.
mylabels = list(shopping_percent.index)
plt.pie(shopping_percent, labels=mylabels)
plt.show()

In [None]:
# Encode the Shopping_group bands as the codes 1-3 by renaming the categories.
# The renamed column is assigned back to combi: replace(..., inplace=True) on
# a selected column is a chained in-place update that is not guaranteed to
# reach the frame. Categories missing from the mapping are passed through, so
# re-running the cell is harmless.
combi['Shopping_group'] = combi['Shopping_group'].cat.rename_categories({"low": 1, "med": 2, "high": 3})
combi['Shopping_group']

In [None]:
# Spa
# Violin plot of the Spa amounts. The column is passed explicitly as x: a bare
# positional argument is not interpreted consistently across seaborn versions.
sns.violinplot(x=combi['Spa'])

In [None]:
# Largest Spa amount in combi.
spa_high = combi['Spa'].max()
print(spa_high)

In [None]:
# Bin the Spa amounts into three labelled bands.
# The top band is open-ended (np.inf) instead of stopping at a hard-coded
# maximum, so amounts above the old cap are labelled 'high' rather than
# silently becoming NaN.
combi['Spa_group'] = pd.cut(x=combi['Spa'], bins=[-1, 5000, 15000, np.inf], labels=['low', 'med', 'high'])
combi['Spa_group']

In [None]:
# Plot the distribution of the Spa_group bands.
sns.displot(combi['Spa_group'])

In [None]:
# Number of rows in each Spa_group band.
spa_count = combi['Spa_group'].value_counts()
spa_count

In [None]:
# Share of all rows in combi that falls in each Spa_group band.
spa_percent = spa_count.div(len(combi))
spa_percent

In [None]:
# Pie chart of the Spa_group shares.
# Labels are read from the Series index, so every wedge is paired with its own
# band regardless of the order in which value_counts() returned them.
mylabels = list(spa_percent.index)
plt.pie(spa_percent, labels=mylabels)
plt.show()

In [None]:
# Encode the Spa_group bands as the codes 1-3 by renaming the categories.
# The renamed column is assigned back to combi: replace(..., inplace=True) on
# a selected column is a chained in-place update that is not guaranteed to
# reach the frame. Categories missing from the mapping are passed through, so
# re-running the cell is harmless.
combi['Spa_group'] = combi['Spa_group'].cat.rename_categories({"low": 1, "med": 2, "high": 3})
combi['Spa_group']

In [None]:
# VR deck
# Violin plot of the VRDeck amounts. The column is passed explicitly as x: a
# bare positional argument is not interpreted consistently across seaborn
# versions.
sns.violinplot(x=combi['VRDeck'])

In [None]:
# Largest VRDeck amount in combi.
vr_high = combi['VRDeck'].max()
print(vr_high)

In [None]:
# Bin the VRDeck amounts into three labelled bands.
# The top band is open-ended (np.inf) instead of stopping at a hard-coded
# maximum, so amounts above the old cap are labelled 'high' rather than
# silently becoming NaN.
combi['VR_group'] = pd.cut(x=combi['VRDeck'], bins=[-1, 5000, 15000, np.inf], labels=['low', 'med', 'high'])
combi['VR_group']

In [None]:
# Plot the distribution of the VR_group bands.
sns.displot(combi['VR_group'])

In [None]:
# Number of rows in each VR_group band.
vr_count = combi['VR_group'].value_counts()
vr_count

In [None]:
# Share of all rows in combi that falls in each VR_group band.
vr_percent = vr_count.div(len(combi))
vr_percent

In [None]:
# Pie chart of the VR_group shares.
# Labels are read from the Series index, so every wedge is paired with its own
# band regardless of the order in which value_counts() returned them.
mylabels = list(vr_percent.index)
plt.pie(vr_percent, labels=mylabels)
plt.show()

In [None]:
# Encode the VR_group bands as the codes 1-3 by renaming the categories.
# The renamed column is assigned back to combi: replace(..., inplace=True) on
# a selected column is a chained in-place update that is not guaranteed to
# reach the frame. Categories missing from the mapping are passed through, so
# re-running the cell is harmless.
combi['VR_group'] = combi['VR_group'].cat.rename_categories({"low": 1, "med": 2, "high": 3})
combi['VR_group']

# Build the Model

In [None]:
# Review the columns of combi (names, dtypes, non-null counts) before choosing the model features
combi.info()

In [None]:
# Define X and y
# Columns fed to the model.
features = [
    "HomePlanet",
    "CryoSleep",
    "Destination",
    "Age_group",
    "Room_Service_group",
    "Food_Court_group",
    "Shopping_group",
    "Spa_group",
    "VR_group",
]

# The first len(train1) rows of combi are the training rows; the rest are the test rows.
n_train_rows = len(train1)
feature_frame = combi[features]

y = target
X = feature_frame[:n_train_rows]
X_test = feature_frame[n_train_rows:]

In [None]:
# Heatmap
# Pairwise correlation between the selected feature columns, drawn as a heatmap.
cmap = combi.loc[:, features].corr()
sns.heatmap(data=cmap)

In [None]:
# Split dataset for training and validating
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation, shuffled and stratified on y.
split_options = dict(test_size=0.10, random_state=1, stratify=y, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# Shapes of every split, plus the test features, as a quick sanity check.
tuple(part.shape for part in (X_train, X_val, y_train, y_val, X_test))

In [None]:
# Select model
from sklearn.linear_model import LogisticRegression

# Build the classifier, fit it on the training split and report training accuracy.
model = LogisticRegression(C=10, random_state=42)
model.fit(X_train, y_train)
train_accuracy = model.score(X_train, y_train)
print(train_accuracy)

In [None]:
# Predict validation
# Keep the validation predictions for later inspection and report validation accuracy.
y_pred = model.predict(X_val)
val_accuracy = model.score(X_val, y_val)
print(val_accuracy)

In [None]:
# Confusion matrix 
from sklearn.metrics import confusion_matrix

# Compare the validation labels with the validation predictions.
val_confusion = confusion_matrix(y_val, y_pred)
print(val_confusion)

In [None]:
# Predict on X_test
# The model predicts the integer labels 0 / 1; cast them straight to booleans
# instead of converting to text and rewriting the digits. Boolean values are
# written to CSV as True / False, the same text the string replacement produced.
predictions = model.predict(X_test).astype(bool)
predictions

The fitted model is now used to generate predictions for the test data; these predictions are what gets written to the submission file.

In [None]:
# Prepare and submit 
# Store the predictions in the submission frame, write it to disk, then read
# the file back to check what was saved.
submission_path = 'submission.csv'
submission_df['Transported'] = predictions
submission_df.to_csv(submission_path, index=False)
my_submission = pd.read_csv(submission_path)
my_submission