In [None]:
import pandas as pd

In [None]:
# Load the Spaceship Titanic training CSV and preview the first rows.
TRAIN_CSV_PATH = "../datasets/spaceship-titanic/train.csv"
df = pd.read_csv(TRAIN_CSV_PATH)
df.head()

In [None]:
# List the column labels of the loaded frame.
df.columns

In [None]:
# Distinct values of 'Transported' (the column later used as the label Y).
df['Transported'].unique()

In [None]:
# DONE 'PassengerId', = completely unique
# DONE 'HomePlanet', = nominal some nan
# 'CryoSleep', = bool some nan
# DONE 'Cabin', = nominal, unknown if has nan
# DONE 'Destination', = 3 nominal some nan
# 'Age', = ratio some nan
# 'VIP', = bool some nan
# 'RoomService', = ratio unknown if has nan
# 'FoodCourt', = ratio unknown if has nan
# 'ShoppingMall', = ratio unknown if has nan
# 'Spa', = ratio unknown if has nan
# 'VRDeck', = ratio unknown if has nan
# 'Name', = Nominal
# 'Transported' = bool only

# DECISION TREE

### Pre-processing

In [None]:
# Drop the categorical columns that only serve to name a passenger.
# The result is rebound (no inplace=True) and errors='ignore' is passed so the
# cell can be executed more than once: a second run used to raise KeyError
# because the columns were already gone.
df = df.drop(columns=['PassengerId', 'Name'], errors='ignore')
df

In [None]:
# One-hot encode HomePlanet: one indicator column per planet, appended to the
# frame in place of the original column. pd.get_dummies is called with its
# defaults, so no separate indicator is created for missing values.
HOME_PLANET_COLUMNS = {
    'Earth': 'isFromEarth',
    'Europa': 'isFromEuropa',
    'Mars': 'isFromMars',
}
t_df = pd.get_dummies(df['HomePlanet']).rename(columns=HOME_PLANET_COLUMNS)
new_df = pd.concat([df, t_df], axis=1).drop(columns=['HomePlanet'])
new_df

In [None]:
# One-hot encode Destination: the three destination labels become short
# indicator columns that replace the original column in new_df.
DESTINATION_COLUMNS = {
    '55 Cancri e': 'isToDest_Cancri',
    'PSO J318.5-22': 'isToDest_PSO',
    'TRAPPIST-1e': 'isToDest_TRAPPIST',
}
t_df = pd.get_dummies(df['Destination']).rename(columns=DESTINATION_COLUMNS)
new_df = pd.concat([new_df, t_df], axis=1).drop(columns=['Destination'])
new_df

In [None]:
# Cabin is split into deck and side in the next cell; the cabin number is ignored.
# Idea for later: bucket the cabin numbers into 3 equal ranges in case the
# numbering is based on location.
# Missing cabins get the placeholder "U/0/U" (U = unknown) so the split works.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the attribute: that chained in-place form operates on a temporary under
# pandas Copy-on-Write and would leave the NaNs in new_df.
new_df['Cabin'] = new_df['Cabin'].fillna("U/0/U")



In [None]:
# Break every "deck/number/side" cabin string into its parts, keep the first
# (deck) and third (side) as new columns, then discard the raw Cabin column.
cabin_parts = [cabin.split("/") for cabin in new_df['Cabin']]
new_df['Cabin Deck'] = [parts[0] for parts in cabin_parts]
new_df['Cabin Side'] = [parts[2] for parts in cabin_parts]
new_df = new_df.drop(columns=['Cabin'])
new_df

In [None]:
# Inspect the distinct deck labels before they are one-hot encoded.
new_df['Cabin Deck'].unique()

In [None]:
# One-hot encode "Cabin Deck": each deck label becomes an isDeck_<label> column.
COLUMN_NAME = "Cabin Deck"
COLUMN_RENAME_SCHEME = "isDeck_%s"

# Map every label present in the column to its indicator column name.
col_name = {
    label: COLUMN_RENAME_SCHEME % label
    for label in new_df[COLUMN_NAME].unique()
}

# Build the indicators, append them, and remove the encoded source column.
t_df = pd.get_dummies(new_df[COLUMN_NAME]).rename(columns=col_name)
new_df = pd.concat([new_df, t_df], axis=1).drop(columns=[COLUMN_NAME])
new_df

In [None]:
# Inspect the distinct side labels before they are one-hot encoded.
new_df['Cabin Side'].unique()

In [None]:
# One-hot encode "Cabin Side": each side label becomes an isSide_<label> column.
COLUMN_NAME = "Cabin Side"
COLUMN_RENAME_SCHEME = "isSide_%s"

# Map every label present in the column to its indicator column name.
col_name = {
    label: COLUMN_RENAME_SCHEME % label
    for label in new_df[COLUMN_NAME].unique()
}

# Build the indicators, append them, and remove the encoded source column.
t_df = pd.get_dummies(new_df[COLUMN_NAME]).rename(columns=col_name)
new_df = pd.concat([new_df, t_df], axis=1).drop(columns=[COLUMN_NAME])
new_df

In [None]:
# Treat a missing VIP flag as its own "unknown" category.
# The filled column is assigned back rather than using fillna(inplace=True) on
# new_df["VIP"]: the chained in-place call works on a temporary under pandas
# Copy-on-Write, which would silently drop the "unknown" indicator below.
new_df["VIP"] = new_df["VIP"].fillna("unknown")

# One-hot encode VIP into isVIP_<label> columns.
COLUMN_NAME = "VIP"
COLUMN_RENAME_SCHEME = "isVIP_%s"

t_df = pd.get_dummies(new_df[COLUMN_NAME])

# Renaming scheme: map every label in the column to its indicator column name.
# The label is wrapped in a tuple so %-formatting always sees exactly one value.
col_name = {
    label: COLUMN_RENAME_SCHEME % (label,)
    for label in new_df[COLUMN_NAME].unique()
}

t_df = t_df.rename(columns=col_name)
new_df = pd.concat([new_df, t_df], axis=1).drop(columns=[COLUMN_NAME])
new_df

In [None]:
# Treat a missing CryoSleep flag as its own "unknown" category.
# The filled column is assigned back rather than using fillna(inplace=True) on
# new_df["CryoSleep"]: the chained in-place call works on a temporary under
# pandas Copy-on-Write, which would silently drop the "unknown" indicator below.
new_df["CryoSleep"] = new_df["CryoSleep"].fillna("unknown")

# One-hot encode CryoSleep into isCryoSleep_<label> columns.
COLUMN_NAME = "CryoSleep"
COLUMN_RENAME_SCHEME = "isCryoSleep_%s"

t_df = pd.get_dummies(new_df[COLUMN_NAME])

# Renaming scheme: map every label in the column to its indicator column name.
# The label is wrapped in a tuple so %-formatting always sees exactly one value.
col_name = {
    label: COLUMN_RENAME_SCHEME % (label,)
    for label in new_df[COLUMN_NAME].unique()
}

t_df = t_df.rename(columns=col_name)
new_df = pd.concat([new_df, t_df], axis=1).drop(columns=[COLUMN_NAME])
new_df

In [None]:
# Mean-impute the numeric columns (Age plus the five spending columns) and keep
# an isUnknown<col> flag recording which rows were originally missing.
for col in ["Age", "RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]:
    # Capture missingness before the NaNs are overwritten by the imputed value.
    new_df["isUnknown%s" % col] = new_df[col].isna()

    # Assign the filled column back instead of fillna(inplace=True) on the
    # selection: the chained in-place call is a no-op under pandas
    # Copy-on-Write and would leave NaNs in the features.
    new_df[col] = new_df[col].fillna(new_df[col].mean()).astype(float)


In [None]:
# Cast every boolean-typed column to int so the frame holds numbers instead of
# True/False.
bool_cols = new_df.select_dtypes(include='bool').columns
new_df = new_df.astype({name: int for name in bool_cols})
new_df


In [None]:
# Split off the label: pop() removes 'Transported' from new_df and returns it,
# leaving only feature columns behind.
transported = new_df.pop("Transported")
transported

In [None]:
# COMBINED
# The whole preprocessing pipeline as one re-runnable step. It reads `df`
# without mutating it and rebuilds `new_df` / `transported` from scratch, so it
# works whether or not the step-by-step cells were executed first, and it can
# be executed repeatedly.

NUMERIC_COLUMNS = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]


def _one_hot(frame, column, names):
    """Return a copy of `frame` where `column` is replaced by indicator columns.

    `names` is either a mapping {label: new column name} or a '%s'-style format
    string applied to every label. The input frame is left untouched.
    """
    dummies = pd.get_dummies(frame[column])
    if isinstance(names, str):
        dummies = dummies.rename(columns=lambda label: names % (label,))
    else:
        dummies = dummies.rename(columns=names)
    return pd.concat([frame, dummies], axis=1).drop(columns=[column])


def preprocess(raw):
    """Convert the raw passenger frame into numeric features and the target.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame as read from the training CSV. 'PassengerId' and 'Name' may
        already have been removed by an earlier cell.

    Returns
    -------
    (features, target) : tuple of (pandas.DataFrame, pandas.Series)
        `target` is the 'Transported' column; `features` is everything else.

    Raises
    ------
    KeyError
        If an expected input column (e.g. 'Transported') is missing.
    """
    # 1. Drop naming columns; errors='ignore' because `df` may already lack
    #    them (the original in-place drop raised KeyError on a second run).
    frame = raw.drop(columns=['PassengerId', 'Name'], errors='ignore')

    # 2. HomePlanet
    frame = _one_hot(frame, 'HomePlanet', {
        'Earth': 'isFromEarth',
        'Europa': 'isFromEuropa',
        'Mars': 'isFromMars',
    })

    # 3. Destination
    frame = _one_hot(frame, 'Destination', {
        '55 Cancri e': 'isToDest_Cancri',
        'PSO J318.5-22': 'isToDest_PSO',
        'TRAPPIST-1e': 'isToDest_TRAPPIST',
    })

    # 4-5. Cabin -> deck / side; missing cabins use the "U/0/U" placeholder.
    cabin_parts = frame['Cabin'].fillna("U/0/U").str.split("/", expand=True)
    frame['Cabin Deck'] = cabin_parts[0]
    frame['Cabin Side'] = cabin_parts[2]
    frame = frame.drop(columns=['Cabin'])

    # 6-7. Deck and side indicators. The side encoding was missing from this
    #      cell before, which left a string column in the features.
    frame = _one_hot(frame, 'Cabin Deck', "isDeck_%s")
    frame = _one_hot(frame, 'Cabin Side', "isSide_%s")

    # 8. VIP and CryoSleep, with missing values as an explicit "unknown"
    #    category. VIP was previously skipped here as well.
    for column, scheme in (("VIP", "isVIP_%s"), ("CryoSleep", "isCryoSleep_%s")):
        frame[column] = frame[column].fillna("unknown")
        frame = _one_hot(frame, column, scheme)

    # 9. Mean-impute the numeric columns and flag which rows were missing.
    for col in NUMERIC_COLUMNS:
        frame["isUnknown%s" % col] = frame[col].isna()
        frame[col] = frame[col].fillna(frame[col].mean()).astype(float)

    # 10. Booleans -> ints.
    bool_cols = frame.select_dtypes(include='bool').columns
    frame[bool_cols] = frame[bool_cols].astype(int)

    # 11. Separate the label from the features.
    target = frame.pop("Transported")
    return frame, target


new_df, transported = preprocess(df)
transported


### Preprocessing Finished

In [None]:
# Show the dtype of every feature column before the frame is handed to the classifier.
new_df.dtypes

In [None]:
from sklearn import tree

# Fit a decision tree on the preprocessed features.
# X is a copy so later edits to new_df cannot change the training matrix.
X = new_df.copy()
Y = transported

# random_state is fixed because scikit-learn permutes the candidate features at
# every split; without a seed, ties can be broken differently on each run and
# the fitted tree is not reproducible.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X, Y)

In [None]:
# Draw only the top levels of the fitted tree: the classifier was grown without
# a depth limit, so rendering every node produces an unreadable figure.
# Feature names come from the training matrix X so nodes show column names
# instead of x[i]; class names are listed in ascending label order (0/False
# first). The trailing ';' suppresses the list-of-artists repr.
tree.plot_tree(
    clf,
    max_depth=3,
    feature_names=[str(name) for name in X.columns],
    class_names=["not transported", "transported"],
    filled=True,
);