In [73]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy

import matplotlib.pyplot as plt

In [74]:
train_data = pd.read_csv("./train.csv")
test_data = pd. read_csv("./test.csv")


In [75]:
missing_in_train = sum([True for idx,row in train_data.iterrows() if any(row.isnull())])
missing_in_test = sum([True for idx,row in test_data.iterrows() if any(row.isnull())])

f"There are {missing_in_train} missing rows in train and {missing_in_test} missing rows in test"

'There are 2087 missing rows in train and 996 missing rows in test'

In [85]:
def make_frame_ready(source_frame: pd.DataFrame) -> pd.DataFrame:
    

    #make new column group from the passengerId
    source_frame["Group"] = source_frame["PassengerId"].apply(lambda x: x.split("_")[0])

    #make new column family from the name of the passenger
    source_frame["Family"] = source_frame["Name"].apply(lambda x: str(x).split(" ")[-1])


    #impute missing family from group
    #source_frame["Family"] = source_frame.groupby("Group")["Family"].ffill().bfill()
    source_frame["Family"] = source_frame["Family"].fillna(source_frame.groupby("Group")["Family"].agg(lambda x: pd.Series.mode(x, dropna=True)))

    #TODO impute missing cabins from families
    source_frame["Cabin"] = source_frame["Cabin"].fillna(source_frame.groupby("Group")["Cabin"].agg(lambda x: pd.Series.mode(x, dropna=True)))
    source_frame["Cabin"].ffill(inplace=True)

    #split cabin infor into three parts
    source_frame[["Deck", "Num", "shipSide"]] = source_frame["Cabin"].str.split("/", expand=True)
    source_frame["Num"] = source_frame["Num"].astype(np.float64)

    #Put cabin number into bins
    source_frame["NumBin"] = pd.cut(source_frame["Num"], bins=12)

    #create age bins
    source_frame["AgeBin"] = pd.cut(source_frame["Age"], bins=12)

    #set spending for cryosleepers
    source_frame.loc[source_frame["CryoSleep"] == True ,["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]] = 0.0

    # set spending of all kids to zero
    source_frame.loc[source_frame["Age"] <= 12, ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]] = 0.0

    #create totalSpending column
    source_frame["totalSpent"] = source_frame[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]].sum(axis=1)

    #set age of all people not spending to average age for people 12 and under
    source_frame["Age"] = np.where((source_frame.CryoSleep == False) & (
    source_frame.Age.isna()) & (source_frame.totalSpent == 0), 5, source_frame.Age)
    
    #impute VIP status by spending   
    source_frame.loc[(source_frame.VIP.isnull()) & (source_frame.totalSpent > 3500), "VIP"] = True
    source_frame["VIP"].fillna(False, inplace=True)


    source_frame.fillna(source_frame.select_dtypes(include=np.float64).mean().iloc[0], inplace=True)

    source_frame["NumBin"] = source_frame["NumBin"].cat.add_categories("unknown")
    source_frame["NumBin"].fillna("unknown", inplace=True)

    source_frame["AgeBin"] = source_frame["AgeBin"].cat.add_categories("unknown")
    source_frame["AgeBin"].fillna("unknown", inplace=True)

    source_frame.fillna(source_frame.select_dtypes(include=[object], exclude="category").mode().iloc[0], inplace=True)

    #source_frame.select_dtypes([object]).fillna(lambda x: x.mode().iloc[0], inplace=True)


    return source_frame

In [86]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   PassengerId   8693 non-null   object  
 1   HomePlanet    8492 non-null   object  
 2   CryoSleep     8476 non-null   object  
 3   Cabin         8693 non-null   object  
 4   Destination   8511 non-null   object  
 5   Age           8521 non-null   float64 
 6   VIP           8693 non-null   bool    
 7   RoomService   8586 non-null   float64 
 8   FoodCourt     8587 non-null   float64 
 9   ShoppingMall  8590 non-null   float64 
 10  Spa           8579 non-null   float64 
 11  VRDeck        8586 non-null   float64 
 12  Name          8493 non-null   object  
 13  Transported   8693 non-null   bool    
 14  Group         8693 non-null   object  
 15  Family        8693 non-null   object  
 16  Deck          8693 non-null   object  
 17  Num           8693 non-null   float64 
 18  shipSide

In [87]:
train_data = make_frame_ready(train_data)

TypeError: Cannot setitem on a Categorical with a new category (28.808355826780893), set the categories first

In [None]:
test_data = make_frame_ready(test_data)#[["HomePlanet", "CryoSleep", "Destination", "Age", "VIP", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Deck", "shipSide", "NumBin", "AgeBin", "totalSpent"]]

In [None]:
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [None]:
train_data = train_data[["HomePlanet", "CryoSleep", "Destination", "Age", "VIP", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Deck", "shipSide", "NumBin", "AgeBin", "totalSpent", "Transported"]]

In [None]:
train_data.select_dtypes(np.number).columns.to_list()

['Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'totalSpent']

In [None]:
ct_X = ColumnTransformer(
    [
        ("num_impute", SimpleImputer(strategy="mean"), ['Age',
                                                        'RoomService',
                                                        'FoodCourt',
                                                        'ShoppingMall',
                                                        'Spa',
                                                        'VRDeck',
                                                        'totalSpent']),
        ("num_scale", RobustScaler(), ['Age',
                                       'RoomService',
                                       'FoodCourt',
                                       'ShoppingMall',
                                       'Spa',
                                       'VRDeck',
                                       'totalSpent']),

        ("cat_impute", SimpleImputer(strategy="most_frequent"),
         ['HomePlanet',
          'CryoSleep',
          'Destination',
          'Deck',
          'shipSide',
          'NumBin',
          'AgeBin']),
        ("cat_encode", OneHotEncoder(), ['HomePlanet',
                                         'CryoSleep',
                                         'Destination',
                                         'Deck',
                                         'shipSide',
                                         'NumBin',
                                         'AgeBin'
                                         ])
    ]
)


In [None]:
X = train_data.drop(columns=["Transported"])
y = train_data["Transported"]

In [None]:
X = ct_X.fit_transform(X)

TypeError: Encoders require their input to be uniformly strings or numbers. Got ['function', 'str']

In [None]:
from sklearn.svm import SVC

s = SVC().fit(X, y)

ValueError: could not convert string to float: 'Europa'

In [None]:
missing_in_train = sum([True for idx,row in train_data.iterrows() if any(row.isnull())])
missing_in_test = sum([True for idx,row in test_data.iterrows() if any(row.isnull())])

f"There are {missing_in_train} missing rows in train and {missing_in_test} missing rows in test"

'There are 873 missing rows in train and 441 missing rows in test'