# SpaceShip Titanic - MAE
---
#### Version 2

#### imports

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error


#### read-in csv

In [3]:
train_data = pd.read_csv('../Data/SpaceTrain.csv')
test_data = pd.read_csv('../Data/SpaceTest.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'unit4/Data/SpaceTrain.csv'

#### Modify train and test data:
- Fill missing values in 'Cabin' with a placeholder
- Extract 'Deck' and 'Side' from 'Cabin'
- Fill other missing values with appropriate strategies (e.g., mean, mode)
- Encode categorical columns
- Drop columns not needed for training, this is based on the importance counter shown by V1 of SpaceShip Titanic

In [21]:
def modify(data, is_test_data=False):
   
    if not is_test_data:
        data['Transported'] = data['Transported'].astype(int)
    data['Cabin'].fillna('Unknown/0/Unknown', inplace=True)
    
   

    data['Deck'] = data['Cabin'].apply(lambda x: x.split('/')[0] if isinstance(x, str) else 'Unknown')

    data['Side'] = data['Cabin'].apply(lambda x: x.split('/')[2] if isinstance(x, str) else 'Unknown')
    data['Age'].fillna(data['Age'].median(), inplace=True)
    data['RoomService'].fillna(0, inplace=True)
    data['FoodCourt'].fillna(0, inplace=True)
    data['ShoppingMall'].fillna(0, inplace=True)
    data['Spa'].fillna(0, inplace=True)
    data['VRDeck'].fillna(0, inplace=True)
    
 
    data = pd.get_dummies(data, columns=['HomePlanet', 'Deck', 'Side'], drop_first=True)

    data.drop(['Cabin', 'Name'], axis=1, inplace=True)
    
    return data

#### process data and dropping the unimportant ones

In [22]:
train_data = train_data.drop(['Destination', 'VIP'], axis=1)
test_data = test_data.drop(['Destination', 'VIP'], axis=1)
train_data = modify(train_data)
X = train_data.drop(['Transported', 'PassengerId'], axis=1)
y = train_data['Transported']

# Split the data into training and validation sets
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.4, random_state=1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Cabin'].fillna('Unknown/0/Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we a

#### Using get_mae to find the most suitable leaf nodes to search so the model wouldn't be overcomplicating to too simple

In [23]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = RandomForestClassifier(max_leaf_nodes=max_leaf_nodes, random_state=1)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return mae

#### Test out the leaf nodes in the list to determine the best depth

In [24]:
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500, 700]


mae_values = {max_leaf_nodes: get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
              for max_leaf_nodes in candidate_max_leaf_nodes}
best_tree_size = min(mae_values, key=mae_values.get)

print("MAE values for each max_leaf_nodes:", mae_values)
print("Best max_leaf_nodes:", best_tree_size)

MAE values for each max_leaf_nodes: {5: np.float64(0.26279470960322027), 25: np.float64(0.21822886716503737), 50: np.float64(0.20787809085681427), 100: np.float64(0.199252443933295), 250: np.float64(0.19838987924094306), 500: np.float64(0.19838987924094306), 700: np.float64(0.20270270270270271)}
Best max_leaf_nodes: 250


#### fitting with the best model

In [25]:
final_model = RandomForestClassifier(max_leaf_nodes=best_tree_size, random_state=1)
final_model.fit(X, y)

# Preprocess the test data
test_data_preprocessed = modify(test_data, is_test_data=True)
X_test = test_data_preprocessed.drop(['PassengerId'], axis=1)  # Only drop PassengerId

# Make predictions on the test data
test_predictions = final_model.predict(X_test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Cabin'].fillna('Unknown/0/Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we a

#### output for submission

In [26]:
output = pd.DataFrame({'PassengerId': test_data['PassengerId'],
                       'Transported': test_predictions.astype(bool)})
output

In [27]:
import time
timestr = time.strftime("%Y%m%d-%H%M%S")

output.to_csv(f'../Submission/SpaceShipsubmission{timestr}.csv', index=False)