<a href="https://colab.research.google.com/github/RodyRuan/Nave_Espacial_Titanic/blob/main/Nave_Espacial_Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [10]:
# Read the training and test tables from the CSV files in the working directory
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Set the test passenger identifiers aside now; they are needed again
# when the submission table is assembled
test_passenger_ids = test_data['PassengerId']


In [11]:
# Fill missing values and encode categorical features
def _most_frequent(column):
    """Return the most common non-missing value of ``column``.

    ``Series.mode`` returns its results sorted, so ties resolve to the
    smallest value.
    """
    modes = column.mode(dropna=True)
    if modes.empty:
        raise ValueError(f"column {column.name!r} has no non-missing values to impute from")
    return modes.iloc[0]


def _label_encode(column):
    """Encode ``column`` as integers ranked by the sorted observed values.

    Missing entries receive the code of the most frequent value, which is
    equivalent to imputing the mode first and encoding afterwards.
    """
    observed = sorted(column.dropna().unique())
    code_of = {value: code for code, value in enumerate(observed)}
    fill_code = code_of[_most_frequent(column)]
    # Values absent from the mapping (i.e. missing ones) come back as NaN from map()
    return column.map(code_of).fillna(fill_code).astype('int64')


def preprocess_data(data, is_train=True):
    """Impute, encode and reshape a raw passenger table into numeric features.

    The input frame is never modified; all work happens on a copy.

    Steps:
      * numeric spend/age columns: missing values replaced by the column median;
      * HomePlanet, CryoSleep, Destination, VIP: missing values replaced by the
        most frequent value, then integer-encoded in sorted order of the values;
      * Cabin: missing values replaced by the most frequent cabin, then split on
        '/' into Deck (encoded), Num (parsed as a number) and Side (encoded);
      * Cabin and Name are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns Age, RoomService, FoodCourt, ShoppingMall, Spa,
        VRDeck, HomePlanet, CryoSleep, Destination, VIP, Cabin and Name
        (plus PassengerId when ``is_train`` is False).
    is_train : bool, default True
        When False, the PassengerId column is removed from the result as well.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original column order preserved, Deck/Num/Side
        appended, and the dropped columns removed.

    Raises
    ------
    ValueError
        If a categorical column (or Cabin) has no non-missing value to impute
        from, or if a cabin number cannot be parsed as a number.

    Notes
    -----
    Fill values and integer codes are derived from the frame passed in. Two
    frames encoded by separate calls only share codes when they contain the
    same set of observed categories.
    """
    numeric_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    categorical_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

    # Work on a copy so the caller's DataFrame is left exactly as it was passed in
    features = data.copy()

    # Median imputation for the numeric columns
    features[numeric_cols] = features[numeric_cols].fillna(features[numeric_cols].median())

    # Mode imputation + integer encoding for the categorical columns
    for col in categorical_cols:
        features[col] = _label_encode(features[col])

    # Split Cabin ("deck/num/side") into its three components
    cabin = features['Cabin'].fillna(_most_frequent(features['Cabin']))
    cabin_parts = cabin.str.split('/', expand=True)
    features['Deck'] = _label_encode(cabin_parts[0])
    # Parse the cabin number explicitly instead of leaving it as text
    features['Num'] = pd.to_numeric(cabin_parts[1])
    features['Side'] = _label_encode(cabin_parts[2])

    # Cabin has been decomposed and Name is not used as a feature
    unwanted = ['Cabin', 'Name'] if is_train else ['Cabin', 'Name', 'PassengerId']
    return features.drop(columns=unwanted)

# Build the model-ready tables. The test features come back without PassengerId;
# the training frame keeps it, and `train_data` is rebound to the processed frame.
X_test = preprocess_data(test_data, is_train=False)
train_data = preprocess_data(train_data, is_train=True)


In [12]:
# Pull the target out of the processed training frame, then keep everything
# except the target and the identifier as the feature matrix
y = train_data['Transported']
X = train_data.drop(columns=['Transported', 'PassengerId'])

# Learn the scaling statistics from the training features and apply the same
# transformation to both the training and the test features.
# Note: X and X_test are rebound to the scaled arrays here.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)


In [13]:
# Hold out 20% of the rows for validation; the seed is fixed so the split is repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [14]:
# Fit a 100-tree random forest on the training split (seeded for repeatability)
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)


In [15]:
# Score the fitted model on the held-out validation split
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f'Validation Accuracy: {accuracy}')


Validation Accuracy: 0.7947096032202415


In [16]:
# Predict the target for every row of the scaled test features
test_predictions = model.predict(X_test)

# Pair each saved passenger id with its prediction, make sure the prediction
# column is boolean, and write the result without the index column
submission = pd.DataFrame(
    {'PassengerId': test_passenger_ids, 'Transported': test_predictions}
).astype({'Transported': bool})
submission.to_csv('submission.csv', index=False)
