<a href="https://colab.research.google.com/github/OscarHill/OscarHill/blob/main/Starship_Titanic_Attempt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


# Load the dataset
train_data = pd.read_csv('train.csv')

# Separate features and target variable from training data
X_train = train_data.drop(['Transported'], axis=1)  # Assuming 'Transported' is the target variable
y_train = train_data['Transported']

# Load test or validation data
test_data = pd.read_csv('test.csv')

# Assuming test_data does not contain the 'Transported' column
X_test = test_data.copy()

# List of numerical and categorical features (adjust based on your dataset)
numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
categorical_features = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination']

In [44]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='median')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Bundle preprocessing and modeling code in a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)
                          ])

# Preprocessing of training data and fit model
pipeline.fit(X_train, y_train)



In [45]:
# Preprocessing of test data and make predictions
predictions = pipeline.predict(X_test)

In [46]:
submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Transported': predictions})
submission.to_csv('submission.csv', index=False)

In [50]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Assuming X_train and y_train are already defined and preprocessed

# Define the preprocessing steps for the pipeline
numerical_transformer = SimpleImputer(strategy='median')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model with the pipeline
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', RandomForestClassifier(random_state=42))
                             ])

# Define the grid of parameters to search
param_grid = {
    'model__n_estimators': [100, 200, 300],  # Number of trees
    'model__max_depth': [None, 10, 20, 30],  # Maximum depth of trees
    'model__min_samples_split': [2, 5, 10]  # Minimum number of samples required to split an internal node
}

# Set up the GridSearchCV
grid_search = GridSearchCV(estimator=rf_pipeline, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Best parameters found
print("Best parameters:", grid_search.best_params_)

# Best score achieved
print("Best score:", grid_search.best_score_)

# You can then use the best estimator directly for predictions or further analysis
best_model = grid_search.best_estimator_


Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100; total time=   9.6s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100; total time=   5.9s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100; total time=   6.3s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100; total time=   5.6s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100; total time=   6.6s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=200; total time=  11.9s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=200; total time=  11.5s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=200; total time=  12.0s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=200; total time=  11.7s
[C

In [51]:
# Assuming 'X_test' is your test dataset prepared similarly to 'X_train'
# and 'PassengerId' is a column in your test dataset to identify passengers

# Step 2: Use the best model to make predictions on the test dataset
predictions = best_model.predict(X_test)

# Convert predictions to the format expected in the submission (e.g., True/False for 'Transported')
# This step might not be necessary if your model directly outputs the required format
predictions_formatted = [True if pred == 1 else False for pred in predictions]

# Step 3: Create a DataFrame with 'PassengerId' and your predictions
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],  # Adjust 'test_data' to the variable holding your test dataset
    'Transported': predictions_formatted
})

# Save the submission file
submission.to_csv('submission_better.csv', index=False)