In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the datasets
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Data Cleaning and Feature Engineering
def preprocess_data(df):
    # Handle missing values
    exp_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    df["Expenditure"] = df[exp_features].sum(axis=1)
    df['CryoSleep'] = np.where(df['Expenditure'] == 0, 1, df['CryoSleep'])
    for col in exp_features:
        df[col].fillna(0, inplace=True)
    df['Age'].fillna(df['Age'].median(), inplace=True)
    df['HomePlanet'].fillna('Earth', inplace=True)
    df['Destination'].fillna('TRAPPIST-1e', inplace=True)
    df['CryoSleep'].fillna(0, inplace=True)
    df['VIP'].fillna(0, inplace=True)

    # Feature Engineering
    df['CabinDeck'] = df['Cabin'].str.split('/').str[0]
    df['CabinSide'] = df['Cabin'].str.split('/').str[2]
    df.drop(['Name', 'Cabin', 'PassengerId'], axis=1, inplace=True, errors='ignore')

    # Encoding categorical variables
    categorical_cols = ['HomePlanet', 'Destination', 'CabinDeck', 'CabinSide']
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    return df

# Preprocess train and test data
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Split features and target
X = train_data.drop('Transported', axis=1)
y = train_data['Transported']

# Scaling features
scaler = StandardScaler()
X = scaler.fit_transform(X)
test_data = scaler.transform(test_data)

# Model Training and Hyperparameter Tuning
rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X, y)

# Best Model
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Predict on the test data
predictions = best_model.predict(test_data)

# Save submission
submission = pd.DataFrame({
    'PassengerId': pd.read_csv("test.csv")['PassengerId'],
    'Transported': predictions
})

submission.to_csv("submission.csv", index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always beh

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
