https://www.kaggle.com/competitions/spaceship-titanic

In [81]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import HalvingRandomSearchCV
from scipy.stats import randint

In [82]:
train_data = pd.read_csv('train.csv')
final_data = pd.read_csv('test.csv')

In [83]:
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [84]:
train_data.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [85]:
train_data['Group'] = train_data['PassengerId'].apply(lambda x: pd.Series(int(str(x).split("_")[0])))
final_data['Group'] = final_data['PassengerId'].apply(lambda x: pd.Series(int(str(x).split("_")[0])))

In [86]:
train_data[['Deck', 'Num', 'Side']] = train_data['Cabin'].apply(lambda x: pd.Series(str(x).split("/")))
final_data[['Deck', 'Num', 'Side']] = final_data['Cabin'].apply(lambda x: pd.Series(str(x).split("/")))

In [87]:
features = ['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Group', 'Deck', 'Side']

In [88]:
for collumn in features:
    try:
        train_data[collumn].fillna(train_data[collumn].mean(), inplace = True)
        final_data[collumn].fillna(final_data[collumn].mean(), inplace = True)
    except:
        train_data[collumn].fillna(train_data[collumn].mode(), inplace = True)
        final_data[collumn].fillna(final_data[collumn].mode(), inplace = True)


In [89]:
train, test = train_test_split(train_data, test_size=0.3)

In [90]:
y_train = train['Transported']
X_train = pd.get_dummies(train[features])

y_test = test['Transported']
X_test = pd.get_dummies(test[features])

y_data = train_data['Transported']
X_data = pd.get_dummies(train_data[features])

X_final = pd.get_dummies(final_data[features])

In [93]:
param_distributions = {"n_estimators": [100, 150, 200],
                    "criterion": ["gini", "entropy"],
                    "max_depth": [8, 10, 12, 14]}

clf = RandomForestClassifier(random_state=1)

best_clf = HalvingRandomSearchCV(clf, param_distributions)

best_clf.fit(X_train.values, y_train.values)

best_clf.best_params_



{'n_estimators': 150, 'max_depth': 8, 'criterion': 'gini'}

In [94]:
print(f'Accuracy: {best_clf.score(X_test.values,y_test.values)}')

Accuracy: 0.8055981595092024


In [95]:
clf = RandomForestClassifier(random_state=1)

best_clf = HalvingRandomSearchCV(clf, param_distributions)

best_clf.fit(X_data.values, y_data.values)

best_clf.best_params_



{'n_estimators': 150, 'max_depth': 14, 'criterion': 'entropy'}

In [96]:
predictions = best_clf.predict(X_final.values)
output = pd.DataFrame({"PassengerId": final_data.PassengerId, "Transported": predictions})
output.to_csv("submission.csv", index = False)