In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the train and test CSV files from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/spaceship-titanic/train.csv",
        "/kaggle/input/spaceship-titanic/test.csv",
    )
)

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
test.head(n=5)

In [None]:
# Print the training frame's column summary (pandas `DataFrame.info`).
train.info()

In [None]:
# Print the test frame's column summary (pandas `DataFrame.info`).
test.info()

In [None]:
# Display pandas' default descriptive statistics for the training frame.
train.describe()

In [None]:
# Display pandas' default descriptive statistics for the test frame.
test.describe()

In [None]:
# Count the missing values in each column of the training frame.
train.isna().sum()

In [None]:
# Count the missing values in each column of the test frame.
test.isna().sum()

In [None]:
# Remove the 'Name' column from both frames, in place.
for frame in (train, test):
    frame.drop(columns='Name', inplace=True)

In [None]:
# Impute the categorical columns with their most frequent value.
#
# * The fill value is learned on the training frame only and reused for the
#   test frame, so both frames are imputed with the same category.
# * The result is assigned back to the column instead of calling
#   `frame[col].fillna(..., inplace=True)`: that chained in-place form acts on
#   a temporary Series, is deprecated in pandas 2.x and leaves the frame
#   unchanged once Copy-on-Write is active.
columns=['HomePlanet','CryoSleep','VIP','Destination']
for col in columns:
    most_frequent = train[col].mode()[0]
    train[col] = train[col].fillna(most_frequent)
    test[col] = test[col].fillna(most_frequent)

In [None]:
# Impute missing ages with the median age of the training frame.
# The same training median is applied to the test frame so both frames share
# one fill value, and the result is assigned back to the column because the
# chained `frame['Age'].fillna(..., inplace=True)` form is deprecated and does
# not modify the frame under pandas Copy-on-Write.
age_median = train['Age'].median()
train['Age'] = train['Age'].fillna(age_median)
test['Age'] = test['Age'].fillna(age_median)

In [None]:
# Spending columns; the list `a` is reused by the scaling cell further down.
a=['RoomService','Spa','FoodCourt','ShoppingMall','VRDeck']
# Impute each spending column with its training-frame mean. The mean is
# computed once on train and applied to both frames, and the filled Series is
# assigned back because chained `fillna(..., inplace=True)` is deprecated and
# has no effect on the frame under pandas Copy-on-Write.
for i in a:
    train_mean = train[i].mean()
    train[i] = train[i].fillna(train_mean)
    test[i] = test[i].fillna(train_mean)

In [None]:
# One-hot encode 'HomePlanet' and 'Destination' as int64 indicator columns.
# Listing both columns in a single call appends the HomePlanet indicators
# first and the Destination indicators second.
one_hot_columns = ['HomePlanet', 'Destination']
train = pd.get_dummies(train, columns=one_hot_columns, dtype='int64')
test = pd.get_dummies(test, columns=one_hot_columns, dtype='int64')

In [None]:
# Integer-encode the two flag columns and the target.
b=['CryoSleep','VIP']
le=LabelEncoder()  # `le` is reused later for the cabin deck code
for i in b:
    # Learn the label -> integer mapping on the training column and apply that
    # same mapping to the test column. Refitting on the test column could
    # assign different integers to the same label if the two columns do not
    # contain the same set of values.
    train[i]=le.fit_transform(train[i])
    test[i]=le.transform(test[i])
# The target only exists in the training frame.
train['Transported']=le.fit_transform(train['Transported'])

In [None]:
# Store the encoded 'CryoSleep' flag as int64 in both frames.
for frame in (train, test):
    frame['CryoSleep'] = frame['CryoSleep'].astype('int64')

In [None]:
# Break the slash-separated 'Cabin' string into three new columns.
cabin_parts = ['cabin_code', 'cabin_number', 'cabin_location']
for frame in (train, test):
    frame[cabin_parts] = frame['Cabin'].str.split('/', expand=True)

In [None]:
# Fill the cabin parts derived from 'Cabin'.
# Results are assigned back to the column: the chained
# `frame[col].fillna(..., inplace=True)` form is deprecated and does not modify
# the frame under pandas Copy-on-Write.

# Missing deck codes get the placeholder category 'U'.
train['cabin_code'] = train['cabin_code'].fillna('U')
test['cabin_code'] = test['cabin_code'].fillna('U')

# The cabin number comes out of the string split as text; make it numeric.
train['cabin_number'] = train['cabin_number'].astype(float)
test['cabin_number'] = test['cabin_number'].astype(float)

# Use the training median for both frames so they share one fill value.
cabin_number_median = train['cabin_number'].median()
train['cabin_number'] = train['cabin_number'].fillna(cabin_number_median)
test['cabin_number'] = test['cabin_number'].fillna(cabin_number_median)

In [None]:
# The raw 'Cabin' string and the 'cabin_location' part are not kept as
# features; remove both from each frame in place.
for frame in (train, test):
    frame.drop(columns=['Cabin', 'cabin_location'], inplace=True)

In [None]:
# Integer-encode the cabin deck code with ONE shared mapping.
# Fitting the encoder separately on each frame assigns integers by sorted
# position within that frame, so the same deck letter would get different codes
# whenever train and test do not contain exactly the same set of letters.
# Fitting on the union of both columns guarantees a consistent mapping and
# avoids unseen-label errors at transform time.
le.fit(pd.concat([train['cabin_code'], test['cabin_code']]))
train['cabin_code']=le.transform(train['cabin_code'])
test['cabin_code']=le.transform(test['cabin_code'])

In [None]:
# Standardise 'Age' and the spending columns listed in `a`.
# The scaler learns each column's mean and standard deviation from the
# training frame only; the test frame is then transformed with those same
# statistics. Refitting on the test frame would put test features on a
# different scale from the one the model is trained on.
scaler=StandardScaler()
numeric_columns = ['Age'] + a
train[numeric_columns] = scaler.fit_transform(train[numeric_columns])
test[numeric_columns] = scaler.transform(test[numeric_columns])

In [None]:
# Draw one titled box plot per column of the training frame.
for column_name in train.columns:
    sns.boxplot(train[column_name])
    plt.title(column_name)
    plt.show()

In [None]:
# Draw one titled box plot per column of the test frame.
for column_name in test.columns:
    sns.boxplot(test[column_name])
    plt.title(column_name)
    plt.show()

In [None]:
# Apply a standardised Yeo-Johnson power transform to the listed columns.
# The transformer estimates one lambda (plus mean/std) per column from the
# training frame; the test frame is transformed with those fitted parameters.
# Refitting on the test frame would map train and test through different
# functions, so identical raw values would end up as different features.
pt=PowerTransformer(method='yeo-johnson', standardize=True)
c=['RoomService','Spa','FoodCourt','ShoppingMall','VRDeck','Age','VIP','HomePlanet_Earth','HomePlanet_Europa', 'HomePlanet_Mars', 'Destination_55 Cancri e',
    'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e']
# NOTE(review): the list includes 0/1 indicator columns; a power transform on
# binary features is unusual -- confirm this is intended.
train[c] = pt.fit_transform(train[c])
test[c] = pt.transform(test[c])

In [None]:
# Pull the passenger ids out of both frames (pop returns the column and
# removes it from the frame in place); `sample1` is needed for the submission.
sample = train.pop('PassengerId')
sample1 = test.pop('PassengerId')

# Separate the target from the feature matrix.
y = train['Transported']
x = train.drop(columns='Transported')

In [None]:
# Hold out 20% of the labelled rows for validation, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tune a random forest with an exhaustive grid search (3-fold CV, scored by
# negative log loss) on the training split.
# A fixed random_state makes the forest -- and therefore the selected
# hyperparameters and downstream predictions -- reproducible across runs.
rf_classifier = RandomForestClassifier(random_state=42)
param_grid = {'n_estimators': [50, 100, 200],'max_depth': [None, 10, 20],'min_samples_split': [2, 5, 10],'min_samples_leaf': [1, 2, 4]}
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3, scoring='neg_log_loss', n_jobs=-1)
grid_search.fit(x_train, y_train)
print("Best Hyperparameters:", grid_search.best_params_)
# Best parameter combination, refit on the whole training split by GridSearchCV.
best_rf_model = grid_search.best_estimator_

In [None]:
# Score the tuned model on the held-out validation split.
y_pred = best_rf_model.predict(x_test)
for label, value in (
    ('Accuracy Score:', accuracy_score(y_test, y_pred)),
    ('Confusion Matrix:', confusion_matrix(y_test, y_pred)),
    ('Classification Report:', classification_report(y_test, y_pred)),
):
    print(label, value)

In [None]:
# Class probabilities and hard class predictions for the test frame.
rf2 = best_rf_model.predict_proba(test)
y_pred_test = best_rf_model.predict(test)

In [None]:
# Tune an XGBoost classifier with an exhaustive grid search (3-fold CV, scored
# by negative log loss) on the training split.
# The search must receive `xg_classifier`; passing the random forest here would
# merely repeat the earlier forest search. The grid uses XGBoost's own
# parameter names (`learning_rate`, `min_child_weight`) because
# `min_samples_split` / `min_samples_leaf` are RandomForest-only parameters.
xg_classifier = XGBClassifier(random_state=42)
param_grid = {'n_estimators': [50, 100, 200],'max_depth': [3, 6, 10],'learning_rate': [0.05, 0.1, 0.3],'min_child_weight': [1, 2, 4]}
grid_search = GridSearchCV(estimator=xg_classifier, param_grid=param_grid, cv=3, scoring='neg_log_loss', n_jobs=-1)
grid_search.fit(x_train, y_train)
print("Best Hyperparameters:", grid_search.best_params_)
best_xg_model = grid_search.best_estimator_
# The evaluation and prediction cells after this one read the tuned model
# through the name `best_rf_model`, so keep that name bound to this estimator.
best_rf_model = best_xg_model

In [None]:
# Score the most recently tuned model on the held-out validation split.
y_pred_xg = best_rf_model.predict(x_test)
for label, value in (
    ('Accuracy Score:', accuracy_score(y_test, y_pred_xg)),
    ('Confusion Matrix:', confusion_matrix(y_test, y_pred_xg)),
    ('Classification Report:', classification_report(y_test, y_pred_xg)),
):
    print(label, value)

In [None]:
# Class probabilities and hard class predictions for the test frame from the
# most recently tuned model.
rf_xg = best_rf_model.predict_proba(test)
y_pred_test_xg = best_rf_model.predict(test)

In [None]:
# Convert the 0/1 predictions into the strings written to the submission file.
label_names = {1: 'True', 0: 'False'}
y_pred_test = pd.Series(y_pred_test).map(label_names).to_list()

In [None]:
# Pair each test passenger id with its predicted label and write the CSV
# without the index column.
submission = sample1.to_frame(name='PassengerId').assign(Transported=y_pred_test)
submission.to_csv('submission.csv', index=False)