# EDA

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the competition's training split; the trailing expression previews
# its first rows.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/spaceship-titanic/train.csv")
df.head(n=5)

In [None]:
# Discard the identifier, name and cabin columns; the rest of the notebook
# works on the remaining columns only.
df = df.drop(columns=['PassengerId', 'Name', 'Cabin'])
df.head(n=5)

In [None]:
# Print each column's dtype and non-null count for the trimmed frame.
df.info()

In [None]:
# Count the missing values in every column.
df.isna().sum()

# Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

In [None]:
# Replace the categorical / boolean columns with integer codes. One
# LabelEncoder object is re-fit for each column in turn, so once this cell
# has run `encoder` only remembers the mapping of the last column.
# NOTE(review): these columns are not cleaned of NaN beforehand; how
# LabelEncoder treats NaN (own code vs. error) depends on the scikit-learn
# version - confirm.
encoder = LabelEncoder()
for label_col in ('HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Transported'):
    df[label_col] = encoder.fit_transform(df[label_col])

In [None]:
# Min-max scale the continuous columns (MinMaxScaler's default range is
# [0, 1]). `scalar` keeps the fitted per-column minima and maxima.
numeric_cols = ['Age', 'FoodCourt', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck']
scalar = MinMaxScaler()
scaled_values = scalar.fit_transform(df[numeric_cols])
df[numeric_cols] = scaled_values

In [None]:
# Fill the remaining NaNs with a 5-nearest-neighbour imputer.
#
# Only the feature columns are given to the imputer. Fitting it on the whole
# frame would let the `Transported` label take part in the neighbour
# distances, so the imputed feature values would depend on the very label the
# models are asked to predict (and which is unavailable for the test file).
columns = df.columns
feature_columns = columns.drop('Transported')

imputer = KNNImputer(n_neighbors=5)
imputed = pd.DataFrame(
    imputer.fit_transform(df[feature_columns]),
    columns=feature_columns,
    index=df.index,
)

# Re-attach the label as float so the frame stays all-float, exactly as a
# full-frame imputation would have returned it, then restore column order
# (label last).
imputed['Transported'] = df['Transported'].astype(float)
df = imputed[columns]

In [None]:
# Re-inspect dtypes and non-null counts after encoding, scaling and imputation.
df.info()

# Models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from tpot import TPOTClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Separate the features from the `Transported` label and hold out 20% of the
# rows for evaluation. The label is selected by name rather than by position,
# and the split is seeded so the train/test scores printed below are the same
# on every Restart & Run All.
X = df.drop(columns='Transported')
y = df['Transported']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=42
)

In [None]:
# Logistic regression with default settings; accuracy is reported as a
# percentage (3 decimals) on the training and held-out rows.
logreg = LogisticRegression().fit(X_train, y_train)  # fit() returns the estimator
acc_log = round(100 * logreg.score(X_train, y_train), 3)
log = round(100 * logreg.score(X_test, y_test), 3)
print("Train: ", acc_log)
print("Test: ", log)

In [None]:
# Linear-kernel support vector classifier.
# The fitted model gets its own name: assigning it to `SVC` would replace the
# imported class with an instance, and running this cell a second time would
# then fail because an instance is not callable.
svc_model = SVC(kernel='linear', C=1)
svc_model.fit(X_train, y_train)
acc_svc = round(svc_model.score(X_train, y_train) * 100, 3)  # train accuracy, %
svc = round(svc_model.score(X_test, y_test) * 100, 3)        # held-out accuracy, %
print("Train: ", acc_svc)
print("Test: ", svc)

In [None]:
# Random forest with 100 trees.
# The fitted model is kept under its own name so the imported
# `RandomForestClassifier` class is not overwritten (which would break a
# re-run of this cell), and the forest is seeded so its scores are repeatable.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
acc_random_forest = round(rf_model.score(X_train, y_train) * 100, 3)  # train accuracy, %
random_forest = round(rf_model.score(X_test, y_test) * 100, 3)        # held-out accuracy, %
print("Train: ", acc_random_forest)
print("Test: ", random_forest)

In [None]:
# Single decision tree with default depth settings.
# The fitted model is kept under its own name so the imported
# `DecisionTreeClassifier` class is not overwritten (which would break a
# re-run of this cell), and the tree is seeded so its scores are repeatable.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
acc_decision_tree = round(dt_model.score(X_train, y_train) * 100, 3)  # train accuracy, %
decision_tree = round(dt_model.score(X_test, y_test) * 100, 3)        # held-out accuracy, %
print("Train: ",acc_decision_tree)
print("Test: ", decision_tree)

In [None]:
# k-nearest-neighbours classifier with k = 3.
# The fitted model is kept under its own name so the imported
# `KNeighborsClassifier` class is not overwritten; otherwise running this
# cell twice would try to call an instance and fail.
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)
acc_knn = round(knn_model.score(X_train, y_train) * 100, 3)  # train accuracy, %
knn = round(knn_model.score(X_test, y_test) * 100, 3)        # held-out accuracy, %
print("Train: ", acc_knn)
print("Test: ", knn)

In [None]:
# TPOT AutoML search with a 10-minute budget on the same train/test split.
# Disabled: every statement below is commented out, so this cell does nothing.
#tpot = TPOTClassifier(verbosity=2, max_time_mins=10)
#tpot.fit(X_train, y_train)
#print("Train: ", tpot.score(X_train, y_train))
#print("Test: ", tpot.score(X_test, y_test))

In [None]:
# Gradient-boosted trees with explicit hyper-parameters and a fixed seed.
# NOTE(review): the fitted estimator is stored under the class's own name,
# which hides the imported class (the cell cannot be run twice). The
# prediction cell later in the notebook calls `.predict` on this name, so the
# binding is kept as is here.
gbc_params = dict(
    learning_rate=0.01,
    max_depth=8,
    max_features=0.5,
    min_samples_leaf=17,
    min_samples_split=6,
    n_estimators=100,
    subsample=0.6,
    random_state=42,
)
GradientBoostingClassifier = GradientBoostingClassifier(**gbc_params)
GradientBoostingClassifier.fit(X_train, y_train)

# Accuracy as a percentage, rounded to 3 decimals.
acc_GBC = round(100 * GradientBoostingClassifier.score(X_train, y_train), 3)
GBC = round(100 * GradientBoostingClassifier.score(X_test, y_test), 3)
print("Train: ", acc_GBC)
print("Test: ", GBC)

In [None]:
# Read the competition's test split (the rows to predict for the submission).
test = pd.read_csv(filepath_or_buffer="/kaggle/input/spaceship-titanic/test.csv")

In [None]:
# Preview the first rows of the raw test frame.
test.head()

In [None]:
# Drop the same three columns that were removed from the training frame.
test = test.drop(columns=['PassengerId', 'Name', 'Cabin'])

In [None]:
# Encode the test columns with the label -> integer mappings of the TRAINING
# data. Re-fitting an encoder on the test columns would only give the same
# codes if both files happened to contain exactly the same set of values.
#
# The shared `encoder` object cannot be reused for this: it was re-fit for
# every training column in turn and only remembers the last one. The
# mappings are therefore re-learned from the raw training columns, which are
# encoded the same way as in the training preprocessing.
# `transform` raises ValueError if the test file contains a value that never
# occurs in training, instead of silently assigning it a shifted code.
train_categories = pd.read_csv(
    "/kaggle/input/spaceship-titanic/train.csv",
    usecols=['HomePlanet', 'CryoSleep', 'Destination', 'VIP'],
)
for label_col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']:
    column_encoder = LabelEncoder().fit(train_categories[label_col])
    test[label_col] = column_encoder.transform(test[label_col])

In [None]:
# Scale the test features with the minima/maxima learned from the training
# data. `scalar` was fit on the training frame; calling `fit_transform` here
# would re-fit it on the test rows and put them on a different scale from the
# one the models were trained on. Values may fall slightly outside [0, 1]
# when the test range exceeds the training range - that is expected.
numeric_cols = ['Age', 'FoodCourt', 'RoomService', 'ShoppingMall', 'Spa', 'VRDeck']
test[numeric_cols] = scalar.transform(test[numeric_cols])

In [None]:
# Fill the test frame's remaining NaNs with a k-nearest-neighbour imputer and
# rebuild a DataFrame with the original column labels.
# NOTE(review): this imputer is fit on the test rows themselves and uses
# n_neighbors=2, whereas the training frame was imputed with n_neighbors=5 -
# confirm the difference is intentional.
from sklearn.impute import KNNImputer

columns = test.columns
imputer = KNNImputer(n_neighbors=2)
test = pd.DataFrame(imputer.fit_transform(test), columns=columns)

In [None]:
# Predict with the fitted gradient-boosting model (the training cell bound
# the fitted instance to the class's name) and cast the numeric class labels
# to booleans.
predictions = GradientBoostingClassifier.predict(test).astype(bool)

In [None]:
# Re-read the raw test file to recover PassengerId (it was dropped from `test`
# before preprocessing), pair it with the predictions and write the
# submission CSV without the index column.
test_df = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
output = pd.DataFrame(
    {
        'PassengerId': test_df['PassengerId'],
        'Transported': predictions,
    }
)
output.to_csv('/kaggle/working/submission.csv', index=False)