In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler,StandardScaler, LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix,mean_squared_error,accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV

import warnings
# Suppress all warnings for the remainder of the session.
warnings.filterwarnings("ignore")

# pandas display settings: no truncation of columns or rows, no fixed
# output width, and floats rendered with three decimal places.
for display_option in ('display.max_columns', 'display.max_rows', 'display.width'):
    pd.set_option(display_option, None)
pd.set_option('display.float_format', lambda value: '%.3f' % value)

In [2]:
# Read the two competition tables from the Kaggle input directory.
# train.csv carries the Transported target column; test.csv does not.
train = pd.read_csv(filepath_or_buffer='/kaggle/input/spaceship-titanic/train.csv')
test = pd.read_csv(filepath_or_buffer='/kaggle/input/spaceship-titanic/test.csv')

In [3]:
# Number of missing values in each column of the test table.
test.isnull().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

In [4]:
# Number of missing values in each column of the train table.
train.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [5]:
# Combine train and test into a single frame by stacking them row-wise.
# pd.concat is used rather than an outer pd.merge: a merge without `on=`
# performs a full outer join on every shared column and orders the result by
# those join keys, which can interleave train and test rows and is much slower.
# concat just appends: train rows come first (positions 0..len(train)-1),
# followed by the test rows, on a fresh 0..n-1 index.
# test has no Transported column, so its rows get NaN there.
df = pd.concat([train, test], ignore_index=True)
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


### File and Data Field Descriptions

***train.csv*** - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.

***PassengerId*** - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

***HomePlanet*** - The planet the passenger departed from, typically their planet of permanent residence.

***CryoSleep*** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

***Cabin*** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

***Destination*** - The planet the passenger will be debarking to.

***Age*** - The age of the passenger.

***VIP*** - Whether the passenger has paid for special VIP service during the voyage.

***RoomService, FoodCourt, ShoppingMall, Spa, VRDeck*** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

***Name*** - The first and last names of the passenger.

***Transported*** - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [6]:
# Summary statistics for the numeric columns, transposed so that each
# feature is a row and each statistic a column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,12700.0,28.772,14.387,0.0,19.0,27.0,38.0,79.0
RoomService,12707.0,222.898,647.597,0.0,0.0,0.0,49.0,14327.0
FoodCourt,12681.0,451.962,1584.371,0.0,0.0,0.0,77.0,29813.0
ShoppingMall,12664.0,174.906,590.559,0.0,0.0,0.0,29.0,23492.0
Spa,12686.0,308.477,1130.28,0.0,0.0,0.0,57.0,22408.0
VRDeck,12702.0,306.789,1180.097,0.0,0.0,0.0,42.0,24133.0


In [7]:
# Column dtypes, non-null counts and memory footprint of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12682 non-null  object 
 2   CryoSleep     12660 non-null  object 
 3   Cabin         12671 non-null  object 
 4   Destination   12696 non-null  object 
 5   Age           12700 non-null  float64
 6   VIP           12674 non-null  object 
 7   RoomService   12707 non-null  float64
 8   FoodCourt     12681 non-null  float64
 9   ShoppingMall  12664 non-null  float64
 10  Spa           12686 non-null  float64
 11  VRDeck        12702 non-null  float64
 12  Name          12676 non-null  object 
 13  Transported   8693 non-null   object 
dtypes: float64(6), object(8)
memory usage: 1.4+ MB


In [8]:
# Missing values per column in the combined frame. The Transported gaps
# correspond to the test rows, which carry no target value.
df.isnull().sum()

PassengerId        0
HomePlanet       288
CryoSleep        310
Cabin            299
Destination      274
Age              270
VIP              296
RoomService      263
FoodCourt        289
ShoppingMall     306
Spa              284
VRDeck           268
Name             294
Transported     4277
dtype: int64

In [9]:
# Names of all numeric columns in the combined frame.
numeric = df.select_dtypes(include=['number']).columns

# Selected quantiles (min, tails, quartiles, max) of each numeric column,
# laid out with one feature per row.
quantile_levels = [0, 0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99, 1]
quantile_table = df[numeric].quantile(quantile_levels).transpose()
print(quantile_table)

              0.000  0.010  0.050  0.250  0.500  0.750    0.950    0.990  \
Age           0.000  0.000  4.000 19.000 27.000 38.000   56.000   65.010   
RoomService   0.000  0.000  0.000  0.000  0.000 49.000 1275.800 3009.520   
FoodCourt     0.000  0.000  0.000  0.000  0.000 77.000 2670.000 7797.600   
ShoppingMall  0.000  0.000  0.000  0.000  0.000 29.000  964.250 2396.740   
Spa           0.000  0.000  0.000  0.000  0.000 57.000 1589.000 5435.900   
VRDeck        0.000  0.000  0.000  0.000  0.000 42.000 1538.800 5838.930   

                 1.000  
Age             79.000  
RoomService  14327.000  
FoodCourt    29813.000  
ShoppingMall 23492.000  
Spa          22408.000  
VRDeck       24133.000  


In [10]:
# Independent deep copy of the combined frame: changes made to dff do not
# propagate back to df.
dff = df.copy(deep=True)