In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score

In [None]:
# Locations of the raw competition files (Colab-style absolute paths).
TRAIN_CSV = "/content/train.csv"
TEST_CSV = "/content/test.csv"

# Read both splits; the test file has no 'Transported' column.
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Peek at the first rows of the training data.
train_df.head()


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [None]:
# Per-column NaN counts for each split.
missing_value_train = train_df.isna().sum()
missing_value_test = test_df.isna().sum()

# Print each report under its banner (the second banner starts with a blank line).
for banner, counts in (
    ("Missing value in the train data", missing_value_train),
    ("\nMissing value in the test data", missing_value_test),
):
    print(banner)
    print(counts)

Missing value in the train data
PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

Missing value in the test data
PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64


In [None]:
# For every training column, print how often each distinct (non-null) value occurs.
for column, column_values in train_df.items():
    value_counts = column_values.value_counts()
    print(f"Unique values in {column}: {value_counts}\n")

Unique values in PassengerId: PassengerId
0001_01    1
6136_01    1
6141_01    1
6139_06    1
6139_05    1
          ..
3126_01    1
3124_03    1
3124_02    1
3124_01    1
9280_02    1
Name: count, Length: 8693, dtype: int64

Unique values in HomePlanet: HomePlanet
Earth     4602
Europa    2131
Mars      1759
Name: count, dtype: int64

Unique values in CryoSleep: CryoSleep
False    5439
True     3037
Name: count, dtype: int64

Unique values in Cabin: Cabin
G/734/S     8
G/109/P     7
B/201/P     7
G/1368/P    7
G/981/S     7
           ..
G/556/P     1
E/231/S     1
G/545/S     1
G/543/S     1
F/947/P     1
Name: count, Length: 6560, dtype: int64

Unique values in Destination: Destination
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: count, dtype: int64

Unique values in Age: Age
24.0    324
18.0    320
21.0    311
19.0    293
23.0    292
       ... 
72.0      4
78.0      3
79.0      3
76.0      2
77.0      2
Name: count, Length: 80, dtype: int64

Unique value

In [None]:
# Mean-impute the numeric columns of the training data, one column at a time.
# Each column's NaNs are replaced by that same column's mean.
for numeric_name in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    column_mean = train_df[numeric_name].mean()
    train_df[numeric_name] = train_df[numeric_name].fillna(column_mean)

In [None]:
# Print dtypes and non-null counts of the training frame to check the imputation result.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8693 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8693 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [None]:
# Fill missing numeric values in the test data.
# The fill value is the mean of the matching *training* column, not the test column,
# so no statistic is computed from the test set.
for numeric_name in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    train_mean = train_df[numeric_name].mean()
    test_df[numeric_name] = test_df[numeric_name].fillna(train_mean)

In [None]:
# Print dtypes and non-null counts of the test frame to check the imputation result.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4277 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4277 non-null   float64
 8   FoodCourt     4277 non-null   float64
 9   ShoppingMall  4277 non-null   float64
 10  Spa           4277 non-null   float64
 11  VRDeck        4277 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [None]:
# pd.concat stacks the training rows on top of the test rows (row-wise, axis=0) so that
# both splits go through the same encoding steps. test_df has no 'Transported' column,
# so that column is NaN for every test row in the combined frame.
frames = [train_df, test_df]
data_df = pd.concat(frames, axis=0)
data_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.00000,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.00000,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.00000,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.00000,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.00000,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.00000,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter,
4273,9269_01,Earth,False,,TRAPPIST-1e,42.00000,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron,
4274,9271_01,Mars,True,D/296/P,55 Cancri e,28.82793,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,
4275,9273_01,Europa,False,D/297/P,,28.82793,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,


In [None]:
# Use PassengerId as the row index so it is kept as an identifier but not treated as a feature.
# NOTE: this rebinds data_df; re-running the cell raises KeyError because the
# 'PassengerId' column no longer exists after the first run.
data_df = data_df.set_index('PassengerId')
data_df

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.00000,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.00000,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.00000,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.00000,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.00000,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.00000,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter,
9269_01,Earth,False,,TRAPPIST-1e,42.00000,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron,
9271_01,Mars,True,D/296/P,55 Cancri e,28.82793,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,
9273_01,Europa,False,D/297/P,,28.82793,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,


In [None]:
# 'Cabin' and 'Name' are not used as features.
data_new = data_df.drop(columns=['Cabin', 'Name'])

# Column groups: categoricals get one-hot encoded here, numerics are scaled later.
cat_col = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Transported']
num_col = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# pd.get_dummies works like a one-hot encoder; dtype=int yields 0/1 instead of True/False.
# Missing categorical values get 0 in every dummy column of that feature.
encoded = pd.get_dummies(data_new, columns=cat_col, dtype=int)

# The three boolean features need only one indicator each: keep the *_True dummy under
# the original name (appended at the end, in this order) and discard both raw dummies.
data_new = encoded.assign(
    CryoSleep=encoded['CryoSleep_True'],
    VIP=encoded['VIP_True'],
    Transported=encoded['Transported_True'],
).drop(
    columns=[
        'CryoSleep_True', 'CryoSleep_False',
        'VIP_True', 'VIP_False',
        'Transported_True', 'Transported_False',
    ]
)
data_new

Unnamed: 0_level_0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,CryoSleep,VIP,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0001_01,39.00000,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,1,0,0,0
0002_01,24.00000,109.0,9.0,25.0,549.0,44.0,1,0,0,0,0,1,0,0,1
0003_01,58.00000,43.0,3576.0,0.0,6715.0,49.0,0,1,0,0,0,1,0,1,0
0003_02,33.00000,0.0,1283.0,371.0,3329.0,193.0,0,1,0,0,0,1,0,0,0
0004_01,16.00000,303.0,70.0,151.0,565.0,2.0,1,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9266_02,34.00000,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,1,1,0,0
9269_01,42.00000,0.0,847.0,17.0,10.0,144.0,1,0,0,0,0,1,0,0,0
9271_01,28.82793,0.0,0.0,0.0,0.0,0.0,0,0,1,1,0,0,1,0,0
9273_01,28.82793,0.0,2680.0,0.0,0.0,523.0,0,1,0,0,0,0,0,0,0


In [None]:
from sklearn.preprocessing import StandardScaler # Import StandardScaler

In [None]:
# Standardise the numeric features (zero mean / unit variance).
# The scaler is fitted on the TRAINING rows only and then applied to every row, so no
# test-set statistics leak into the features. This matches the imputation cells, which
# also use training-set statistics only. The combined frame was built as
# [train_df, test_df], so the first len(train_df) rows are exactly the training rows.
data_std = data_new.copy()
scaler = StandardScaler()
n_train_rows = len(train_df)
scaler.fit(data_std[num_col].iloc[:n_train_rows])
data_std[num_col] = scaler.transform(data_std[num_col])
data_std[num_col].describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,12970.0,12970.0,12970.0,12970.0,12970.0,12970.0
mean,4.7113860000000004e-17,8.327101000000001e-17,1.2052380000000001e-17,1.3148050000000001e-17,-2.1913420000000002e-17,-3.081575e-17
std,1.000039,1.000039,1.000039,1.000039,1.000039,1.000039
min,-2.02113,-0.3478064,-0.2885931,-0.2996915,-0.2760219,-0.262674
25%,-0.6162569,-0.3478064,-0.2885931,-0.2996915,-0.2760219,-0.262674
50%,-0.1245514,-0.3478064,-0.2885931,-0.2996915,-0.2760219,-0.262674
75%,0.5778851,-0.224556,-0.208162,-0.2191469,-0.1981899,-0.2078699
max,3.528118,22.00418,18.7423,39.9589,19.77063,20.40276


In [None]:
# Undo the earlier concatenation by position: the first len(train_df) rows are the
# training rows, everything after them belongs to the test set.
n_train_rows = len(train_df)
train = data_std.iloc[:n_train_rows]
test = data_std.iloc[n_train_rows:]
train

Unnamed: 0_level_0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,CryoSleep,VIP,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0001_01,0.718372,-0.347806,-0.288593,-0.299692,-0.276022,-0.262674,0,1,0,0,0,1,0,0,0
0002_01,-0.335282,-0.177752,-0.282848,-0.256849,0.215125,-0.224996,1,0,0,0,0,1,0,0,1
0003_01,2.053002,-0.280721,1.994118,-0.299692,5.731354,-0.220715,0,1,0,0,0,1,0,1,0
0003_02,0.296911,-0.347806,0.530400,0.336097,2.702170,-0.097405,0,1,0,0,0,1,0,0,0
0004_01,-0.897231,0.124913,-0.243909,-0.040921,0.229439,-0.260961,1,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,0.858860,-0.347806,4.064261,-0.299692,1.193840,-0.199307,0,1,0,1,0,0,0,1,0
9278_01,-0.756744,-0.347806,-0.288593,-0.299692,-0.276022,-0.262674,1,0,0,0,1,0,1,0,0
9279_01,-0.194795,-0.347806,-0.288593,2.908383,-0.275127,-0.262674,1,0,0,0,0,1,0,0,1
9280_01,0.226667,-0.347806,0.381028,-0.299692,0.039779,2.507503,0,1,0,1,0,0,0,0,0


In [None]:
# Display the processed test rows. The 'Transported' column shown here is only the
# 0 placeholder produced by the encoding step; the test set has no real labels.
test

Unnamed: 0_level_0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,CryoSleep,VIP,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0013_01,-0.124551,-0.347806,-0.288593,-0.299692,-0.276022,-0.262674,1,0,0,0,0,1,1,0,0
0018_01,-0.686501,-0.347806,-0.282848,-0.299692,2.249492,-0.262674,1,0,0,0,0,1,0,0,0
0019_01,0.156423,-0.347806,-0.288593,-0.299692,-0.276022,-0.262674,0,1,0,1,0,0,1,0,0
0021_01,0.648129,-0.347806,3.957658,-0.299692,-0.114096,0.238270,0,1,0,0,0,1,0,0,0
0023_01,-0.616257,-0.332205,-0.288593,0.788518,-0.276022,-0.262674,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9266_02,0.367154,-0.347806,-0.288593,-0.299692,-0.276022,-0.262674,1,0,0,0,0,1,1,0,0
9269_01,0.929103,-0.347806,0.252083,-0.270558,-0.267076,-0.139365,1,0,0,0,0,1,0,0,0
9271_01,0.003849,-0.347806,-0.288593,-0.299692,-0.276022,-0.262674,0,0,1,1,0,0,1,0,0
9273_01,0.003849,-0.347806,1.422164,-0.299692,-0.276022,0.185178,0,1,0,0,0,0,0,0,0


In [None]:
# Separate the label column from the feature matrix.
target_col = 'Transported'
X = train.drop(columns=[target_col])
y = train[target_col]

In [None]:
# Hold out 20% of the labelled rows for evaluation (shuffled, fixed seed).
# Note: x_test / y_test are this held-out split of the training data, not the
# competition test set stored in `test`.
split_options = dict(test_size=0.2, random_state=42, shuffle=True)
x_train, x_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Define the candidate models.
# Every estimator with a stochastic component is seeded (same seed as the train/validation
# split) so that a fresh "Restart & Run All" reproduces the same scores.
# KNeighborsClassifier takes no random_state.
RANDOM_STATE = 42
models = {
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE),
    'K-Neighbors Classifier': KNeighborsClassifier()
}

In [None]:
# Hyperparameter search space per model, keyed by the same names used in `models`.
random_forest_grid = {'n_estimators': [50, 100, 200]}
decision_tree_grid = {'max_depth': [3, 5, 7]}
logistic_regression_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
}
k_neighbors_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
}

param_grids = {
    'Random Forest': random_forest_grid,
    'Decision Tree': decision_tree_grid,
    'Logistic Regression': logistic_regression_grid,
    'K-Neighbors Classifier': k_neighbors_grid,
}

In [None]:
# Perform hyperparameter tuning and training for each model.
# GridSearchCV tunes a clone of the estimator it is given, so the objects in `models`
# would otherwise keep their default hyperparameters. The best estimator found (refit
# on all of x_train) is therefore stored back into `models`, so that the later cells
# reading `models[...]` really use the tuned models.
# Iterating over a list snapshot keeps the loop safe while entries are replaced.
for model_name, model in list(models.items()):
    print(f"Training {model_name}...")
    grid_search = GridSearchCV(model, param_grids[model_name], cv=5, scoring='accuracy')
    grid_search.fit(x_train, y_train)
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Score: {grid_search.best_score_:.4f}")
    print()
    models[model_name] = grid_search.best_estimator_

Training Random Forest...
Best Parameters: {'n_estimators': 200}
Best Score: 0.7918

Training Decision Tree...
Best Parameters: {'max_depth': 7}
Best Score: 0.7847

Training Logistic Regression...




Best Parameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best Score: 0.7899

Training K-Neighbors Classifier...
Best Parameters: {'n_neighbors': 7, 'weights': 'uniform'}
Best Score: 0.7716



In [None]:
# Base learners for the stacking classifier: (name, estimator) pairs taken from
# whatever estimator objects are currently stored in `models`.
base_learner_names = (
    'Random Forest',
    'Decision Tree',
    'Logistic Regression',
    'K-Neighbors Classifier',
)
estimators = [(learner_name, models[learner_name]) for learner_name in base_learner_names]

In [None]:
# Stacking ensemble: the base learners' predictions are combined by a random-forest
# meta-learner. The meta-learner is seeded (same seed as the train/validation split)
# so repeated runs build the same final estimator.
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=RandomForestClassifier(random_state=42),
)

In [None]:
# Fit the stacking ensemble on the training split.
stacking_clf.fit(x_train, y_train)

In [None]:
# Fit the stand-alone Random Forest stored in `models` on the training split;
# later cells call predict / predict_proba on this instance.
models['Random Forest'].fit(x_train, y_train)

In [None]:
# Stacking-ensemble predictions for the held-out split (x_test).
y_pred = stacking_clf.predict(x_test)

In [None]:
# Stand-alone Random Forest predictions for the same held-out split (x_test).
y_pred_random = models['Random Forest'].predict(x_test)

In [None]:
# Evaluate the stacking classifier on the held-out split.
# `y_pred` holds the stacking predictions; `y_pred_random` holds the stand-alone
# Random Forest predictions. Each score is computed from the predictions of the model
# named in its label, and the Random Forest score is reported as a baseline.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of stacking classifier: {accuracy:.4f}")

accuracy_random = accuracy_score(y_test, y_pred_random)
print(f"Accuracy of random forest: {accuracy_random:.4f}")

Accuracy of stacking classifier: 0.7665


In [None]:
# Show the held-out labels. No cell in this notebook defines `y_val`, so the cell
# failed on a fresh kernel; the held-out labels produced by train_test_split are `y_test`.
y_test

PassengerId
8616_02    1
7200_01    1
7806_01    0
3018_01    0
3868_05    1
          ..
0369_01    1
7631_01    0
7022_02    0
1992_01    1
5061_01    1
Name: Transported, Length: 1739, dtype: int64

In [None]:
# Class probabilities on the held-out split. No cell defines `X_val`; the held-out
# features produced by train_test_split are `x_test`.
# NOTE(review): despite the `stacking_` / `st_` names, these probabilities come from the
# stand-alone Random Forest, not from `stacking_clf` — confirm which model was intended.
stacking_probs = models['Random Forest'].predict_proba(x_test)  # Get probabilities for each class
st_probs = stacking_probs[:, 1]  # second column of predict_proba (treated as the positive class)

In [None]:
# Area under the ROC curve on the held-out split. No cell defines `y_val`; the held-out
# labels produced by train_test_split are `y_test`.
st_auc = roc_auc_score(y_test, st_probs)
print("Predict AUROC= ", st_auc)

Predict AUROC=  0.9624380876705072


In [None]:
# Predict for the competition test set. `x_test` is the held-out slice of the training
# data, so the real test features are taken from `test`. The placeholder 'Transported'
# column is removed if it is still present (errors='ignore' keeps this working both
# before and after the cell that drops it).
y_test_pred = models['Random Forest'].predict(
    test.drop(columns=['Transported'], errors='ignore')
)

In [None]:
# Split the placeholder 'Transported' column off the test features.
# CAUTION: the test set has no real labels; the 0s in this column come from one-hot
# encoding missing values, so `y_test` is NOT ground truth after this cell. It also
# rebinds the `y_test` name that previously held the held-out labels.
# The guard makes the cell safe to re-run: without it the second run raises KeyError
# because the column is already gone.
if 'Transported' in test.columns:
    y_test = test['Transported']
    test = test.drop(['Transported'], axis=1)

In [None]:
# Final predictions for the competition test set, converted to Python bools for the
# submission file.
# Two defects are fixed here: the model used to be run on `x_test` (the held-out slice of
# the training data), and the result was then overwritten with bool() of `y_test`, i.e.
# the placeholder labels, which made every submitted value False.
test_features = test.drop(columns=['Transported'], errors='ignore')
y_test_pred = [bool(label) for label in models['Random Forest'].predict(test_features)]

In [None]:
# Build the submission table. 'PassengerId' is still the index of `test` (set via
# set_index and preserved by the row slicing), so the real ids are read from there
# instead of being fabricated from an integer range. The constructor raises if the
# number of predictions does not match the number of test rows.
submission_df = pd.DataFrame({
    'PassengerId': test.index,
    'Transported': y_test_pred
})

In [None]:
# Write the submission table to Submission3.csv in the working directory, without the row index.
submission_df.to_csv("Submission3.csv", index=False)