In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer

In [29]:
df_train=pd.read_csv("train.csv")
df_test=pd.read_csv("test.csv")
df_test['Transported']= False
df=pd.concat([df_train,df_test],sort=False)
df.head() 

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [30]:
df.shape[0] == df_train.shape[0] + df_test.shape[0]

True

In [31]:
df.isna().sum()


PassengerId       0
HomePlanet      288
CryoSleep       310
Cabin           299
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Name            294
Transported       0
dtype: int64

In [32]:
df[['Deck', 'Num', 'Side']] = df['Cabin'].str.split('/', expand = True)
df = df.drop(columns = ['Cabin'])
df['Deck'] = df['Deck'].fillna('U')
df['Num'] = df['Num'].fillna(-1)
df['Side'] = df['Side'].fillna('U')


In [33]:
df['Destination'].value_counts()


Destination
TRAPPIST-1e      8871
55 Cancri e      2641
PSO J318.5-22    1184
Name: count, dtype: int64

In [34]:
df['Deck'] = df['Deck'].map({'G' : 0, 'F' : 1, 'E' : 2, 'D' : 3, 'C' : 4, 'B' : 5, 'A' : 6, 'U' : 7, 'T' : 8})
df['Side'] = df['Side'].map({'U' : -1, 'P' : 1, 'S' : 2})

In [35]:
impute_lis = ['Age', 'VIP', 'Num', 'CryoSleep', 'Side', 'Deck', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
rest = list(set(df.columns) - set(impute_lis))
df_rest = df[rest]
imp = KNNImputer()
df_imputed = imp.fit_transform(df[impute_lis])
df_imputed = pd.DataFrame(df_imputed, columns = impute_lis)
df = pd.concat([df_rest.reset_index(drop = True), df_imputed.reset_index(drop = True)], axis = 1)

In [36]:
df['HomePlanet'] = df['HomePlanet'].fillna('U')
df['Destination'] = df['Destination'].fillna('U')
category_colls = ['HomePlanet', 'Destination']

for col in category_colls:
    df = pd.concat([df, pd.get_dummies(df[col], prefix = col)], axis = 1)

In [37]:
df = df.drop(columns = category_colls)

In [38]:
df.head()

Unnamed: 0,PassengerId,Name,Transported,Age,VIP,Num,CryoSleep,Side,Deck,RoomService,...,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_U,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_U
0,0001_01,Maham Ofracculy,False,39.0,0.0,0.0,0.0,1.0,5.0,0.0,...,0.0,0.0,False,True,False,False,False,False,True,False
1,0002_01,Juanna Vines,True,24.0,0.0,0.0,0.0,2.0,1.0,109.0,...,549.0,44.0,True,False,False,False,False,False,True,False
2,0003_01,Altark Susent,False,58.0,1.0,0.0,0.0,2.0,6.0,43.0,...,6715.0,49.0,False,True,False,False,False,False,True,False
3,0003_02,Solam Susent,False,33.0,0.0,0.0,0.0,2.0,6.0,0.0,...,3329.0,193.0,False,True,False,False,False,False,True,False
4,0004_01,Willy Santantines,True,16.0,0.0,1.0,0.0,2.0,1.0,303.0,...,565.0,2.0,True,False,False,False,False,False,True,False


In [39]:
#feature engineering
bill_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df['amt_spent'] = df[bill_cols].sum(axis = 1)
df['std_amt_spent'] = df[bill_cols].std(axis = 1)
df['mean_amt_spent'] = df[bill_cols].mean(axis = 1)

df['3_high_cols'] = df['CryoSleep'] + df['HomePlanet_Europa'] + df['Destination_55 Cancri e']
df['3_low_cols'] = df['mean_amt_spent'] + df['amt_spent'] + df['HomePlanet_Earth']

In [40]:
# Drop non-numeric columns before calculating correlation
numeric_df = df.select_dtypes(include=[np.number])


In [41]:
# Convert Transported to numeric (True/False to 1/0)
df['Transported'] = df['Transported'].astype(int)

# Calculate correlations
numeric_df = df.select_dtypes(include=[np.number])
correlation_with_transportation = numeric_df.corr()['Transported'].sort_values(ascending=False)
print(correlation_with_transportation)

Transported       1.000000
CryoSleep         0.324373
3_high_cols       0.284177
Deck              0.077959
Side              0.059872
FoodCourt         0.034746
ShoppingMall      0.004154
VIP              -0.018720
Num              -0.035240
Age              -0.050520
std_amt_spent    -0.121173
mean_amt_spent   -0.140452
amt_spent        -0.140452
3_low_cols       -0.140476
VRDeck           -0.142783
Spa              -0.154832
RoomService      -0.174781
Name: Transported, dtype: float64


In [42]:
df_train, df_test = df[:df_train.shape[0]], df[df_train.shape[0]:]
df_test = df_test.drop(columns = 'Transported')
df_train.shape, df_test.shape

((8693, 27), (4277, 26))

In [43]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import optuna

In [44]:
X = df_train.drop(columns = 'Transported')
y = df_train['Transported']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

model_1 = LogisticRegression()
model_2 = DecisionTreeClassifier()
model_3 = RandomForestClassifier()
model_4 = CatBoostClassifier()
model_5 = LGBMClassifier()

In [45]:
# Drop non-numeric columns (like Name and PassengerId)
X = df_train.drop(columns=['Transported', 'Name', 'PassengerId'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate the model
model_1.fit(X_train, y_train)
pred = model_1.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, pred)}")

Accuracy: 0.7705577918343876


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [46]:
model_2.fit(X_train, y_train)
pred = model_2.predict(X_test)
accuracy_score(y_test, pred)

0.7498562392179413

In [47]:
model_3.fit(X_train, y_train)
pred = model_3.predict(X_test)
accuracy_score(y_test, pred)

0.7912593444508338

In [48]:
model_4.fit(X_train, y_train)  
pred = model_4.predict(X_test)
accuracy_score(y_test, pred)

Learning rate set to 0.023581
0:	learn: 0.6825987	total: 8.14ms	remaining: 8.13s
1:	learn: 0.6709345	total: 11.6ms	remaining: 5.76s
2:	learn: 0.6604858	total: 14.6ms	remaining: 4.84s
3:	learn: 0.6508216	total: 17.7ms	remaining: 4.4s
4:	learn: 0.6421986	total: 20.8ms	remaining: 4.13s
5:	learn: 0.6341608	total: 23.6ms	remaining: 3.91s
6:	learn: 0.6261888	total: 26.5ms	remaining: 3.75s
7:	learn: 0.6181032	total: 29.6ms	remaining: 3.67s
8:	learn: 0.6101802	total: 32.3ms	remaining: 3.56s
9:	learn: 0.6028956	total: 35.1ms	remaining: 3.48s
10:	learn: 0.5949287	total: 38ms	remaining: 3.42s
11:	learn: 0.5896681	total: 40.6ms	remaining: 3.34s
12:	learn: 0.5838639	total: 43.8ms	remaining: 3.32s
13:	learn: 0.5785117	total: 46.5ms	remaining: 3.28s
14:	learn: 0.5730968	total: 49.2ms	remaining: 3.23s
15:	learn: 0.5678902	total: 51.9ms	remaining: 3.19s
16:	learn: 0.5630607	total: 54.5ms	remaining: 3.15s
17:	learn: 0.5584957	total: 57.7ms	remaining: 3.15s
18:	learn: 0.5541825	total: 60.8ms	remaining: 3

0.8033352501437608

In [49]:
model_5.fit(X_train, y_train)
pred = model_5.predict(X_test)
accuracy_score(y_test, pred)

[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000612 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2703
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230


0.79700977573318

In [50]:
# Ensure df_test has the same columns as training data
df_test = df_test.drop(columns=['Name', 'PassengerId'])

# Make predictions
pred = model_5.predict(df_test)

# Create submission DataFrame
final = pd.DataFrame()
final['PassengerId'] = df_dummy['PassengerId']
final['Transported'] = pred.astype(bool)  # Convert predictions to boolean

# Save to CSV
final.to_csv('submission.csv', index=False)