In [2]:
import pandas as pd 
from sklearn.impute import KNNImputer

In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test['Transported'] = False
df = pd.concat([df_train, df_test], sort = False)
df.drop(['Name', 'PassengerId'], axis = 1, inplace = True)
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [5]:
df[['Deck', 'Num', 'Side']] = df['Cabin'].str.split('/', expand = True)
df = df.drop(columns = ['Cabin'])
df['Deck'] = df['Deck'].fillna('U')
df['Num'] = df['Num'].fillna(-1)
df['Side'] = df['Side'].fillna('U')

In [6]:
df['Deck'] = df['Deck'].map({'G' : 0, 'F' : 1, 'E' : 2, 'D' : 3, 'C' : 4, 'B' : 5, 'A' : 6, 'U' : 7, 'T' : 8})
df['Side'] = df['Side'].map({'U' : -1, 'P' : 1, 'S' : 2})

In [7]:
impute_lis = ['Age', 'VIP', 'Num', 'CryoSleep', 'Side', 'Deck', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
rest = list(set(df.columns) - set(impute_lis))
df_rest = df[rest]
imp = KNNImputer()
df_imputed = imp.fit_transform(df[impute_lis])
df_imputed = pd.DataFrame(df_imputed, columns = impute_lis)
df = pd.concat([df_rest.reset_index(drop = True), df_imputed.reset_index(drop = True)], axis = 1)

In [8]:
df.isna().sum()

Destination     274
HomePlanet      288
Transported       0
Age               0
VIP               0
Num               0
CryoSleep         0
Side              0
Deck              0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
dtype: int64

In [9]:
df["HomePlanet"] = df["HomePlanet"].fillna("U")
df["Destination"] = df["Destination"].fillna("U")
category_colls = ["HomePlanet", "Destination"]
for col in category_colls:
    df = pd.concat([df, pd.get_dummies(df[col], prefix=col)], axis=1)

# Feature engineering

In [10]:
bill_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
df["amt_spent"] = df[bill_cols].sum(axis=1)
df["std_amt_spent"] = df[bill_cols].std(axis=1)
df["mean_amt_spent"] = df[bill_cols].mean(axis=1)


In [11]:
df = df.drop(columns=["Destination", "HomePlanet"])

In [12]:
df.corr()["Transported"].sort_values(ascending=False)

Transported                  1.000000
CryoSleep                    0.324373
HomePlanet_Europa            0.131977
Destination_55 Cancri e      0.083625
Deck                         0.077959
Side                         0.059872
FoodCourt                    0.034746
HomePlanet_U                 0.006403
HomePlanet_Mars              0.005643
ShoppingMall                 0.004154
Destination_PSO J318.5-22    0.000760
Destination_U               -0.000554
VIP                         -0.018720
Num                         -0.035240
Age                         -0.050520
Destination_TRAPPIST-1e     -0.072731
HomePlanet_Earth            -0.119644
std_amt_spent               -0.121173
mean_amt_spent              -0.140452
amt_spent                   -0.140452
VRDeck                      -0.142783
Spa                         -0.154832
RoomService                 -0.174781
Name: Transported, dtype: float64

In [13]:
df["3_high_cols"] = df["CryoSleep"] + df["HomePlanet_Europa"] + df["Destination_55 Cancri e"]
df["3_low_cols"] = df["mean_amt_spent"] + df["amt_spent"] + df["HomePlanet_Earth"]

In [14]:
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [15]:
df_train, df_test = df[:df_train.shape[0]], df[df_train.shape[0]:]
df_test = df_test.drop(columns="Transported")
df_train.shape, df_test.shape

((8693, 25), (4277, 24))

In [16]:
X = df_train.drop(columns="Transported")
y = df_train["Transported"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model_lgr = LogisticRegression()
model_dtc = DecisionTreeClassifier()
model_rfc = RandomForestClassifier()
model_xgb = XGBClassifier()
model_lgbm = LGBMClassifier()
models = [model_lgr, model_dtc, model_rfc, model_xgb, model_lgbm]

In [17]:
accuracies = []
for model in models:
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    accuracies.append(accuracy_score(y_test, pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001442 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2703
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230


In [18]:
accuracies

[0.7705577918343876,
 0.7429557216791259,
 0.7918343875790684,
 0.7941345600920069,
 0.79700977573318]

In [19]:
df_dummy = pd.read_csv("test.csv")
pred = model_lgbm.predict(df_test)

final = pd.DataFrame()
final["PassengerId"] = df_dummy["PassengerId"]
final["Transported"] = pred

final.to_csv("wilden_submission.csv", index=False)