In [73]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [74]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [75]:
train.head()


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [76]:
train.info()

<class 'pandas.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   str    
 1   HomePlanet    8492 non-null   str    
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   str    
 4   Destination   8511 non-null   str    
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   str    
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(2), str(5)
memory usage: 891.5+ KB


In [77]:
train.drop(columns=['PassengerId','Name'],axis=0,inplace=True)

In [78]:
fill_cols = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

for col in fill_cols:
    train[col] = train[col].fillna(train[col].median())

In [79]:
train.info()

<class 'pandas.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   str    
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   str    
 3   Destination   8511 non-null   str    
 4   Age           8693 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8693 non-null   float64
 7   FoodCourt     8693 non-null   float64
 8   ShoppingMall  8693 non-null   float64
 9   Spa           8693 non-null   float64
 10  VRDeck        8693 non-null   float64
 11  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(2), str(3)
memory usage: 755.7+ KB


In [80]:
le_cols = ['CryoSleep','VIP']

le = LabelEncoder()
for col in le_cols:
    train[col] = le.fit_transform(train[col])

In [81]:
train.info()

<class 'pandas.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   str    
 1   CryoSleep     8693 non-null   int64  
 2   Cabin         8494 non-null   str    
 3   Destination   8511 non-null   str    
 4   Age           8693 non-null   float64
 5   VIP           8693 non-null   int64  
 6   RoomService   8693 non-null   float64
 7   FoodCourt     8693 non-null   float64
 8   ShoppingMall  8693 non-null   float64
 9   Spa           8693 non-null   float64
 10  VRDeck        8693 non-null   float64
 11  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), int64(2), str(3)
memory usage: 755.7 KB


In [82]:
train.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,0,B/0/P,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,False
1,Earth,0,F/0/S,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,True
2,Europa,0,A/0/S,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,0,A/0/S,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,0,F/1/S,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,True


In [83]:
from sklearn.preprocessing import OneHotEncoder

cat_cols = ["HomePlanet", "Destination"]

for col in cat_cols:
    train[col] = train[col].fillna("Unknown")

ohe = OneHotEncoder(drop="first", sparse_output=False)
encoded = ohe.fit_transform(train[cat_cols])

encoded_df = pd.DataFrame(
    encoded,
    columns=ohe.get_feature_names_out(cat_cols)
)

train = pd.concat([train.drop(columns=cat_cols), encoded_df], axis=1)



In [84]:
train.info()

<class 'pandas.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  8693 non-null   int64  
 1   Cabin                      8494 non-null   str    
 2   Age                        8693 non-null   float64
 3   VIP                        8693 non-null   int64  
 4   RoomService                8693 non-null   float64
 5   FoodCourt                  8693 non-null   float64
 6   ShoppingMall               8693 non-null   float64
 7   Spa                        8693 non-null   float64
 8   VRDeck                     8693 non-null   float64
 9   Transported                8693 non-null   bool   
 10  HomePlanet_Europa          8693 non-null   float64
 11  HomePlanet_Mars            8693 non-null   float64
 12  HomePlanet_Unknown         8693 non-null   float64
 13  Destination_PSO J318.5-22  8693 non-null   float64
 14  Des

In [85]:
train.head()

Unnamed: 0,CryoSleep,Cabin,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_Unknown,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_Unknown
0,0,B/0/P,39.0,0,0.0,0.0,0.0,0.0,0.0,False,1.0,0.0,0.0,0.0,1.0,0.0
1,0,F/0/S,24.0,0,109.0,9.0,25.0,549.0,44.0,True,0.0,0.0,0.0,0.0,1.0,0.0
2,0,A/0/S,58.0,1,43.0,3576.0,0.0,6715.0,49.0,False,1.0,0.0,0.0,0.0,1.0,0.0
3,0,A/0/S,33.0,0,0.0,1283.0,371.0,3329.0,193.0,False,1.0,0.0,0.0,0.0,1.0,0.0
4,0,F/1/S,16.0,0,303.0,70.0,151.0,565.0,2.0,True,0.0,0.0,0.0,0.0,1.0,0.0


In [86]:
train[["Deck", "CabinNum", "Side"]] = train["Cabin"].str.split("/", expand=True)
train = train.drop(columns=["Cabin"])


In [87]:
train.info()

<class 'pandas.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  8693 non-null   int64  
 1   Age                        8693 non-null   float64
 2   VIP                        8693 non-null   int64  
 3   RoomService                8693 non-null   float64
 4   FoodCourt                  8693 non-null   float64
 5   ShoppingMall               8693 non-null   float64
 6   Spa                        8693 non-null   float64
 7   VRDeck                     8693 non-null   float64
 8   Transported                8693 non-null   bool   
 9   HomePlanet_Europa          8693 non-null   float64
 10  HomePlanet_Mars            8693 non-null   float64
 11  HomePlanet_Unknown         8693 non-null   float64
 12  Destination_PSO J318.5-22  8693 non-null   float64
 13  Destination_TRAPPIST-1e    8693 non-null   float64
 14  Des

In [88]:
train.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_Unknown,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_Unknown,Deck,CabinNum,Side
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,False,1.0,0.0,0.0,0.0,1.0,0.0,B,0,P
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,True,0.0,0.0,0.0,0.0,1.0,0.0,F,0,S
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,False,1.0,0.0,0.0,0.0,1.0,0.0,A,0,S
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,False,1.0,0.0,0.0,0.0,1.0,0.0,A,0,S
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,True,0.0,0.0,0.0,0.0,1.0,0.0,F,1,S


In [89]:
train["CabinNum"] = pd.to_numeric(train["CabinNum"], errors="coerce")
train["CabinNum"] = train["CabinNum"].fillna(train["CabinNum"].median())


In [90]:
from sklearn.preprocessing import LabelEncoder

le_side = LabelEncoder()
train["Side"] = le_side.fit_transform(train["Side"])


In [None]:
ohe_deck = OneHotEncoder(drop="first", sparse_output=False)
train["Deck"] = train["Deck"].fillna("Unknown")

deck_encoded = ohe_deck.fit_transform(train[["Deck"]])

deck_df = pd.DataFrame(
    deck_encoded,
    columns=ohe_deck.get_feature_names_out(["Deck"])
)

train = pd.concat([train.drop(columns=["Deck"]), deck_df], axis=1)

In [92]:
train.info()

<class 'pandas.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  8693 non-null   int64  
 1   Age                        8693 non-null   float64
 2   VIP                        8693 non-null   int64  
 3   RoomService                8693 non-null   float64
 4   FoodCourt                  8693 non-null   float64
 5   ShoppingMall               8693 non-null   float64
 6   Spa                        8693 non-null   float64
 7   VRDeck                     8693 non-null   float64
 8   Transported                8693 non-null   bool   
 9   HomePlanet_Europa          8693 non-null   float64
 10  HomePlanet_Mars            8693 non-null   float64
 11  HomePlanet_Unknown         8693 non-null   float64
 12  Destination_PSO J318.5-22  8693 non-null   float64
 13  Destination_TRAPPIST-1e    8693 non-null   float64
 14  Des

In [93]:
train.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_Unknown,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_Unknown,CabinNum,Side,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unknown
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,False,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,True,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,False,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,False,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,True,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [94]:
pd.set_option("display.max_columns", None)
train.info()


<class 'pandas.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  8693 non-null   int64  
 1   Age                        8693 non-null   float64
 2   VIP                        8693 non-null   int64  
 3   RoomService                8693 non-null   float64
 4   FoodCourt                  8693 non-null   float64
 5   ShoppingMall               8693 non-null   float64
 6   Spa                        8693 non-null   float64
 7   VRDeck                     8693 non-null   float64
 8   Transported                8693 non-null   bool   
 9   HomePlanet_Europa          8693 non-null   float64
 10  HomePlanet_Mars            8693 non-null   float64
 11  HomePlanet_Unknown         8693 non-null   float64
 12  Destination_PSO J318.5-22  8693 non-null   float64
 13  Destination_TRAPPIST-1e    8693 non-null   float64
 14  Des

In [95]:
with pd.option_context("display.max_columns", None):
    display(train)


Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_Unknown,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_Unknown,CabinNum,Side,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unknown
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,False,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,True,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,False,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,False,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,True,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0,41.0,1,0.0,6819.0,0.0,1643.0,74.0,False,1.0,0.0,0.0,0.0,0.0,0.0,98.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8689,1,18.0,0,0.0,0.0,0.0,0.0,0.0,False,0.0,0.0,0.0,1.0,0.0,0.0,1499.0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8690,0,26.0,0,0.0,0.0,1872.0,1.0,0.0,True,0.0,0.0,0.0,0.0,1.0,0.0,1500.0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8691,0,32.0,0,0.0,1049.0,0.0,353.0,3235.0,False,1.0,0.0,0.0,0.0,0.0,0.0,608.0,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [97]:
le_transported = LabelEncoder()
train['Transported'] = le_transported.fit_transform(train['Transported'])
X = train.drop(columns=['Transported'])
y = train['Transported']

X_train, X_test, y_train, y_test = train_test_split(
    X,y,
    test_size=0.2,
    random_state=42
)


In [99]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)


0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [102]:
from sklearn.metrics import accuracy_score

pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test,pred)


In [103]:
print(accuracy)

0.7751581368602645


In [104]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42
)

rf.fit(X_train, y_train)
preds = rf.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, preds))


0.79700977573318


In [105]:
train["TotalSpend"] = (
    train["RoomService"] +
    train["FoodCourt"] +
    train["ShoppingMall"] +
    train["Spa"] +
    train["VRDeck"]
)
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=12,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, preds))

0.7935595169637722


In [106]:
train.info()

<class 'pandas.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  8693 non-null   int64  
 1   Age                        8693 non-null   float64
 2   VIP                        8693 non-null   int64  
 3   RoomService                8693 non-null   float64
 4   FoodCourt                  8693 non-null   float64
 5   ShoppingMall               8693 non-null   float64
 6   Spa                        8693 non-null   float64
 7   VRDeck                     8693 non-null   float64
 8   Transported                8693 non-null   int64  
 9   HomePlanet_Europa          8693 non-null   float64
 10  HomePlanet_Mars            8693 non-null   float64
 11  HomePlanet_Unknown         8693 non-null   float64
 12  Destination_PSO J318.5-22  8693 non-null   float64
 13  Destination_TRAPPIST-1e    8693 non-null   float64
 14  Des

In [107]:
from sklearn.model_selection import cross_val_score

rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

scores = cross_val_score(rf, X, y, cv=5, scoring="accuracy")
print(scores)
print("Mean:", scores.mean())


[0.7573318  0.76308223 0.80333525 0.81530495 0.78941312]
Mean: 0.7856934695878945


In [108]:
rf.fit(X_train, y_train)

importance = pd.Series(
    rf.feature_importances_,
    index=X_train.columns
).sort_values(ascending=False)

print(importance.head(10))


CabinNum             0.171181
Age                  0.130994
Spa                  0.107796
VRDeck               0.095406
RoomService          0.092478
CryoSleep            0.087272
FoodCourt            0.085913
ShoppingMall         0.075708
Side                 0.022980
HomePlanet_Europa    0.020703
dtype: float64


In [109]:
train["CabinNum_bin"] = pd.qcut(train["CabinNum"], q=5, labels=False)
train["LuxurySpend"] = train["Spa"] + train["VRDeck"]
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

scores = cross_val_score(gb, X, y, cv=5)
print(scores.mean())


0.7940914153141462
