In [31]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from datetime import datetime
from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, classification_report, accuracy_score
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.linear_model as linear_model
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import GridSearchCV, KFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline  # Użyj pipeline’u z imblearn


from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import os, math

import warnings
warnings.filterwarnings('ignore')

In [32]:
train = pd.read_csv('oryginalne pliki/train.csv')
test = pd.read_csv('oryginalne pliki/test.csv')
print ("Data is loaded!")
print("Train set size:", train.shape)
print("Test set size:", test.shape)

Data is loaded!
Train set size: (8693, 14)
Test set size: (4277, 13)


In [33]:
y = train.Transported.reset_index(drop=True)
train_features = train.drop(['Transported'], axis=1)
test_features = test

Merging training and testing dataset, to transform in uniformly

In [34]:
features = pd.concat([train_features, test_features]).reset_index(drop=True)

Splitting PassengerID column

In [35]:
features[['Group_no', 'Passenger_no']] = features['PassengerId'].str.split('_', expand=True)

Splitting Cabin column

In [36]:
features[['Deck', 'Cabin_no', 'Side']] = features['Cabin'].str.split('/', expand=True)

In [37]:
features.drop(['Cabin'], axis=1, inplace=True)

Splitting Name column

In [38]:
features['Surname'] = features['Name'].str.split(' ', expand=True)[1]

In [39]:
print(features.isna().sum())

PassengerId       0
HomePlanet      288
CryoSleep       310
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Name            294
Group_no          0
Passenger_no      0
Deck            299
Cabin_no        299
Side            299
Surname         294
dtype: int64


In [40]:
features

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Group_no,Passenger_no,Deck,Cabin_no,Side,Surname
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0001,01,B,0,P,Ofracculy
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,0002,01,F,0,S,Vines
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0003,01,A,0,S,Susent
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0003,02,A,0,S,Susent
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,0004,01,F,1,S,Santantines
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,9266_02,Earth,True,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter,9266,02,G,1496,S,Peter
12966,9269_01,Earth,False,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron,9269,01,,,,Scheron
12967,9271_01,Mars,True,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,9271,01,D,296,P,Pore
12968,9273_01,Europa,False,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,9273,01,D,297,P,Conale


In [41]:
categorical_columns = features.select_dtypes(include=['object']).columns.tolist()
len(categorical_columns)

12

Converts categorical values into numbers, preserving information about missing values (LabelEncoder does not handle NaN).

In [42]:
def label_encode_with_nan(series):
    # Zamień wartości na stringi, w tym NaN → 'MISSING_TEMP_VALUE'
    series_str = series.astype(str).fillna('MISSING_TEMP_VALUE')
    
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(series_str)
    
    # Zakodowane wartości do Series z oryginalnym indexem
    result = pd.Series(encoded, index=series.index)
    
    # Przywróć NaN tam, gdzie były
    result[series.isna()] = np.nan
    
    return result, encoder


for col in categorical_columns:
    features[col], _ = label_encode_with_nan(features[col])

In [43]:
columns_numerical = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Age']


In [46]:
print(features.dtypes)

PassengerId       int64
HomePlanet      float64
CryoSleep       float64
Destination     float64
Age             float64
VIP             float64
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name            float64
Group_no          int64
Passenger_no      int64
Deck            float64
Cabin_no        float64
Side            float64
Surname         float64
dtype: object


Splitting train and test datasets

In [50]:
X = features.iloc[:len(y), :]
X_sub = features.iloc[len(X):, :]
print('X', X.shape, 'y', y.shape, 'X_sub', X_sub.shape)

X (8693, 18) y (8693,) X_sub (4277, 18)


ML part

In [52]:
pipeline = ImbPipeline([
    ('scaler', QuantileTransformer(output_distribution='normal')),  # Skalowanie cech do rozkładu normalnego
    ('xgb', xgb.XGBClassifier(
        objective='binary:logistic',  # Cel: klasyfikacja binarna
        random_state=42,
        
     
    ))
])


param_grid = {
    'xgb__n_estimators': [50, 100, 200],      # Liczba drzew
    'xgb__max_depth': [3, 4, 5],              # Maksymalna głębokość drzewa
    'xgb__learning_rate': [0.01, 0.1, 0.3],   # Szybkość uczenia
    'xgb__subsample': [0.7, 0.9, 1.0],         # Procent próbek używanych do treningu drzewa
    'xgb__colsample_bytree': [0.6, 0.8, 1.0]
}


grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',                       # Metryka do oceny
    cv=KFold(n_splits=10, shuffle=True, random_state=42),  # 10-krotna walidacja krzyżowa
    n_jobs=-1,                                # Użyj wszystkich dostępnych rdzeni
    verbose=1                                 # Wyświetlaj postęp
)

grid_search.fit(X, y)

#  Display the results of GridSearchCV
print("\nNajlepsze parametry: ", grid_search.best_params_)
print("Najlepsza średnia dokładność (z walidacji krzyżowej): {:.4f}".format(grid_search.best_score_))

#  Use the best model to make predictions on the entire dataset
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X)

#  Display report
print("\nRaport klasyfikacji dla całego zestawu danych:")
print(classification_report(y, y_pred))

#  List the feature importance from the best model
feature_importance = best_model.named_steps['xgb'].feature_importances_
print("\nWażność cech:")
for i, importance in enumerate(feature_importance):
    print(f"Cecha {i+1}: {importance:.4f}")

Fitting 10 folds for each of 243 candidates, totalling 2430 fits

Najlepsze parametry:  {'xgb__colsample_bytree': 1.0, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 3, 'xgb__n_estimators': 200, 'xgb__subsample': 0.7}
Najlepsza średnia dokładność (z walidacji krzyżowej): 0.8127

Raport klasyfikacji dla całego zestawu danych:
              precision    recall  f1-score   support

       False       0.85      0.82      0.83      4315
        True       0.83      0.85      0.84      4378

    accuracy                           0.84      8693
   macro avg       0.84      0.84      0.84      8693
weighted avg       0.84      0.84      0.84      8693


Ważność cech:
Cecha 1: 0.0239
Cecha 2: 0.0838
Cecha 3: 0.4820
Cecha 4: 0.0219
Cecha 5: 0.0223
Cecha 6: 0.0101
Cecha 7: 0.0598
Cecha 8: 0.0323
Cecha 9: 0.0225
Cecha 10: 0.0526
Cecha 11: 0.0508
Cecha 12: 0.0118
Cecha 13: 0.0228
Cecha 14: 0.0112
Cecha 15: 0.0285
Cecha 16: 0.0202
Cecha 17: 0.0323
Cecha 18: 0.0113


In [54]:
prediction = best_model.predict(X_sub).astype(bool)

In [55]:
prediction

array([False, False,  True, ...,  True,  True,  True])

In [56]:
submit = pd.read_csv('oryginalne pliki/sample_submission.csv')

In [57]:
submit["Transported"] = prediction

In [58]:
submit.to_csv("new_submission.csv", index=False)

The result is pretty good. I got 80% accuracy. The second result in the table is 82%. I can still tweak it a bit...