In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
import category_encoders as ce
# import missingno as msno
import category_encoders as ce
from sklearn.metrics import (roc_auc_score, recall_score, f1_score, precision_score,
                             accuracy_score)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from catboost import Pool, cv
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers as L         # Уже готовые слои для моделей
from tensorflow.keras.models import Sequential   # Специальный класс для склеивания слоёв
from tensorflow.keras.models import Model        # Альтернативный класс для склейки слоёв
import tensorflow.keras.optimizers as opt        # Разные оптимизационные алгоритмы :3
from keras.wrappers.scikit_learn import KerasClassifier




import warnings
warnings.filterwarnings('ignore')

# 1 About dataset

This dataset is taken from Kaggle (https://www.kaggle.com/competitions/spaceship-titanic).

train.csv - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.
  * PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
  * HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
  * CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
  * Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
  * Destination - The planet the passenger will be debarking to.
  * Age - The age of the passenger.
  * VIP - Whether the passenger has paid for special VIP service during the voyage.
  * RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
  * Name - The first and last names of the passenger.
  * Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [14]:

# Read the train and test CSV files from the working directory
data_raw, data_raw_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Show the training frame as the cell output
data_raw

In [15]:
# Count the missing values of every column of the raw training frame
missing_per_column = data_raw.isna().sum()
{column: missing_per_column[column] for column in data_raw.columns}

In [16]:
# Work on copies so the frames loaded from disk stay untouched
data_pre = data_raw.copy()
data_pre_test = data_raw_test.copy()

# Missing ages are replaced with the median age of the respective data set
median_age = data_pre['Age'].median()
median_age_test = data_pre_test['Age'].median()

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated in pandas and
# does not modify the parent frame when Copy-on-Write is active.
data_pre['Age'] = data_pre['Age'].fillna(median_age)
data_pre_test['Age'] = data_pre_test['Age'].fillna(median_age_test)

def grouping_rule(data_pre, data_column):
    """
    Fill the missing values of ``data_column`` in place, preferring group values.

    Every group (column ``grp``) that has a missing value on a row whose
    ``grid`` is greater than 1 gets all of its missing values replaced with
    the most frequent value of that group. When the group has no known value
    at all, the most frequent value of the whole column is used instead.
    Whatever is still missing afterwards is replaced with the most frequent
    value of the whole column.

    Parameters
    ----------
    data_pre : pandas.DataFrame
        Frame holding the columns ``grp``, ``grid`` and ``data_column``.
        It is modified in place.
    data_column : str
        Name of the column whose missing values are filled.

    Returns
    -------
    None

    Notes
    -----
    If the column contains no known value at all there is nothing to impute
    from, so the column is left unchanged instead of raising.
    """
    is_missing = data_pre[data_column].isna()

    # NOTE(review): only rows with grid > 1 select a group, so a group whose
    # sole missing value sits on the row with grid == 1 falls through to the
    # column-wide fill below -- confirm this is intended.
    groups_with_nan = data_pre.loc[is_missing & (data_pre['grid'] > 1), 'grp'].unique()

    for group_id in groups_with_nan:
        in_group = data_pre['grp'] == group_id
        candidates = data_pre.loc[in_group, data_column].mode()
        if candidates.empty:
            # No known value inside the group: fall back to the column-wide
            # mode, recomputed here because earlier fills may have shifted it.
            candidates = data_pre[data_column].mode()
        if candidates.empty:
            continue
        still_missing = in_group & data_pre[data_column].isna()
        data_pre.loc[still_missing, data_column] = candidates.iloc[0]

    # Remaining gaps (rows outside the selected groups) get the column-wide mode
    overall = data_pre[data_column].mode()
    if not overall.empty:
        data_pre[data_column] = data_pre[data_column].replace(np.nan, overall.iloc[0])

def ohe_data(data_pre, column):
    """
    One-hot encode the given columns and return the resulting frame.

    For each name in ``column`` the dummy columns produced by
    ``pd.get_dummies`` are joined to the frame and the source column is
    dropped. The frame passed in is not modified.
    """
    encoded = data_pre
    for source_column in column:
        dummies = pd.get_dummies(encoded[source_column])
        encoded = encoded.join(dummies).drop(source_column, axis=1)
    return encoded

# 'PassengerId' is split on '_' into the group id ('grp') and a second part ('grid'), both cast to int
data_pre[['grp', 'grid']] = data_pre['PassengerId'].str.split('_', expand=True)
for id_column in ('grp', 'grid'):
    data_pre[id_column] = data_pre[id_column].astype(int)

# Fill the categorical columns with the grouping rule, one column after the other
for column_to_fill in ('HomePlanet', 'Destination', 'Cabin', 'VIP', 'CryoSleep'):
    grouping_rule(data_pre, data_column=column_to_fill)

# 'Cabin' is split on '/' into 'deck', 'num' and 'side'; the identifier columns are then dropped
cabin_parts = data_pre['Cabin'].str.split('/', expand=True)
data_pre[['deck', 'num', 'side']] = cabin_parts
data_pre.drop(columns=['PassengerId', 'Name', 'Cabin'], inplace=True)

# Missing values left per column after preprocessing
{column: data_pre[column].isna().sum() for column in data_pre.columns}



In [17]:
# Same preprocessing as for the training frame, applied to the test frame
data_pre_test[['grp', 'grid']] = data_pre_test['PassengerId'].str.split('_', expand=True)
for id_column in ('grp', 'grid'):
    data_pre_test[id_column] = data_pre_test[id_column].astype(int)

# Fill the categorical columns with the grouping rule, one column after the other
for column_to_fill in ('HomePlanet', 'Destination', 'Cabin', 'VIP', 'CryoSleep'):
    grouping_rule(data_pre=data_pre_test, data_column=column_to_fill)

# 'Cabin' is split on '/' into 'deck', 'num' and 'side'; the identifier columns are then dropped
cabin_parts_test = data_pre_test['Cabin'].str.split('/', expand=True)
data_pre_test[['deck', 'num', 'side']] = cabin_parts_test
data_pre_test.drop(columns=['PassengerId', 'Name', 'Cabin'], inplace=True)

# 2 Model Building
Optimization for various metrics
Different types of cross-validation
Different preprocessing techniques for categorical features
Various hyperparameter tuning methods
Feature selection
KNN, linear models, linear models with regularization, ensembles (random forest, 3 types of boosting algorithms from 3 different companies)

In [28]:
# Target column vs. feature columns
y = data_pre['Transported']
X = data_pre.drop(columns=['Transported'])

# 70/30 split into a training part and a holdout part
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=17
)

# Every column that is not listed as categorical is handled as numeric
categorical_features = ['deck', 'side', 'HomePlanet', 'Destination']
numeric_features = [name for name in X_train.columns if name not in categorical_features]

# Numeric columns: median imputation followed by standardisation
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: most-frequent imputation followed by one-hot encoding
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', ce.OneHotEncoder(use_cat_names=True)),
])

# Route each column list to its transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit the preprocessing on the training part only, then apply it to the holdout part
X_train_prep = preprocessor.fit_transform(X_train)
X_holdout_prep = preprocessor.transform(X_holdout)

# Hyperparameter search spaces; every key carries the 'clf__' prefix

# Logistic Regression
param_dict_logistic = dict(clf__C=np.linspace(0.01, 10, 1000))

# Random Forest
param_dict_rndforest = dict(
    clf__max_depth=np.arange(1, 10),
    clf__min_samples_leaf=np.arange(1, 10),
    clf__n_estimators=[100, 200, 300],
)

# KNN
param_dist_knn = dict(
    clf__n_neighbors=np.arange(1, 20),
    clf__p=np.arange(1, 5),
)

# CatBoost
param_dict_catboost = dict(
    clf__n_estimators=[100, 200, 300],
    clf__max_depth=np.arange(1, 10),
    clf__learning_rate=np.linspace(0.01, 0.3, 10),
    clf__l2_leaf_reg=np.linspace(0.01, 0.5, 10),
    clf__min_data_in_leaf=np.arange(1, 10),
)

# XGBoost
param_dict_xgb = dict(
    clf__n_estimators=[100, 200, 300],
    clf__max_depth=np.arange(1, 10),
    clf__learning_rate=np.linspace(0.01, 0.3, 10),
    clf__reg_lambda=np.linspace(0.01, 0.5, 10),
    clf__min_child_weight=np.arange(1, 10),
)


# Neural Network

def get_new_model():
    """
    Build and compile the feed-forward network wrapped by ``KerasClassifier``.

    The network has two hidden ``Dense`` layers of 25 units, each followed by
    a sigmoid activation, and a 2-unit softmax output layer. The input width
    is read from the module-level ``X_train_prep``, which therefore has to
    exist before this function is called.

    Returns
    -------
    A compiled ``Sequential`` model named 'Archibald' (Adam optimizer with a
    learning rate of 1e-3, categorical cross-entropy loss, accuracy metric).
    """
    model = Sequential(name='Archibald')  # Models can be named

    # First hidden layer: 25 units, sized to the preprocessed feature matrix
    model.add(L.Dense(25, input_dim=X_train_prep.shape[1], kernel_initializer='random_normal'))
    model.add(L.Activation('sigmoid'))

    # Second hidden layer: 25 units
    model.add(L.Dense(25, kernel_initializer='random_normal'))
    model.add(L.Activation('sigmoid'))

    # Output layer: softmax over 2 units, i.e. one probability per class
    model.add(L.Dense(2, activation='softmax', kernel_initializer='random_normal'))

    # `learning_rate` is the current argument name; `lr` is a deprecated alias
    optimizer = opt.Adam(learning_rate=1e-3)

    # NOTE(review): categorical cross-entropy pairs with the 2-unit softmax and
    # needs one-hot targets -- presumably the KerasClassifier wrapper encodes
    # y accordingly; confirm for the installed version.
    model.compile(loss='categorical_crossentropy',
                  metrics=["accuracy"],
                  optimizer=optimizer)

    return model


# Wrap the Keras model builder so it can be used as a scikit-learn estimator
clf_neuron = KerasClassifier(build_fn=get_new_model)

# Search grid for the neural network (training batch size and number of epochs)
param_dict_neural = {
    'clf__batch_size': [10, 20, 40, 60, 80, 100],
    'clf__epochs': [10, 50, 100]
}

# One pipeline per model: the shared preprocessor ('scl') followed by the estimator ('clf').
# pipe_neuron used to be defined twice with identical content; it is now defined once.
pipe_logistic = Pipeline([('scl', preprocessor), ('clf', LogisticRegression(penalty="l2", solver='liblinear'))])
pipe_rndforest = Pipeline([('scl', preprocessor), ('clf', RandomForestClassifier(random_state=13))])
pipe_knn = Pipeline([('scl', preprocessor), ('clf', KNeighborsClassifier())])
pipe_catboost = Pipeline([('scl', preprocessor), ('clf', CatBoostClassifier(random_state=13))])
pipe_xgb = Pipeline([('scl', preprocessor), ('clf', XGBClassifier(random_state=13))])
pipe_neuron = Pipeline([('scl', preprocessor), ('clf', clf_neuron)])

# Stratified 5-fold cross-validation shared by all searches
skf = StratifiedKFold(n_splits=5)

# Randomized searches (30 sampled candidates each). random_state is set to the
# same seed as the estimators above so the sampled candidates are reproducible
# between runs.
logistic_randomized_pipe = RandomizedSearchCV(estimator=pipe_logistic, param_distributions=param_dict_logistic,
                                              cv=skf, n_iter=30, n_jobs=-1, random_state=13)

rndforest_randomized_pipe = RandomizedSearchCV(estimator=pipe_rndforest, param_distributions=param_dict_rndforest,
                                               cv=skf, n_iter=30, n_jobs=-1, random_state=13)

knn_randomized_pipe = RandomizedSearchCV(estimator=pipe_knn, param_distributions=param_dist_knn,
                                         cv=skf, n_iter=30, n_jobs=-1, random_state=13)

catboost_randomized_pipe = RandomizedSearchCV(estimator=pipe_catboost, param_distributions=param_dict_catboost,
                                              cv=skf, n_iter=30, n_jobs=-1, random_state=13)

xgb_randomized_pipe = RandomizedSearchCV(estimator=pipe_xgb, param_distributions=param_dict_xgb,
                                         cv=skf, n_iter=30, n_jobs=-1, random_state=13)

# Despite its name this is an exhaustive grid search over param_dict_neural
neuron_randomized_pipe = GridSearchCV(estimator=pipe_neuron, param_grid=param_dict_neural,
                                      cv=skf, n_jobs=-1)


In [32]:
# Fit every hyperparameter search on the training split, in the original order
for search in (neuron_randomized_pipe, logistic_randomized_pipe, rndforest_randomized_pipe,
               knn_randomized_pipe, catboost_randomized_pipe, xgb_randomized_pipe):
    search.fit(X_train, y_train)

# fit() returns the estimator itself, so this keeps the XGBoost search as the cell's displayed value
xgb_randomized_pipe



In [40]:
# Evaluate every fitted search on the holdout split
models_names = [neuron_randomized_pipe, logistic_randomized_pipe, rndforest_randomized_pipe, knn_randomized_pipe,
                catboost_randomized_pipe, xgb_randomized_pipe]

models_string = ['neuron_randomized_pipe',
                 'logistic_randomized_pipe', 'rndforest_randomized_pipe', 'knn_randomized_pipe',
                 'catboost_randomized_pipe', 'xgb_randomized_pipe']


def _to_bool_labels(predictions):
    """
    Return ``predictions`` as a flat boolean numpy array.

    Labels given as the strings 'True'/'False' are parsed by their text. A
    plain ``astype(bool)`` does not parse the text: the non-empty string
    'False' is truthy and would be counted as True. Non-string values are
    converted with ``bool()``.
    """
    def _as_bool(value):
        if isinstance(value, str):
            return value.strip().lower() in ('true', '1')
        return bool(value)

    flat = np.asarray(predictions).ravel()
    return np.array([_as_bool(value) for value in flat], dtype=bool)


# Predict once per model and reuse the result for the metrics, the printout and the vote
holdout_predictions = [_to_bool_labels(model.predict(X_holdout)) for model in models_names]

# One row of quality metrics per model
models_quality_list = []
for model_name, log_pred in zip(models_string, holdout_predictions):
    models_quality_list.append({
        'Name': model_name,
        'accuracy_score': accuracy_score(y_holdout, log_pred),
        'recall_score': recall_score(y_holdout, log_pred),
        'f1_score': f1_score(y_holdout, log_pred),
        'precision_score': precision_score(y_holdout, log_pred)
    })

# Build the frame in one step instead of concatenating onto an empty frame
models_quality = pd.DataFrame(
    models_quality_list,
    columns=['Name', 'accuracy_score', 'recall_score', 'f1_score', 'precision_score'],
)

# Printing accuracy scores for each model
for model_name, log_pred in zip(models_string, holdout_predictions):
    print(f"{model_name}: {accuracy_score(y_holdout, log_pred)}")

# One 0/1 column per model, named '1', '2', ... in the order of models_names
prediction_values = pd.DataFrame(
    {str(position): pred.astype(int) for position, pred in enumerate(holdout_predictions, 1)}
)

# Majority vote: share of models predicting True, rounded to 0 or 1.
# np.round rounds half to even, so an exact 50/50 split yields 0.
prediction_values['sum'] = np.round(prediction_values.sum(axis=1) / len(models_names)).astype(int)

prediction_values


In [41]:
# Holdout accuracy of the combined prediction stored in the 'sum' column
combined_accuracy = accuracy_score(y_holdout, prediction_values['sum'])
print('Accuracy score for combined predictions:', combined_accuracy)

# Holdout accuracy of the tuned Random Forest on its own
rndforest_holdout_pred = rndforest_randomized_pipe.predict(X_holdout)
rndforest_accuracy = accuracy_score(y_holdout, rndforest_holdout_pred)
print('Accuracy score for Random Forest model:', rndforest_accuracy)

# Random Forest predictions for the preprocessed test data
prediction_test_res = pd.DataFrame(
    {'Transported': rndforest_randomized_pipe.predict(data_pre_test)}
)


In [42]:
# Pair every test PassengerId with its predicted 'Transported' value
prediction_test = pd.DataFrame({
    'PassengerId': data_raw_test['PassengerId'],
    'Transported': prediction_test_res['Transported'],
})

# Write the frame to 'result.csv' without the index column
prediction_test.to_csv('result.csv', index=False)

# Show the frame as the cell output
prediction_test
