# CatBoost model for the Spaceship Titanic Kaggle competition

In [1]:
# импорт библиотек
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, f1_score
from IPython.display import Audio



In [2]:
# Load the competition CSV files from the current working directory.
df = pd.read_csv('train.csv')
df_pred = pd.read_csv('test.csv')
df_sam = pd.read_csv('sample_submission.csv')

# Keep the prediction-set ids aside: the PassengerId column is removed from
# df_pred during feature preparation but is needed again for the submission.
df_pred_PassengerId = df_pred['PassengerId']

## Знакомство с данными

In [3]:
# Show the number of missing values per column for both tables.
# A single print call with a newline separator emits the same five lines
# as five consecutive print calls.
print(
    "\033[1m" + "\033[30m" + 'nan значения в основной таблице:' + "\033[0m",
    df.isna().sum(),
    '-------------------',
    "\033[1m" + "\033[30m" + 'nan значения в таблице для предсказания:' + "\033[0m",
    df_pred.isna().sum(),
    sep='\n',
)

[1m[30mnan значения в основной таблице:[0m
PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64
-------------------
[1m[30mnan значения в таблице для предсказания:[0m
PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64


In [4]:
# Show the column dtypes of the training table and of the prediction table.
print(
    "\033[1m" + "\033[30m" + 'Типы данных  в основной таблице:' + "\033[0m",
    df.dtypes,
    '-----------------------',
    "\033[1m" + "\033[30m" + 'Типы данных в таблице для предсказания:' + "\033[0m",
    df_pred.dtypes,
    sep='\n',
)

[1m[30mТипы данных  в основной таблице:[0m
PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object
-----------------------
[1m[30mТипы данных в таблице для предсказания:[0m
PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
dtype: object


In [5]:
# Show summary statistics of the numeric columns for both tables.
print(
    "\033[1m" + "\033[30m" + 'Статистика для основных данных:' + "\033[0m",
    df.describe(),
    '-' * 74,
    "\033[1m" + "\033[30m" + 'Статистика для предсказательных данных:' + "\033[0m",
    df_pred.describe(),
    sep='\n',
)

[1m[30mСтатистика для основных данных:[0m
               Age   RoomService     FoodCourt  ShoppingMall           Spa  \
count  8514.000000   8512.000000   8510.000000   8485.000000   8510.000000   
mean     28.827930    224.687617    458.077203    173.729169    311.138778   
std      14.489021    666.717663   1611.489240    604.696458   1136.705535   
min       0.000000      0.000000      0.000000      0.000000      0.000000   
25%      19.000000      0.000000      0.000000      0.000000      0.000000   
50%      27.000000      0.000000      0.000000      0.000000      0.000000   
75%      38.000000     47.000000     76.000000     27.000000     59.000000   
max      79.000000  14327.000000  29813.000000  23492.000000  22408.000000   

             VRDeck  
count   8505.000000  
mean     304.854791  
std     1145.717189  
min        0.000000  
25%        0.000000  
50%        0.000000  
75%       46.000000  
max    24133.000000  
------------------------------------------------------

## Подготовка данных

### PassengerId
- Создаю два столбца 'Group' и 'Number' из столбца 'PassengerId' по разделителю '_': 'Group' привожу к типу int, 'Number' — к категориальному типу.
- Удаляю PassengerId.
- Все действия буду проводить с обоими датасетами сразу.

In [6]:
# Split PassengerId on '_' into its first part (Group, stored as int) and its
# second part (Number, stored as a categorical), then drop the original column.
train_id_parts = df['PassengerId'].str.split('_')
df['Group'] = train_id_parts.str[0].astype('int')
df['Number'] = train_id_parts.str[1].astype('category')
df = df.drop(columns='PassengerId')

# Same transformation for the prediction table.
pred_id_parts = df_pred['PassengerId'].str.split('_')
df_pred['Group'] = pred_id_parts.str[0].astype('int')
df_pred['Number'] = pred_id_parts.str[1].astype('category')
df_pred = df_pred.drop(columns='PassengerId')

### Transported
- Привожу к числовому типу целевую переменную.

In [7]:
# Convert the boolean target to integers (False -> 0, True -> 1).
# An explicit cast is used instead of DataFrame.replace(..., inplace=True):
# replacing values in a bool column relies on implicit dtype downcasting, which
# recent pandas versions deprecate, and the cast is safe to re-run.
df['Transported'] = df['Transported'].astype('int64')

### 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'
- NaN значения заменяю на 0.0
- Создаю новый признак с суммой всех этих столбцов 

In [8]:
# Spending columns: a missing amount is treated as 0.0, and the row-wise sum
# of the five columns is stored as the new TotalServices feature.
service_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for frame in (df, df_pred):
    frame[service_cols] = frame[service_cols].fillna(0.0)
    frame['TotalServices'] = frame[service_cols].sum(axis=1)

### CryoSleep, HomePlanet, VIP
- Заменяю NaN на mode
- Преобразую в категориальный тип (category).

In [9]:
# Fill the gaps in these columns with the most frequent value of the TRAINING
# table (also for the prediction table) and store them as categoricals.
columns_to = ['CryoSleep', 'HomePlanet', 'VIP']
for col in columns_to:
    # Filling a column with its own mode does not change that mode, so it is
    # computed once and reused for both tables.
    train_mode = df[col].mode()[0]
    df[col] = df[col].fillna(train_mode).astype('category')
    df_pred[col] = df_pred[col].fillna(train_mode).astype('category')

## Cabin
- Создаю три столбца из Cabin: Deck, Num, Side.
- Удаляю Cabin.

In [10]:
def _cabin_part(cabin, idx):
    """Return the idx-th '/'-separated part of a cabin string.

    Non-string values (missing cabins) are returned unchanged so that they can
    be imputed later.
    """
    if isinstance(cabin, str):
        return cabin.split('/')[idx]
    return cabin


# Cabin is split on '/' into Deck, Num and Side; the source column is then removed.
for frame in (df, df_pred):
    for idx, part_name in enumerate(('Deck', 'Num', 'Side')):
        frame[part_name] = frame['Cabin'].apply(_cabin_part, args=(idx,))
    frame.drop(columns='Cabin', inplace=True)

In [11]:
# Impute the cabin-derived features.
# Deck / Side: the most frequent value of the TRAINING table is used for both
# tables, matching how CryoSleep / HomePlanet / VIP and Age are imputed, so no
# statistic is estimated from the prediction set.
deck_mode = df['Deck'].mode()[0]
side_mode = df['Side'].mode()[0]

for frame in (df, df_pred):
    frame['Deck'] = frame['Deck'].fillna(deck_mode).astype('category')
    # Num: forward-fill from the previous row. A missing value in the very
    # first row(s) has nothing to copy from and would make astype('int') raise,
    # so any such leading gap is back-filled from the next known value.
    frame['Num'] = frame['Num'].ffill().bfill().astype('int')
    frame['Side'] = frame['Side'].fillna(side_mode).astype('category')

## Destination
- Nan заменяю на моду

In [12]:
# Missing destinations are replaced with the most frequent destination of the
# TRAINING table in both tables (the prediction set previously used its own
# mode, unlike the other imputations that rely on training statistics).
destination_mode = df['Destination'].mode()[0]
df['Destination'] = df['Destination'].fillna(destination_mode).astype('category')
df_pred['Destination'] = df_pred['Destination'].fillna(destination_mode).astype('category')

## Age
- NaN значения заменяю на среднее значение.
- Есть значения, равные 0.
- Посмотреть: возможно, медиана будет лучше.

In [13]:
# Replace missing ages with the mean age of the training table in both tables.
# The result is assigned back explicitly: calling fillna(..., inplace=True) on
# `df.Age` is chained assignment, which warns in pandas 2.x and leaves the
# DataFrame unchanged when Copy-on-Write is enabled.
age_mean = df['Age'].mean()
df['Age'] = df['Age'].fillna(age_mean)
df_pred['Age'] = df_pred['Age'].fillna(age_mean)

## Name
- Удаляю
- Нужно придумать как из имен сделать признак с полом

In [14]:
# The Name column is not used as a feature; remove it from both tables.
df = df.drop(columns=['Name'])
df_pred = df_pred.drop(columns=['Name'])

## HasServices
- Создаем признак (были ли вообще траты)

In [15]:
# HasServices flags passengers whose TotalServices is greater than zero;
# the flag is stored as a categorical column.
for frame in (df, df_pred):
    frame['HasServices'] = frame['TotalServices'].gt(0).astype('category')

## Модель

In [16]:
# Separate the target column from the feature matrix.
y = df['Transported']
X = df.drop(columns=['Transported'])

In [17]:
# Names of the feature columns stored with the 'category' dtype; this list is
# passed to CatBoost as `cat_features`.
lst = [col for col in X.columns if df[col].dtypes == 'category']

In [18]:
# Hold out 20% of the labelled rows; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Disabled Optuna hyper-parameter search, kept for reference (nothing in this cell runs).
# As written it samples `iterations` and `learning_rate`, fits CatBoost on
# X_train / y_train and maximises the F1 score of the predictions on X_test.
# def objective(trial: optuna.Trial):
#     '''
#     Standard objective-function signature for optuna.
#     '''
#     params = {
#         'random_seed': 42,
#         'iterations': trial.suggest_int('iterations', 300, 2000),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
#     }
    
#     model = CatBoostClassifier(**params, verbose=False, cat_features=lst)
#     model.fit(X_train, y_train)
#     predictions = model.predict(X_test)
#     return f1_score(predictions, y_test)

# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=100)
# study.best_params

In [20]:
# Keyword arguments for the final CatBoostClassifier.
best_params = dict(
    cat_features=lst,   # categorical feature names collected from X
    verbose=0,          # silence the training log
    random_seed=42,
    learning_rate=0.020169513723436013,
    iterations=2000,
    depth=6,
)
# 0.8004410143329658  (presumably the hold-out F1 obtained with these settings - TODO confirm)

In [21]:
# Fit the final classifier on all labelled rows (X, y) rather than on the
# X_train split, then predict labels for the prediction table.
model = CatBoostClassifier(**best_params)
model.fit(X=X, y=y)
# y_pred = model.predict(X_test)  # hold-out prediction, currently disabled
pred = model.predict(data=df_pred)

In [22]:
# Build the submission table from the saved passenger ids and the predicted
# labels, cast the labels to bool and write the CSV without the index column.
output = pd.DataFrame(
    {
        'PassengerId': df_pred_PassengerId,
        'Transported': pred,
    }
)
output = output.astype({'Transported': bool})
output.to_csv('submission_cat.csv', index=False)

# Kaggle score = 0.80243