In [None]:
import numpy as np
import pandas as pd
from fastcore.all import *
from fastai.tabular.all import *

%matplotlib inline
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Load the train and test CSV files from the local data folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/spaceship-titanic-train.csv',
                     './data/spaceship-titanic-test.csv')
)

In [None]:
# Preview the first rows of the raw training frame.
train_df.head()

In [None]:
# Name of the target column (passed as `y_names` when the DataLoaders are built).
targ = 'Transported'

In [None]:
# Summarize column dtypes and non-null counts of the raw training frame.
train_df.info()

In [None]:
# Column preprocessing steps: categorization (label encoding), missing-value filling, normalization
procs=[Categorify, FillMissing, Normalize]

In [None]:
def predproduction(data):
    """Engineer features from a raw passenger DataFrame.

    The input frame is not modified; a copy is edited and returned.

    Steps performed:
      * gaps in ``Name``, ``PassengerId`` and ``Cabin`` are filled with the
        column mode, because the features below are parsed out of them;
      * ``FamilySize`` - number of passengers sharing the same surname
        (second whitespace-separated token of ``Name``);
      * ``GroupSize`` - number of passengers sharing the same group prefix
        of ``PassengerId`` (the part before ``_``);
      * ``Deck`` / ``Side`` - first and third ``/``-separated parts of ``Cabin``;
      * ``Wastes`` - row-wise sum of the five spending columns;
      * ``Name``, ``PassengerId`` and ``Cabin`` are dropped.

    No other imputation or encoding is done here: the remaining columns are
    only classified by dtype so the caller can pass the two lists on to a
    tabular preprocessing pipeline.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Name``, ``PassengerId``, ``Cabin``, ``RoomService``,
        ``FoodCourt``, ``ShoppingMall``, ``Spa`` and ``VRDeck``.

    Returns
    -------
    tuple
        ``(data, object_cols, num_cols)``: the edited DataFrame, the list of
        its ``object``-dtype column names, and the list of its ``float64``
        column names followed by ``'GroupSize'`` and ``'FamilySize'``.

    Raises
    ------
    KeyError
        If a required column is missing, or if no ``Cabin`` value has three
        ``/``-separated parts.
    """
    # Work on a copy so the caller's frame is never mutated.
    data = data.copy()

    # Plain assignment instead of chained `fillna(..., inplace=True)`, which
    # newer pandas versions may apply to a temporary and silently discard.
    for col in ('Name', 'PassengerId', 'Cabin'):
        data[col] = data[col].fillna(data[col].mode()[0])

    # NOTE: rows whose Name was missing all receive the mode name above and
    # are therefore counted as members of that family.
    surnames = data['Name'].str.split().str.get(1)
    # A name without a second token has no surname to count; treat it as a
    # family of one instead of failing on the lookup.
    data['FamilySize'] = surnames.map(surnames.value_counts()).fillna(1).astype('int')

    groups = data['PassengerId'].str.split('_').str.get(0)
    data['GroupSize'] = groups.map(groups.value_counts()).astype('int')

    # Split the cabin string once and reuse the parts.
    cabin_parts = data['Cabin'].str.split('/', expand=True)
    data['Deck'] = cabin_parts[0]
    data['Side'] = cabin_parts[2]

    data = data.drop(['Name', 'PassengerId', 'Cabin'], axis=1)

    data['Wastes'] = data[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)

    object_cols = list(data.columns[data.dtypes == 'object'])
    # The two count features are integers, so they are appended explicitly.
    num_cols = list(data.columns[data.dtypes == 'float64']) + ['GroupSize', 'FamilySize']

    return (data, object_cols, num_cols)

In [None]:
# Build features for both splits; copies are passed so the raw frames stay available.
# Only the column lists from the training split are kept; the test split's lists are discarded.
train_data, cat_cols, con_cols = predproduction(train_df.copy())
test_data, _, _ = predproduction(test_df.copy())

In [None]:
# Split the sample into training and validation subsets (30% of the rows go to validation).
split_num = int(np.floor(0.3 * train_df.shape[0]))
indices = list(range(train_df.shape[0]))
# Fixed seed: calling np.random.seed() without an argument reseeds from OS
# entropy, so the split (and every metric computed on it) changed on each run.
np.random.seed(42)
np.random.shuffle(indices)
valid_idx, train_idx = indices[:split_num], indices[split_num:]
splits=(list(train_idx), list(valid_idx))

In [None]:
# Preview the first rows of the engineered training frame.
train_data.head()

In [None]:
# Summarize column dtypes and non-null counts after feature engineering.
train_data.info()

In [None]:
# Build the DataLoaders from the engineered training frame: the shuffled
# validation indices define the hold-out rows, batches hold 128 rows.
dls = TabularDataLoaders.from_df(
    train_data,
    procs=procs,
    cat_names=cat_cols,
    cont_names=con_cols,
    y_names=targ,
    valid_idx=list(valid_idx),
    bs=128,
)

In [None]:
# Tabular learner with layer sizes 500 and 250, reporting accuracy as the metric.
learn = tabular_learner(dls, layers=[500,250], metrics=accuracy)

In [None]:
# Search for a suitable learning rate
learn.lr_find()

In [None]:
# Train for 10 epochs with the one-cycle schedule, passing 1e-3 as the learning rate.
learn.fit_one_cycle(10, 1e-3)

In [None]:
# Build a test DataLoader from the learner's own DataLoaders
# (presumably reusing the training preprocessing - confirm against fastai docs).
test_dl = learn.dls.test_dl(test_data)
# Only the third returned item is kept; with with_decoded=True this should be
# the decoded predictions - TODO confirm.
_, _, label = learn.get_preds(dl=test_dl, with_decoded=True)

In [None]:
def write_to_submission_file(predicted_labels, passid, out_file,
                             target='Transported', index_label="PassengerId"):
    """Convert model predictions to a DataFrame and save it as a CSV file.

    The predictions become the single ``target`` column of a frame indexed by
    ``passid``; the frame is written to ``out_file`` with the index column
    titled ``index_label``.
    """
    pd.DataFrame(
        data=predicted_labels,
        index=passid,
        columns=[target],
    ).to_csv(out_file, index_label=index_label)

In [None]:
# Cast the predictions to bool and write the submission CSV, indexed by the raw test PassengerId values.
write_to_submission_file(np.array(label).astype('bool'), test_df['PassengerId'], 'spacetitanic_pred_fai.csv')