In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

%matplotlib inline
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Read the train, test and sample CSV files from ./data into three frames
train_df, test_df, sample_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/spaceship-titanic-train.csv',
        './data/spaceship-titanic-test.csv',
        './data/spaceship-titanic-sample.csv',
    )
)

In [None]:
sample_df.info()

In [None]:
train_df.info()

In [None]:
train_df.describe(include='all')

In [None]:
test_df.describe(include='all')

In [None]:
train_df.head(5)

In [None]:
plt.scatter(
    x=train_df['RoomService'],
    y=train_df['FoodCourt'],
    color=train_df['Transported'].map({False: 'orange', True: 'blue'})
                                                                                         )

In [None]:
plt.scatter(
    x=train_df['Age'],
    y=train_df['VRDeck'],
    color=train_df['Transported'].map({False: 'orange', True: 'blue'})
                                                                                         )

In [None]:
plt.scatter(
    x=train_df['Age'],
    y=train_df['FoodCourt'],
    color=train_df['Transported'].map({False: 'orange', True: 'blue'})
                                                                                         )

In [None]:
# Correlate only numeric/boolean columns: calling corr() on a frame that still
# holds text columns raises ValueError on pandas >= 2.0.
sns.heatmap(train_df.select_dtypes(include=['number', 'bool']).corr())

In [None]:
# Mean of each spending column per Transported value, computed with a single
# groupby and displayed as one table (rows: Transported, columns: spend type)
train_df.groupby('Transported')[
    ['FoodCourt', 'Spa', 'VRDeck', 'ShoppingMall', 'RoomService']
].mean()

In [None]:
sns.countplot(x='CryoSleep', hue='Transported', data=train_df)

In [None]:
sns.countplot(x='HomePlanet', hue='Transported', data=train_df)

In [None]:
sns.countplot(x='VIP', hue='Transported', data=train_df)

In [None]:
# Store the target as integers in the training frame and keep a handle to it as y
transported_as_int = train_df['Transported'].astype('int')
train_df['Transported'] = transported_as_int
y = train_df['Transported']

In [None]:
def Predproduction(data):
    """Build an all-numeric feature frame from a raw passenger frame.

    Steps, in order:

    1. Fill missing values: mode for object/string columns, median for
       float columns.
    2. ``FamilySize``: number of rows sharing the second whitespace-separated
       token of ``Name``.
    3. ``GroupSize``: number of rows sharing the part of ``PassengerId``
       before the first ``_``.
    4. Cast ``CryoSleep`` to int and take ``Deck`` / ``Side`` from the first
       and third ``/``-separated parts of ``Cabin``.
    5. One-hot encode ``HomePlanet``, ``Destination``, ``Deck`` and ``Side``.
    6. ``Wastes``: row-wise sum of the five spending columns.

    ``Name``, ``PassengerId``, ``VIP`` and ``Cabin`` are removed from the
    result; any other column passes through unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame. It is NOT modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        New frame with the engineered features. The one-hot columns depend on
        the categories present in ``data``, so two different inputs may yield
        different column sets.
    """
    # Work on a copy so that the caller's frame keeps its original contents
    data = data.copy()

    # Impute by assignment rather than chained `inplace=True`, which does not
    # reliably write back to the frame under pandas Copy-on-Write
    for col in data.columns:
        column = data[col]
        if pd.api.types.is_float_dtype(column):
            data[col] = column.fillna(column.median())
        elif (pd.api.types.is_object_dtype(column)
              or pd.api.types.is_string_dtype(column)):
            data[col] = column.fillna(column.mode()[0])

    # Size of the "family": rows with the same second token of Name
    surnames = data['Name'].str.split(expand=True)[1]
    data['FamilySize'] = surnames.map(surnames.value_counts()).astype('int')

    # Size of the travel group: rows with the same PassengerId prefix
    groups = data['PassengerId'].str.split('_', expand=True)[0]
    data['GroupSize'] = groups.map(groups.value_counts()).astype('int')

    data['CryoSleep'] = data['CryoSleep'].astype('int')

    # Split Cabin once and keep its first and third parts
    cabin_parts = data['Cabin'].str.split('/', expand=True)
    data['Deck'] = cabin_parts[0]
    data['Side'] = cabin_parts[2]

    # VIP is not used as a feature, so it is dropped without being cast first
    data = data.drop(columns=['Name', 'PassengerId', 'VIP', 'Cabin'])

    # Replaces each listed column by "<column>_<value>" indicator columns
    # appended after the remaining columns
    data = pd.get_dummies(
        data, columns=['HomePlanet', 'Destination', 'Deck', 'Side']
    )

    data['Wastes'] = data[
        ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    ].sum(axis=1)

    return data

In [None]:
def write_to_submission_file(predicted_labels, passid, out_file,
                             target='Transported', index_label="PassengerId"):
    """Save model predictions to a CSV file.

    The predictions are wrapped in a one-column DataFrame whose column is
    named ``target`` and whose index is ``passid``; the frame is then written
    to ``out_file`` with the index column labelled ``index_label``.
    """
    submission = pd.DataFrame(
        data=predicted_labels,
        index=passid,
        columns=[target],
    )
    submission.to_csv(out_file, index_label=index_label)

In [None]:
# Training features: engineered frame without the target column
train_df_new = Predproduction(train_df).drop('Transported', axis=1)

# One-hot columns are derived separately for each frame, so force the test
# matrix onto the training column set and order. A category missing from test
# becomes an all-zero column; a category seen only in test is discarded.
test_df_new = Predproduction(test_df).reindex(
    columns=train_df_new.columns, fill_value=0
)

In [None]:
%%time
# Поиск наилучших параметров модели
gb_params = {
    'n_estimators': list(range(100,300,20)),
    'min_samples_leaf': list(range(3,11)),
    'max_depth': [3,5,7,9],
    'random_state': [17],
    'max_features': [0.3, 0.5, 0.7]
}
gb=GradientBoostingClassifier()
gsCV=GridSearchCV(gb, gb_params, cv=5)
gsCV.fit(train_df_new, y)
gsCV.best_params_, gsCV.best_score_

In [None]:
# Final model with hard-coded hyper-parameters (so this cell can run without
# repeating the grid search). random_state matches the seed used in the search
# grid: with max_features < 1 the fit is stochastic, and without a seed the
# predictions change from run to run.
gb = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=5,
    min_samples_leaf=9,
    max_features=0.3,
    random_state=17,
)
gb.fit(train_df_new, y)

In [None]:
res=gb.predict(test_df_new)

In [None]:
write_to_submission_file(res.astype('bool'), test_df['PassengerId'], 'spacetitanic_pred_.csv')