## Pipeline для построения модели на табличном наборе данных


1. Загрузка и изучение данных
2. Постановка задачи, выбор метрики качества
3. Обработка пропущенных значений (ответвление: выбор стратегии заполнения пропусков, минимизирующей ошибку)
4. Генерация признаков
5. Моделирование с отбором признаков, минимизирующих ошибку
6. Настройка гиперпараметров
7. Стекинг




In [73]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob


from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer


from sklearn.model_selection import train_test_split


from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

In [3]:
# List the files available in the Titanic data directory.
glob('./1_titanic/*')

['./1_titanic/train.csv',
 './1_titanic/gender_submission.csv',
 './1_titanic/test.csv']

In [79]:
# Load the labelled training set and the unlabelled test set; both are
# indexed by the PassengerId column.
train_data, test = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in ('./1_titanic/train.csv', './1_titanic/test.csv')
)

In [9]:
# Preview the first rows of the test set (it has no Survived column).
test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [80]:
# Preview the first rows of the training set, including the Survived label.
train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [81]:
# Summarise dtypes and non-null counts to spot columns with missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [82]:
# Same dtype / non-null summary for the test set.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
Pclass      418 non-null int64
Name        418 non-null object
Sex         418 non-null object
Age         332 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Ticket      418 non-null object
Fare        417 non-null float64
Cabin       91 non-null object
Embarked    418 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


## 0. Выделение предикторов и целевой переменной

In [84]:
# Separate the label from the predictors. `target` stays a one-column
# DataFrame; `train` is a new frame without the Survived column.
target = train_data.loc[:, ['Survived']]
train = train_data.drop('Survived', axis=1)

## 1. Предобработка численных и категориальных переменных
### 1.1 Разбивка колонок по типам переменных

In [85]:
# Columns stored with the generic ``object`` dtype are treated as categorical.
category_columns = [
    name for name, dtype in train.dtypes.items() if dtype == 'object'
]

category_columns

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [86]:
# Number of distinct values per categorical column (a cardinality check).
category_columns_nunique = {
    col: train[col].nunique() for col in category_columns
}
print(category_columns_nunique)

{'Name': 891, 'Sex': 2, 'Ticket': 681, 'Cabin': 147, 'Embarked': 3}


In [87]:
# Every column that is not ``object``-typed is treated as numeric.
numeric_columns = [
    name for name, dtype in train.dtypes.items() if dtype != 'object'
]


numeric_columns

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

### 1.2 Заполнение пропусков

In [88]:
# Non-null counts of the numeric columns before imputation.
train[numeric_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 5 columns):
Pclass    891 non-null int64
Age       714 non-null float64
SibSp     891 non-null int64
Parch     891 non-null int64
Fare      891 non-null float64
dtypes: float64(2), int64(3)
memory usage: 41.8 KB


In [89]:
# #fill NaN in numerics columns
# for col in numeric_columns:
#     if (~train[col].isna()).sum() != len(train[col]):
#         train[col] = train[col].fillna(train[col].median())
        
# Alternative strategy: fill gaps with a per-group statistic (mean/median).
def imputer_for_nans(data, group_col_name, imputer_col_name, statistics='median'):
    """Return ``data[imputer_col_name]`` with NaNs filled by a group statistic.

    Each missing value is replaced by ``statistics`` computed over the rows
    that share the same ``group_col_name`` value(s). If a group has no
    observed value at all (or the group key itself is missing), the value is
    filled with the same statistic computed over the whole column instead of
    being left as NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the grouping column(s) and the column to impute.
        It is not modified.
    group_col_name : str or list of str
        Column name(s) passed to ``DataFrame.groupby``.
    imputer_col_name : str
        Name of the column whose missing values are filled.
    statistics : str or callable, default 'median'
        Aggregation accepted by ``GroupBy.transform`` / ``Series.agg``.

    Returns
    -------
    pandas.Series
        The imputed column, aligned with ``data``'s index.
    """
    column = data[imputer_col_name]
    group_stat = data.groupby(group_col_name)[imputer_col_name].transform(statistics)
    filled = column.fillna(group_stat)
    # Groups made up only of NaNs give a NaN statistic; fall back to the
    # column-wide statistic so those rows do not silently stay missing.
    if filled.isna().any():
        filled = filled.fillna(column.agg(statistics))
    return filled


# Impute only the numeric columns that actually contain gaps, using the
# default (median) statistic within each (Pclass, Sex) group.
for col in numeric_columns:
    if not train[col].isna().any():
        continue
    train[col] = imputer_for_nans(train, ['Pclass', 'Sex'], col)

In [90]:
# Non-null counts of the numeric columns after imputation.
train[numeric_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 5 columns):
Pclass    891 non-null int64
Age       891 non-null float64
SibSp     891 non-null int64
Parch     891 non-null int64
Fare      891 non-null float64
dtypes: float64(2), int64(3)
memory usage: 41.8 KB


In [91]:
# Non-null counts of the categorical columns before filling.
train[category_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 5 columns):
Name        891 non-null object
Sex         891 non-null object
Ticket      891 non-null object
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: object(5)
memory usage: 41.8+ KB


In [92]:
# Replace missing categorical values with the explicit placeholder 'NA';
# columns without gaps are left untouched.
for col in category_columns:
    if train[col].notna().all():
        continue
    train[col] = train[col].fillna('NA')

In [93]:
# Non-null counts of the categorical columns after filling.
train[category_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 5 columns):
Name        891 non-null object
Sex         891 non-null object
Ticket      891 non-null object
Cabin       891 non-null object
Embarked    891 non-null object
dtypes: object(5)
memory usage: 41.8+ KB


### 1.3 Кодирование категориальных переменных

In [75]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of every encoded column.
# NOTE(review): Name, Ticket and Cabin have hundreds of distinct values (see
# the nunique printout), so this presumably yields a very wide frame — confirm
# that is intended. `ohc_train` is not referenced by the other visible cells.
ohc_train = pd.get_dummies(train[category_columns], drop_first=True)

In [96]:
import sklearn

In [94]:
# Hold out 25% of the labelled rows for validation; the fixed seed keeps the
# split reproducible.
train_xs, valid_xs, train_ys, valid_ys = train_test_split(
    train,
    target,
    test_size=0.25,
    random_state=0,
)

In [95]:


import numpy as np  # needed for the mean/std of the cross-validation scores

clf = XGBClassifier()
vec = DictVectorizer()
pipeline = make_pipeline(vec, clf)


def _as_records(xs):
    """Return *xs* as a list of per-row dicts when it is a DataFrame."""
    # DictVectorizer consumes one mapping per sample; iterating a DataFrame
    # yields column labels instead, so frames are converted explicitly.
    return xs.to_dict('records') if isinstance(xs, pd.DataFrame) else xs


def _as_1d(ys):
    """Flatten a single-column label frame/array into a 1-D array."""
    return np.ravel(ys)


# Full labelled data set used for cross-validation. These names were read by
# `evaluate` but never defined anywhere in the notebook.
all_xs = _as_records(train)
all_ys = _as_1d(target)


def evaluate(_clf):
    """Print the 10-fold cross-validated accuracy of *_clf*, then fit it.

    The score is computed on the full labelled data (``all_xs`` / ``all_ys``)
    and reported as ``mean ± 2 * std``. Afterwards *_clf* is fitted in place
    on the training split (``train_xs`` / ``train_ys``).

    Parameters
    ----------
    _clf : estimator
        Object accepted by ``cross_val_score`` that also exposes ``fit``;
        here a DictVectorizer + XGBClassifier pipeline.

    Returns
    -------
    None
    """
    scores = cross_val_score(_clf, all_xs, all_ys, scoring='accuracy', cv=10)
    print('Accuracy: {:.3f} ± {:.3f}'.format(np.mean(scores), 2 * np.std(scores)))
    # cross_val_score works on clones, so fit the passed estimator explicitly
    # to leave the original pipeline steps fitted.
    _clf.fit(_as_records(train_xs), _as_1d(train_ys))


evaluate(pipeline)

XGBoostError: sklearn needs to be installed in order to use this module