## Pipline для построение модели табличного набора данных


1. Загрузка и изуччение данных
2. Постановка задачи, выбор метрики качества
3. Обработка пропущенных значений (ответвление: выбор стратегии заполнения пропущенных значений в зависимости от минимизации ошибки)
4. Генерация признаков
5. Моделирование с отбором признаков минимизирущих ошибку
6. Настройка гиперпарамтеров
7. Стекинг




In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import numpy as np


from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, PolynomialFeatures
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer


from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score


from xgboost.sklearn import XGBClassifier
from sklearn.pipeline import Pipeline

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC

from lightgbm import LGBMClassifier

from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer



from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, plot_confusion_matrix


In [3]:
glob('./1_titanic/*')

['./1_titanic/train.csv',
 './1_titanic/gender_submission.csv',
 './1_titanic/test.csv']

In [4]:
train_data = pd.read_csv('./1_titanic/train.csv', index_col='PassengerId')
test = pd.read_csv('./1_titanic/test.csv', index_col='PassengerId')

In [5]:
test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [7]:
train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [81]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
Pclass      418 non-null int64
Name        418 non-null object
Sex         418 non-null object
Age         332 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Ticket      418 non-null object
Fare        417 non-null float64
Cabin       91 non-null object
Embarked    418 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


## 0. Выделение предикторов и целевой переменной

In [9]:
target = train_data['Survived']
train = train_data.drop(columns='Survived')

## 1. Предобработка численных и категориальных переменных
### 1.1 Разбивка колонок по типам переменных

In [10]:
numeric_columns = [col_name  for col_name in train.columns \
                                           if (train[col_name].dtype != 'object')]


numeric_columns

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [11]:
category_columns = [col_name  for col_name in train.columns \
                                           if (train[col_name].dtype == 'object')]

category_columns

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [12]:
category_columns_nunique = {}
for col in category_columns:
    category_columns_nunique[col] = train[col].nunique()
print(category_columns_nunique)

{'Name': 891, 'Sex': 2, 'Ticket': 681, 'Cabin': 147, 'Embarked': 3}


### 1.1.1 Исключение неинформативных категориальных признаков (спорный момент возможно преобразование признаков)

In [13]:
#filter non informative
n = 0.5
for key in category_columns_nunique.keys():
    if category_columns_nunique[key] > len(train) * 0.5:
        if key in category_columns:
            category_columns.remove(key)
        
category_columns        

['Sex', 'Cabin', 'Embarked']

In [14]:
# здесь меняется порядок столбцов
filter_train = train[numeric_columns + category_columns]
filter_train.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Sex,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,22.0,1,0,7.25,male,,S
2,1,38.0,1,0,71.2833,female,C85,C
3,3,26.0,0,0,7.925,female,,S
4,1,35.0,1,0,53.1,female,C123,S
5,3,35.0,0,0,8.05,male,,S


### 1.2 Заполнение пропусков

In [15]:
# #fill NaN in numerics columns
# for col in numeric_columns:
#     if (~train[col].isna()).sum() != len(train[col]):
#         train[col] = train[col].fillna(train[col].median())
        
# another strategy fill mean/mediam for group
def imputer_for_nans(data, group_col_name, imputer_col_name, statistics='median'):
    return data[imputer_col_name].fillna(data.groupby(group_col_name)[imputer_col_name]\
                                         .transform(statistics))


for col in numeric_columns:
    if (~filter_train[col].isna()).sum() != len(train[col]):
        filter_train[col] = imputer_for_nans(filter_train, ['Pclass', 'Sex'], col) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
#fill NaN in category columns
for col in category_columns:
    if (~filter_train[col].isna()).sum() != len(filter_train[col]):
        filter_train[col] = filter_train[col].fillna('NA')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [None]:
#TODO обработка уникальных значений


### 1.3 Кодирование категориальных переменных (OneHotEncoding, OrdinalEncoding)

In [17]:
ohc_train = pd.get_dummies(filter_train, drop_first=True)

In [18]:
ohc_train.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Cabin_A14,Cabin_A16,Cabin_A19,Cabin_A20,...,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_NA,Cabin_T,Embarked_NA,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,22.0,1,0,7.25,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,1,38.0,1,0,71.2833,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,26.0,0,0,7.925,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,1,35.0,1,0,53.1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,3,35.0,0,0,8.05,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


### 1.4 Шкалирование данны

In [19]:
stsc = StandardScaler()
ohc_train[numeric_columns] = stsc.fit_transform(ohc_train[numeric_columns])

In [20]:
ohc_train.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Cabin_A14,Cabin_A16,Cabin_A19,Cabin_A20,...,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_NA,Cabin_T,Embarked_NA,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.827377,-0.534891,0.432793,-0.473674,-0.502445,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,-1.566107,0.668392,0.432793,-0.473674,0.786845,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.827377,-0.23407,-0.474545,-0.473674,-0.488854,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,-1.566107,0.442776,0.432793,-0.473674,0.42073,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,0.827377,0.442776,-0.474545,-0.473674,-0.486337,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


## 2. Split data

In [21]:
features_train, features_valid, target_train, target_valid = train_test_split(
    ohc_train, target, test_size=0.25, random_state=12345)

## 3. Modeling

In [23]:
def fit_clf_pipeline(clf, X_train, y_train):

    pipeline = Pipeline([('scaler', StandardScaler()),
                         ('feature_transform', PolynomialFeatures(degree=2)), 
                         ('classifier', clf)])

    return pipeline.fit(X_train, y_train)

In [25]:
%%time
random_state=1

rfc = fit_clf_pipeline(RandomForestClassifier(max_depth=10, n_estimators=1000, random_state=random_state),
                       features_train, 
                       target_train)

print(rfc.score(features_train, target_train))
print(rfc.score(features_valid, target_valid))

#models_collection['RandomForestClassifier'] = rfc

0.9640718562874252
0.7892376681614349
CPU times: user 5.07 s, sys: 0 ns, total: 5.07 s
Wall time: 5.07 s


## 4. Evalute model

In [131]:
train_xs, valid_xs, train_ys, valid_ys = train_test_split(train.to_dict(orient='records'), 
                                                          target, 
                                                          test_size=0.25, 
                                                          random_state=0)

In [132]:
clf = RandomForestClassifier()
vec = DictVectorizer()
pipeline = make_pipeline(vec, clf)

def evaluate(_clf):
    scores = cross_val_score(_clf, train.to_dict(orient='records'), target.values, scoring='accuracy', cv=10)
    print('Accuracy: {:.3f} ± {:.3f}'.format(np.mean(scores), 2 * np.std(scores)))
    _clf.fit(train_xs, train_ys)  # so that parts of the original pipeline are fitted

evaluate(pipeline)

Accuracy: 0.831 ± 0.081


In [137]:
vec2 = FeatureUnion([
    ('Name', CountVectorizer(
        analyzer='char_wb',
        ngram_range=(3, 4),
        preprocessor=lambda x: x['Name'],
        max_features=100,
    )),
    ('All', DictVectorizer()),
])
clf2 = RandomForestClassifier()
pipeline2 = make_pipeline(vec2, clf2)
evaluate(pipeline2)

Accuracy: 0.829 ± 0.084
