# Домашняя работа по лекции №6

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


%matplotlib inline

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 8)

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df_gender = pd.read_csv('data/customers_gender_train.csv')
df_gender.head()

Unnamed: 0,customer_id,gender
0,75562265,0
1,10928546,1
2,69348468,1
3,84816985,1
4,61009479,0


In [4]:
df_transactions_ = pd.read_csv('data/transactions.csv.gz')
df_transactions_.head()

Unnamed: 0,customer_id,tr_datetime,mcc_code,tr_type,amount,term_id
0,39026145,0 10:23:26,4814,1030,-2245.92,
1,39026145,1 10:19:29,6011,7010,56147.89,
2,39026145,1 10:20:56,4829,2330,-56147.89,
3,39026145,1 10:39:54,5499,1010,-1392.47,
4,39026145,2 15:33:42,5499,1010,-920.83,


In [5]:
df_tr = pd.read_csv('data/tr_types.csv', sep=';')

In [178]:
df_tr.head()

Unnamed: 0,tr_type,tr_description
0,3200,Плата за предоставление услуг посредством моби...
1,3210,Плата за предоставление отчета по счету карты ...
2,3800,Плата за обслуживание банковской карты (за пер...
3,4000,Плата за получение наличных в Сбербанке
4,4001,Плата за получение наличных в Сбербанке (в дру...


In [7]:
df_mcc = pd.read_csv('data/tr_mcc_codes.csv', sep=';')

In [179]:
df_mcc.head()

Unnamed: 0,mcc_code,mcc_description
0,742,Ветеринарные услуги
1,1711,"Генеральные подрядчики по вентиляции, теплосна..."
2,1731,Подрядчики по электричеству
3,1799,"Подрядчики, специализированная торговля — нигд..."
4,2741,Разнообразные издательства/печатное дело


In [9]:
from pandas import Timestamp, DateOffset

In [10]:
def preproc_transactions(df_transactions):
    sec_per_day = 86400
    sec_per_hour = 3600
    
    start_date = 1420070400 - 154 * sec_per_day - 3 * sec_per_hour
    
    df_transactions.loc[:, 'day'] = df_transactions["tr_datetime"]\
                                               .str.split(' ')\
                                               .str.get(0)\
                                               .astype(int)
    df_transactions.loc[:, 'time_raw'] = df_transactions["tr_datetime"]\
                                                    .str.split(' ')\
                                                    .str.get(1)

    # set temp dt
    df_transactions.loc[:, 'dt_temp'] = pd.to_datetime(df_transactions.loc[:, 'time_raw'], 
                                                    format='%H:%M:%S')\
                                        + DateOffset(years=115)
    
    df_transactions = df_transactions.assign(dt = lambda x: x.dt_temp.astype(np.int64) // 10**9
                                             + (x.day - 153) * sec_per_day)\
                                     .assign(weekday = lambda x: ((x.day + 4) % 7 + 1))
        
    df_transactions.loc[:, 'datetime'] = pd.to_datetime(df_transactions.dt, unit='s')
    df_transactions.loc[:, 'date'] = df_transactions.loc[:, 'datetime'].dt.strftime('%Y-%m-%d')
    df_transactions.loc[:, 'hour'] = df_transactions.loc[:, 'datetime'].dt.strftime('%H')
    
    df_transactions = df_transactions.drop(['dt_temp', 'time_raw', 'tr_datetime'], axis=1)
    
    df_transactions.loc[:, 'amount'] = np.round(df_transactions.loc[:, 'amount']/(np.pi**np.exp(1)))
            
    return df_transactions

In [11]:
df_transactions = df_transactions_.pipe(preproc_transactions)

In [13]:
df_transactions.head()

Unnamed: 0,customer_id,mcc_code,tr_type,amount,term_id,day,dt,weekday,datetime,date,hour
0,39026145,4814,1030,-100.0,,0,1406888606,5,2014-08-01 10:23:26,2014-08-01,10
1,39026145,6011,7010,2500.0,,1,1406974769,6,2014-08-02 10:19:29,2014-08-02,10
2,39026145,4829,2330,-2500.0,,1,1406974856,6,2014-08-02 10:20:56,2014-08-02,10
3,39026145,5499,1010,-62.0,,1,1406975994,6,2014-08-02 10:39:54,2014-08-02,10
4,39026145,5499,1010,-41.0,,2,1407080022,7,2014-08-03 15:33:42,2014-08-03,15


In [168]:
def gen_features(gen_features, df_transactions, df_mcc, df_tr):
    df_transactions_mcc = df_transactions.join(
        df_mcc.set_index('mcc_code'), 
        on='mcc_code'
    )
    
#     df_transactions_mcc_pivot = df_transactions_mcc.pivot_table(
#         index=['customer_id'], 
#         values='mcc_description', 
#         aggfunc=lambda x: ''.join(x)
#     )
        
    df_weekday_rations = \
        df_transactions.pivot_table(index=['customer_id'], columns='weekday', values='amount', 
                             aggfunc=np.size, fill_value=0)
    
    total = df_weekday_rations.sum(axis=1)
    df_weekday_rations.loc[:, 'saturday'] = (df_weekday_rations.iloc[:, 5:6].sum(axis=1).T/total).T
    df_weekday_rations.loc[:, 'sunday'] = (df_weekday_rations.iloc[:, 6:7 ].sum(axis=1).T/total).T
    df_weekday_rations.loc[:, 'friday'] = (df_weekday_rations.iloc[:, 4:5].sum(axis=1).T/total).T
    df_weekday_rations.loc[:, 'weekdays'] = (df_weekday_rations.iloc[:, :4].sum(axis=1).T/total).T
    
    df_features = df_gender.join(df_weekday_rations.loc[:, ['saturday', 'sunday', 'friday', 'weekdays']], on='customer_id', how='left')\
#                          .join(df_transactions_mcc, on='customer_id', how='left')\
#                                  .join(df_tr_counts, on='customer_id', how='left')\
#                                  .join(df_mcc_counts, on='customer_id', how='left')\
    df_features = df_features.merge(df_transactions_mcc, on='customer_id', how='left')
    return df_features

In [197]:
df_features = df_gender.pipe(
    gen_features, 
    df_transactions[:],    
    df_mcc, 
    df_transactions
)
df_features = df_features.drop(['dt', 'day', 'datetime', 'date', 'hour', 'weekday'], axis=1)
df_tmp = df_features.fillna(value=-1)
df_tmp['mcc_description'] = df_tmp['mcc_description'].apply(
        lambda x: x if x != -1 else ""
)
df_features = df_tmp

In [198]:
df_tmp.head()

Unnamed: 0,customer_id,gender,saturday,sunday,friday,weekdays,mcc_code,tr_type,amount,term_id,mcc_description
0,75562265,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,
1,10928546,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,
2,69348468,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,
3,84816985,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,
4,61009479,0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,


In [199]:
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import cross_val_score
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, rand

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


In [200]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, col_idx):
        self.col_idx = col_idx
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[:, self.col_idx]

In [201]:
class ColumnTextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, col_idx):
        self.col_idx = col_idx
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[:, self.col_idx]

In [202]:
label = 'gender'
RND_SEED = 123
idx_features = df_features.columns != label

In [203]:
X = df_features.loc[:, idx_features].values
y = df_features.loc[:, ~idx_features].values.flatten()

In [204]:
def run_trials_template(X, y, params, evals=100):

    def hyperopt_cv(X, y, params):
        
        X_ = X.copy()
        
        # Отделяем параметры лог регрессии в отдельный словарь
        lm_params = {}
        for k, v in params.items():
            if k.startswith('glob'):
                continue                
            elif k.startswith('lm'):
                lm_params[k.split('_', 1)[1]] = v
        
        # Задаем шкалирование
        if params['scaler_type'] == 'standart':
            scaler = StandardScaler(with_mean=params['scaler_centering'])
        else:
            assert params['scaler_type'] == 'robust'
            scaler = RobustScaler(with_centering=params['scaler_centering'])
        
        # Создаем лог рег с нужными параметрами
        clf = LogisticRegression(**lm_params)
        
        model = Pipeline([
            ('union', FeatureUnion(
                # Use FeatureUnion to combine the features from subject and body
                transformer_list=[
                # Pipeline for pulling features from the post's subject line
                    ('scaler', Pipeline([
                            ('selector', ColumnSelector(col_idx=[1,2,3,4])),
                            ('scaler', scaler),
                        ])
                    ),
#                     Pipeline for standard bag-of-words model
                    ('text', Pipeline([
                            ('selector', ColumnSelector(col_idx=9)),
                            ('tfidf', TfidfVectorizer(
                                ngram_range=(1,1),
                                max_df=0.95, 
                                min_df=5
                                )
                            ),
                        ])
                    ),
                ],# end  transformer_list
                # weight components in FeatureUnion
                transformer_weights={
                    'scaler': 0.8,
                    'text': 0.5,
                }) # end FeatureUnion
            ),# end union
            ('lr', clf),
        ]) # end pipeline
        
        # Схема кросс-валидации
        n_splits = 5
        cv = StratifiedKFold(
            n_splits=n_splits, shuffle=True, 
                random_state=RND_SEED)
        
        scores = cross_val_score(
            model, X_, y,
            scoring='roc_auc', 
            cv=cv, 
            n_jobs=-1)

        # Возвращаем среднее значение метрики и отклонение (на всякий случай)
        return scores.mean(), scores.std()

    def f(params):
        acc, std = hyperopt_cv(X, y, params)
        return {'loss': -acc, 'qscore': -acc, 'qscore_std': std, 'status': STATUS_OK}

    trials = Trials()
    best = fmin(
        f, 
        params, 
        algo=tpe.suggest, 
        max_evals=evals, 
        trials=trials, 
        verbose=10)
    
    return trials

In [205]:
# Задаем пространство поиска
space4_lm = {
    'lm_penalty': hp.choice('penalty', ['l2',]),
     'lm_penalty': hp.choice('penalty', ['l1', 'l2']),
     'lm_C': hp.loguniform('C', -5, 3),
    'lm_C': hp.loguniform('C', -1, 3),
     'lm_class_weight': hp.choice('class_weight', [None, 'balanced']),
    'lm_random_state': RND_SEED,
    'scaler_type': hp.choice('scaler_type', ['standart',]),
    'scaler_type': hp.choice('scaler_type', ['standart', 'robust']),
    'scaler_centering': hp.choice('scaler_centering', [False, ])
    'scaler_centering': hp.choice('scaler_centering', [False, True])
}

In [206]:
trials = run_trials_template(X, y, space4_lm, evals=40)

In [207]:
def trials_df(trials):
    '''
    Функция форматирует результаты hyperopt в dataframe
    '''
    tr_dict = []
    for t in trials:
        trial = dict()
        for k, v in t['misc']['vals'].items():
            trial[k] = v[0]

        trial['qscore'] = -t['result']['qscore']
        trial['qscore_std'] = -t['result']['qscore_std']
        tr_dict.append(trial)

    df_res = pd.DataFrame.from_dict(tr_dict)
    df_res = df_res.sort_values('qscore', ascending=False)
    
    return df_res

In [208]:
df_trials = trials_df(trials)

In [209]:
df_trials.head()

Unnamed: 0,C,penalty,qscore,qscore_std,scaler_centering,scaler_type
35,19.632461,0,0.795576,-0.005126,0,0
25,19.598867,0,0.795576,-0.005126,0,0
27,19.802187,0,0.795575,-0.005123,0,0
28,19.441176,0,0.795436,-0.00518,0,0
31,12.962256,0,0.795333,-0.004723,0,0
