In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm
pd.set_option('display.max_columns', 500)

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [2]:
raw_df = pd.read_csv('rosbank_train.csv')
raw_df['channel_type'] = raw_df['channel_type'].fillna('type0')
raw_df.drop(['target_sum'], axis=1,inplace=True)
raw_df['target_flag'].value_counts()
raw_df.info()
cl_ids_test = np.random.choice(raw_df.cl_id.unique(), size=1000, replace=False)
cl_ids_test_set = set(cl_ids_test)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 490513 entries, 0 to 490512
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   PERIOD        490513 non-null  object 
 1   cl_id         490513 non-null  int64  
 2   MCC           490513 non-null  int64  
 3   channel_type  490513 non-null  object 
 4   currency      490513 non-null  int64  
 5   TRDATETIME    490513 non-null  object 
 6   amount        490513 non-null  float64
 7   trx_category  490513 non-null  object 
 8   target_flag   490513 non-null  int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 33.7+ MB


In [3]:
# create transactions dataset for train
transactions_train = raw_df[~raw_df.cl_id.isin(cl_ids_test)].copy()
print("Total transactions in train dataset: ", len(transactions_train))
# create transactions dataset for test
transactions_test = raw_df[raw_df.cl_id.isin(cl_ids_test)].copy()
print("Total transactions in test dataset: ", len(transactions_test))

Total transactions in train dataset:  389856
Total transactions in test dataset:  100657


#### Working with currency rates

In [4]:
codes = pd.read_csv('./codes-all.csv')
codes = codes[['NumericCode','AlphabeticCode']]
codes.columns = ['currency','AlphabeticCode']
codes = codes.dropna().reset_index(drop=True)
codes['currency'] = codes['currency'].astype('int')
codes = codes.drop_duplicates(subset=['currency']).reset_index(drop=True)
print(codes.shape)

(249, 2)


In [5]:
transactions_train = pd.merge(transactions_train, codes, how='left', on = 'currency')
transactions_test = pd.merge(transactions_test, codes, how='left', on = 'currency')

In [6]:
curr = pd.read_csv('./curr.csv')
curr['CLOSE'] = curr['CLOSE'] / curr['NOMINAL']
curr = curr[['TICKER','DATE','CLOSE']]
curr.columns = ['AlphabeticCode','time','CLOSE']
curr['time'] = pd.to_datetime(curr['time'])

### Transaction amount into neg/pos values 

In [7]:
transactions_train['amount'] = -transactions_train['amount']
ind = (transactions_train['trx_category']=='C2C_IN') | (transactions_train['trx_category']=='DEPOSIT') | (transactions_train['trx_category']=='BACK_TRX')
transactions_train['amount'][ind] = abs(transactions_train['amount'][ind])

transactions_test['amount'] = -transactions_test['amount']
ind = (transactions_test['trx_category']=='C2C_IN') | (transactions_test['trx_category']=='DEPOSIT') | (transactions_test['trx_category']=='BACK_TRX')
transactions_test['amount'][ind] = abs(transactions_test['amount'][ind])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transactions_train['amount'][ind] = abs(transactions_train['amount'][ind])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transactions_test['amount'][ind] = abs(transactions_test['amount'][ind])


In [8]:
from sklearn.preprocessing import LabelEncoder
cat_cols = ['channel_type','trx_category']
for i in ['channel_type','trx_category']:
    le = LabelEncoder()
    le.fit( pd.concat([transactions_train[[i]], transactions_test[[i]]], axis=0) )
    transactions_train[i] = le.transform(transactions_train[i])
    transactions_test[i] = le.transform(transactions_test[i])

  return f(**kwargs)


In [9]:
dt_dict = {
    'JAN':'01',
    'FEB':'02',
    'MAR':'03',
    'APR':'04',
    'MAY':'05', 
    'JUN':'06',  
    'JUL':'07',
    'AUG':'08',
    'SEP':'09',
    'OCT':'10',
    'NOV':'11', 
    'DEC':'12',   
}

In [10]:
def dt_preprocess1(t):
    day = t[0:2]
    month = dt_dict[t[2:5]]
    year = t[5:7]
    
    hour = t[8:10]
    minute = t[11:13]
    second = t[14:]
    
    return '20'+str(year)+'-'+str(month)+'-'+str(day)

In [11]:
transactions_train['time'] = transactions_train['TRDATETIME'].apply(lambda x: dt_preprocess1(x))
transactions_train['time'] = pd.to_datetime(transactions_train['time'])
transactions_test['time'] = transactions_test['TRDATETIME'].apply(lambda x: dt_preprocess1(x))
transactions_test['time'] = pd.to_datetime(transactions_train['time'])

In [12]:
def dt_preprocess(t):
    day = t[0:2]
    month = dt_dict[t[2:5]]
    year = t[5:7]
    
    hour = t[8:10]
    minute = t[11:13]
    second = t[14:]
    
    return '20'+str(year)+'-'+str(month)+'-'+str(day)+' '+str(hour)+":"+str(minute)+':'+str(second)

In [13]:
transactions_train['TRDATETIME'] = transactions_train['TRDATETIME'].apply(lambda x: dt_preprocess(x))
transactions_train['TRDATETIME'] = pd.to_datetime(transactions_train['TRDATETIME'])
transactions_test['TRDATETIME'] = transactions_test['TRDATETIME'].apply(lambda x: dt_preprocess(x))
transactions_test['TRDATETIME'] = pd.to_datetime(transactions_test['TRDATETIME'])

In [14]:
transactions_train['month'] = transactions_train['TRDATETIME'].dt.month + (transactions_train['TRDATETIME'].dt.year-2016)*12
transactions_test['month'] = transactions_test['TRDATETIME'].dt.month + (transactions_test['TRDATETIME'].dt.year-2016)*12

transactions_train['days_since2015'] = (transactions_train['TRDATETIME'] - pd.to_datetime('2015-01-01')).dt.days
transactions_test['days_since2015'] = (transactions_test['TRDATETIME'] - pd.to_datetime('2015-01-01')).dt.days

In [15]:
transactions_test = pd.merge(transactions_test, curr, how='left', on=['AlphabeticCode','time'])
transactions_train = pd.merge(transactions_train, curr, how='left', on=['AlphabeticCode','time'])

In [16]:
transactions_train['CLOSE'] = transactions_train['CLOSE'].fillna(1)
transactions_train['amount'] = transactions_train['amount'] * transactions_train['CLOSE']
transactions_train = transactions_train.drop(['currency', 'AlphabeticCode', 'CLOSE','time'], axis=1)
transactions_test['CLOSE'] = transactions_test['CLOSE'].fillna(1)
transactions_test['amount'] = transactions_test['amount'] * transactions_test['CLOSE']
transactions_test = transactions_test.drop(['currency', 'AlphabeticCode', 'CLOSE','time'], axis=1)

### Создание DF по клиентам  + Фича времени в работе

In [17]:
def diff_monthes(x):
    a = np.max(x) - np.min(x)
    return a.days

sum_deals = transactions_train[['cl_id','TRDATETIME']].groupby('cl_id').agg(diff_monthes).reset_index()
sum_deals.columns = ['cl_id','num_days_in_use']
df_train =  sum_deals

sum_deals = transactions_test[['cl_id','TRDATETIME']].groupby('cl_id').agg(diff_monthes).reset_index()
sum_deals.columns = ['cl_id','num_days_in_use']
df_test =  sum_deals

### Разложение Weights of Evidence (WOE) фичи MCC кодов

In [18]:
def get_woe_v1(df_train, df_test, col, target_col):
    all_good = len(df_train[df_train[target_col] == 1][col])
    all_bad = len(df_train[df_train[target_col] == 0][col])
    odds_series = (
        df_train[df_train[target_col] == 1][col].value_counts()
        /
        df_train[df_train[target_col] == 0][col].value_counts()
    )
    odds_series = odds_series / all_good * all_bad
    category_woe_dict = np.log(odds_series).to_dict()
    df_train[col + '_woe'] = df_train[col].apply(category_woe_dict.get)
    df_test[col + '_woe'] = df_test[col].apply(category_woe_dict.get)
    return df_train, df_test

In [19]:
columns_to_get_counts = [
    'trx_category',
    'MCC',
    'channel_type'
]
    
for col_get_prob in columns_to_get_counts:
    transactions_train, transactions_test = get_woe_v1(transactions_train, transactions_test, col_get_prob, 'target_flag')

##### Средние значения по MCC и каналу привлечения

In [20]:
from scipy.stats import mode
def mmode(x):
    return mode(x)[0][0]

for i in tqdm(['trx_category_woe', 'MCC_woe', 'channel_type_woe']):
    temp = transactions_train[['cl_id',i]].groupby('cl_id').agg(mmode).reset_index()
    
    temp = pd.DataFrame(temp.values)
    new_cols = ['cl_id',i]

    temp.columns = new_cols
    df = temp
    df_train = pd.merge(df_train, temp, how='left', on='cl_id')
    
for i in tqdm(['trx_category_woe', 'MCC_woe', 'channel_type_woe']):
    temp = transactions_test[['cl_id',i]].groupby('cl_id').agg(mmode).reset_index()
    
    temp = pd.DataFrame(temp.values)
    new_cols = ['cl_id',i]

    temp.columns = new_cols
    df = temp
    df_test = pd.merge(df_test, temp, how='left', on='cl_id')

100%|████████████████████████████████████████████| 3/3 [00:01<00:00,  1.72it/s]
100%|████████████████████████████████████████████| 3/3 [00:00<00:00,  6.61it/s]


In [21]:
num_deals = transactions_train[['cl_id','amount']].groupby('cl_id').agg('count').reset_index()
num_deals.columns = ['cl_id','num_deals']
df_train = pd.merge(df_train, num_deals, how='left', on='cl_id')

sum_deals = transactions_train[['cl_id','amount']].groupby('cl_id').agg('sum').reset_index()
sum_deals.columns = ['cl_id','sum_deals']
df_train = pd.merge(df_train, sum_deals, how='left', on='cl_id')

In [22]:
num_deals = transactions_test[['cl_id','amount']].groupby('cl_id').agg('count').reset_index()
num_deals.columns = ['cl_id','num_deals']
df_test = pd.merge(df_test, num_deals, how='left', on='cl_id')

sum_deals = transactions_test[['cl_id','amount']].groupby('cl_id').agg('sum').reset_index()
sum_deals.columns = ['cl_id','sum_deals']
df_test = pd.merge(df_test, sum_deals, how='left', on='cl_id')

In [23]:
test_y = transactions_test[['cl_id','target_flag']].groupby('cl_id').agg('mean').reset_index()
test_y.columns = ['cl_id','target_flag']
df_test = pd.merge(df_test, test_y, how='left', on='cl_id')

train_y = transactions_train[['cl_id','target_flag']].groupby('cl_id').agg('mean').reset_index()
train_y.columns = ['cl_id','target_flag']
df_train = pd.merge(df_train, train_y, how='left', on='cl_id')

### Разложение категорий транзакций

In [24]:
to_agg = ['sum','mean','std','median','max','min','count']
for j in tqdm(pd.unique(transactions_test['trx_category'])):
    i = 'amount'
    temp = transactions_test[transactions_test['trx_category']==j][['cl_id',i]].groupby('cl_id').agg(to_agg).reset_index().fillna(0)
    new_cols = ['cl_id']
    for p in range(1,temp.columns.shape[0]):
        new_cols.append('trx_category'+'_'+str(j)+'_'+i+'_'+str(to_agg[p-1]))
    temp.columns = new_cols
    df_test = pd.merge(df_test, temp, how='left', on='cl_id')
    
for j in tqdm(pd.unique(transactions_train['trx_category'])):
    i = 'amount'
    temp = transactions_train[transactions_train['trx_category']==j][['cl_id',i]].groupby('cl_id').agg(to_agg).reset_index().fillna(0)
    new_cols = ['cl_id']
    for p in range(1,temp.columns.shape[0]):
        new_cols.append('trx_category'+'_'+str(j)+'_'+i+'_'+str(to_agg[p-1]))
    temp.columns = new_cols
    df_train = pd.merge(df_train, temp, how='left', on='cl_id')

100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 59.16it/s]
100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 43.10it/s]


### Работа с датами

In [25]:
df_train = df_train.fillna(0)
df_test = df_test.fillna(0)### Работа с датами

In [26]:
month = transactions_train[['cl_id','month']].groupby('cl_id').mean().reset_index()
month.columns = ['cl_id','month']
df_train = pd.merge(df_train, month, how='left', on='cl_id')

month = transactions_test[['cl_id','month']].groupby('cl_id').mean().reset_index()
month.columns = ['cl_id','month']
df_test = pd.merge(df_test, month, how='left', on='cl_id')

days_since2015 = transactions_train[['cl_id','days_since2015']].groupby('cl_id').mean().reset_index()
days_since2015.columns = ['cl_id','days_since2015']
df_train = pd.merge(df_train, days_since2015, how='left', on='cl_id')

days_since2015 = transactions_test[['cl_id','days_since2015']].groupby('cl_id').mean().reset_index()
days_since2015.columns = ['cl_id','days_since2015']
df_test = pd.merge(df_test, days_since2015, how='left', on='cl_id')

### Работа над разложением MCC кодов на фичи и их разложение на главные компоненты

for j in tqdm(pd.unique(transactions_train['MCC'])):
    i = 'amount'
    temp = transactions_train[transactions_train['MCC']==j][['cl_id',i]].groupby('cl_id').agg('sum').reset_index().fillna(0)
    new_cols = ['cl_id']
    for p in range(1,temp.columns.shape[0]):
        new_cols.append('MCC'+'_'+str(j)+'_'+i+'_'+str(to_agg[p-1]))
    temp.columns = new_cols
    df_train = pd.merge(df_train, temp, how='left', on='cl_id')

for j in tqdm(pd.unique(transactions_test['MCC'])):
    i = 'amount'
    temp = transactions_test[transactions_test['MCC']==j][['cl_id',i]].groupby('cl_id').agg('sum').reset_index().fillna(0)
    new_cols = ['cl_id']
    for p in range(1,temp.columns.shape[0]):
        new_cols.append('MCC'+'_'+str(j)+'_'+i+'_'+str(to_agg[p-1]))
    temp.columns = new_cols
    df_test = pd.merge(df_test, temp, how='left', on='cl_id')

df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

df_train = df_train[df_train.columns & df_test.columns]
df_test = df_train[df_train.columns & df_test.columns]

In [27]:
corr_matrix = df_train.corr()
corr = corr_matrix["target_flag"].sort_values(ascending=False)
corr

target_flag                    1.000000
channel_type_woe               0.359781
num_days_in_use                0.271573
num_deals                      0.250341
trx_category_6_amount_count    0.239438
                                 ...   
trx_category_7_amount_min     -0.066198
trx_category_9_amount_min     -0.086605
trx_category_7_amount_sum     -0.089278
trx_category_5_amount_count   -0.106894
cl_id                         -0.388435
Name: target_flag, Length: 80, dtype: float64

### Scaling (не помогло)

from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
df_train_scale = scale.fit_transform(df_train.drop(['target_flag'],axis=1))
df_test_scale = scale.fit_transform(df_test.drop(['target_flag'],axis=1))
X_train = StandardScaler().fit_transform(df_train.drop(['target_flag'],axis=1))
X_test = StandardScaler().fit_transform(df_test.drop(['target_flag'],axis=1))

### LogisticRegression

In [28]:
X_train = df_train.drop(['target_flag'],axis=1)
y_train = df_train['target_flag']
#X = X.to_numpy()

log = LogisticRegression(max_iter=10000)

#### Попробуй запусти на 2.5 часа

kf = KFold(len(y),n_splits=10,shuffle=False)
y_pred = y.copy()

for train_index, test_index in tqdm(kf.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index]
        
    log.fit(X_train,y_train)
    
    solver = ['liblinear']
    penalty = ['l1', 'l2']
    C = np.logspace(0, 4, 10)
    hyperparameters = dict(C=C, penalty=penalty,solver=solver)
        
    clf = GridSearchCV(log, hyperparameters, cv=10, verbose=0)
        
    best_model = clf.fit(X_train, y_train)

    y_pred[test_index] = best_model.predict(X_test)

penalty = ['l2']
C = np.logspace(0, 4, 10)
hyperparameters = dict(C=C, penalty=penalty)
        
clf = GridSearchCV(log, hyperparameters, cv=10, verbose=0)
        
best_model = clf.fit(X_train, y_train)

y_pred[test_index] = best_model.predict(X_test)

print(accuracy_score(y,y_pred))
print(f1_score(y,y_pred,average='weighted'))
print(precision_score(y,y_pred,average='weighted'))
print(recall_score(y,y_pred,average='weighted'))
print(r2_score(y,y_pred))

### Lasso LogisticRegression

In [39]:
X_train = df_train.drop(['target_flag'],axis=1)
y_train = df_train['target_flag']
X_test = df_test.drop(['target_flag'],axis=1)
y = df_test['target_flag']

In [33]:
log = LogisticRegression( penalty='l1', solver='liblinear')
log.fit(X_train,y_train)

LogisticRegression(penalty='l1', solver='liblinear')

In [34]:
y_pred_Lasso = log.predict(X_test)

In [35]:
accuracy_score(df_test[['target_flag']], y_pred_Lasso)

0.733

In [36]:
r2_score(df_test[['target_flag']], y_pred_Lasso)

-0.07046635448072336

In [37]:
average_precision_score = average_precision_score(df_test[['target_flag']], y_pred_Lasso)
average_precision_score

0.6865872500740783

In [40]:
print(accuracy_score(y,y_pred_Lasso))
print(f1_score(y,y_pred_Lasso,average='weighted'))
print(precision_score(y,y_pred_Lasso,average='weighted'))
print(recall_score(y,y_pred_Lasso,average='weighted'))
print(r2_score(y,y_pred_Lasso))

0.733
0.7327037131882203
0.7328324775792427
0.733
-0.07046635448072336


### SVC

In [41]:
from sklearn.svm import SVC
from sklearn import svm

In [42]:
clf = svm.SVC()

In [43]:
clf.fit(df_train.drop(['cl_id'], axis=1), df_train['cl_id'])

SVC()

svc = SVC(kernel="poly", degree=3, coef0=1, C=5)
svc.fit(df_train.drop(['cl_id'], axis=1), df_train['cl_id'])

In [44]:
y_pred_CVM = clf.predict(df_test.drop(['cl_id'], axis=1))

In [45]:
y_pred_CVM

array([ 8711,  6261,  6706,  7733,   722,  7573,  4480,  6921,  7958,
        8568,  9658,  9920,  1077,  6337,  4366,  1223,  6300,  7494,
        2312,   498,  2096,  4292,  6576,  5234,  3934,  7984,  9679,
          59,  5650,  2237,  3729,  4028,  2285,  3863,  3228,  4684,
        6605,  4107,  6984,  2237,  2854,  2381,  6246,  3822,  4348,
        5977,  5650,   722,  3925,  7956,  6250,  1113,  2066,  1645,
         836,  1441,  2238,  9930,   794,  1086,  2963,  2508,  3159,
         836,  4280,  4280,  4444,  1553,   836,  4468,  2666,  9224,
        2396,   750,  6674,   789,  2224,  1539,  2191,  5237,  2776,
        2224,  2540,  2688,   927,  2693,  5261,  9608,  2380,  9403,
         948,  2066,   836,  5122,  6281,  4862,  7285,  7733,   836,
        6469,  5261,  2188,  8475,  2188,  7742,  1539,  3423,  5438,
        2098,  2383,  2452,  4222,   876,  7256,  2339,  2486,  4731,
          61,  2486,  4329,  4261,  2191,  1213,  1434,   960, 10062,
        1583,  8234,

print(accuracy_score(y,y_pred_CVM))
print(f1_score(y,y_pred_CVM,average='weighted'))
print(precision_score(y,y_pred_CVM,average='weighted'))
print(recall_score(y,y_pred_CVM,average='weighted'))
print(r2_score(y,y_pred_CVM))

In [578]:
rbf_kernel_svm_clf = SVC(kernel="rbf", gamma=5, C=0.001)
rbf_kernel_svm_clf.fit(X_train, y_train)

SVC(C=0.001, gamma=5)

In [None]:
y_pred_rbf = clf.predict(X_test)

In [None]:
y_pred_rbf

## Polynomial kernel

In [34]:
df_train

Unnamed: 0,cl_id,num_days_in_use,trx_category_woe,MCC_woe,channel_type_woe,num_deals,sum_deals,target_flag,trx_category_6_amount_sum,trx_category_6_amount_mean,trx_category_6_amount_std,trx_category_6_amount_median,trx_category_6_amount_max,trx_category_6_amount_min,trx_category_6_amount_count,trx_category_5_amount_sum,trx_category_5_amount_mean,trx_category_5_amount_std,trx_category_5_amount_median,trx_category_5_amount_max,trx_category_5_amount_min,trx_category_5_amount_count,trx_category_2_amount_sum,trx_category_2_amount_mean,trx_category_2_amount_std,trx_category_2_amount_median,trx_category_2_amount_max,trx_category_2_amount_min,trx_category_2_amount_count,trx_category_1_amount_sum,trx_category_1_amount_mean,trx_category_1_amount_std,trx_category_1_amount_median,trx_category_1_amount_max,trx_category_1_amount_min,trx_category_1_amount_count,trx_category_9_amount_sum,trx_category_9_amount_mean,trx_category_9_amount_std,trx_category_9_amount_median,trx_category_9_amount_max,trx_category_9_amount_min,trx_category_9_amount_count,trx_category_4_amount_sum,trx_category_4_amount_mean,trx_category_4_amount_std,trx_category_4_amount_median,trx_category_4_amount_max,trx_category_4_amount_min,trx_category_4_amount_count,trx_category_8_amount_sum,trx_category_8_amount_mean,trx_category_8_amount_std,trx_category_8_amount_median,trx_category_8_amount_max,trx_category_8_amount_min,trx_category_8_amount_count,trx_category_0_amount_sum,trx_category_0_amount_mean,trx_category_0_amount_std,trx_category_0_amount_median,trx_category_0_amount_max,trx_category_0_amount_min,trx_category_0_amount_count,trx_category_7_amount_sum,trx_category_7_amount_mean,trx_category_7_amount_std,trx_category_7_amount_median,trx_category_7_amount_max,trx_category_7_amount_min,trx_category_7_amount_count,trx_category_3_amount_sum,trx_category_3_amount_mean,trx_category_3_amount_std,trx_category_3_amount_median,trx_category_3_amount_max,trx_category_3_amount_min,trx_category_3_amount_count,month,days_since2015
0,0,53,0.014712,-0.676699,-0.185776,5,-24383.000000,0,-7821.000000,-2607.000000,2185.684332,-2031.0,-767.00,-5023.00,3,20000.0,20000.000000,0.000000,20000.0,20000.0,20000.0,1.0,-36562.0,-36562.0,0.00000,-36562.0,-36562.0,-36562.0,1.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.400000,1031.800000
1,11,91,0.014712,-0.152487,-0.185776,217,-69407.585606,0,-251407.585606,-1214.529399,4067.684658,-303.0,-4.00,-38155.00,207,182000.0,18200.000000,19164.782516,15000.0,60000.0,1000.0,10.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.516129,943.202765
2,14,92,0.014712,0.392287,-0.185776,136,-143290.170000,1,-258990.170000,-1962.046742,9329.198216,-346.5,-25.00,-104521.00,132,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,152700.0,50900.0,48576.022892,40000.0,104000.0,8700.0,3.0,-37000.0,-37000.000000,0.000000,-37000.0,-37000.0,-37000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.713235,918.529412
3,21,88,0.014712,-0.152487,-0.185776,124,-101325.650000,0,-220103.650000,-1947.819912,4335.164033,-580.0,-29.00,-28400.00,113,112000.0,14000.000000,6590.035768,11500.0,27000.0,7000.0,8.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,4001.0,2000.5,2827.720018,2000.5,4000.0,1.0,2.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,2777.0,2777.0,0.0,2777.0,2777.0,2777.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.080645,714.362903
4,22,66,0.014712,-0.006393,-0.185776,59,-4567.000000,0,-110267.000000,-2450.377778,4737.401293,-364.0,-20.58,-17985.50,45,122600.0,15325.000000,20145.595052,10500.0,61500.0,100.0,8.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,-3500.0,-3500.000000,0.000000,-3500.0,-3500.0,-3500.0,1.0,-500.0,-250.0,0.0,-250.0,-250.0,-250.0,2.0,-9900.0,-4950.0,4171.930009,-4950.0,-2000.0,-7900.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3000.0,-3000.0,0.0,-3000.0,-3000.0,-3000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.169492,871.237288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,10207,92,0.014712,-0.152487,-0.576404,371,936176.366413,1,-398773.433587,-1242.284840,3478.613580,-455.0,-1.01,-39194.66,321,1546000.0,36809.523810,32619.377822,26000.0,137000.0,2500.0,42.0,-1391.0,-1391.0,0.00000,-1391.0,-1391.0,-1391.0,1.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,-210000.0,-35000.000000,8366.600265,-40000.0,-20000.0,-40000.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,340.8,340.8,0.0,340.8,340.8,340.8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.460916,972.460916
3996,10210,87,0.014712,-0.152487,-0.576404,114,-129513.590000,1,-70713.590000,-686.539709,1531.553563,-242.0,-9.00,-12528.00,103,55000.0,18333.333333,2886.751346,20000.0,20000.0,15000.0,3.0,-81800.0,-16360.0,16023.35795,-19000.0,-1000.0,-40000.0,5.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,-32000.0,-10666.666667,12503.332889,-5000.0,-2000.0,-25000.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.640351,705.684211
3997,10212,70,-0.813220,-0.330176,-0.576404,14,-160900.000000,0,-259900.000000,-37128.571429,23031.479493,-29500.0,-5000.00,-75400.00,7,99000.0,14142.857143,16777.252173,10000.0,50000.0,2000.0,7.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.000000,899.357143
3998,10213,84,0.014712,-0.152487,-0.576404,74,-8387.580000,0,-336887.580000,-5433.670645,8165.642901,-1165.0,-90.00,-30103.81,62,214500.0,26812.500000,11952.099337,27500.0,40000.0,6000.0,8.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,120000.0,40000.0,0.000000,40000.0,40000.0,40000.0,3.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6000.0,-6000.0,0.000000,-6000.0,-6000.0,-6000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.662162,887.067568


In [35]:
df_test

Unnamed: 0,cl_id,num_days_in_use,trx_category_woe,MCC_woe,channel_type_woe,num_deals,sum_deals,target_flag,trx_category_6_amount_sum,trx_category_6_amount_mean,trx_category_6_amount_std,trx_category_6_amount_median,trx_category_6_amount_max,trx_category_6_amount_min,trx_category_6_amount_count,trx_category_5_amount_sum,trx_category_5_amount_mean,trx_category_5_amount_std,trx_category_5_amount_median,trx_category_5_amount_max,trx_category_5_amount_min,trx_category_5_amount_count,trx_category_9_amount_sum,trx_category_9_amount_mean,trx_category_9_amount_std,trx_category_9_amount_median,trx_category_9_amount_max,trx_category_9_amount_min,trx_category_9_amount_count,trx_category_2_amount_sum,trx_category_2_amount_mean,trx_category_2_amount_std,trx_category_2_amount_median,trx_category_2_amount_max,trx_category_2_amount_min,trx_category_2_amount_count,trx_category_0_amount_sum,trx_category_0_amount_mean,trx_category_0_amount_std,trx_category_0_amount_median,trx_category_0_amount_max,trx_category_0_amount_min,trx_category_0_amount_count,trx_category_8_amount_sum,trx_category_8_amount_mean,trx_category_8_amount_std,trx_category_8_amount_median,trx_category_8_amount_max,trx_category_8_amount_min,trx_category_8_amount_count,trx_category_7_amount_sum,trx_category_7_amount_mean,trx_category_7_amount_std,trx_category_7_amount_median,trx_category_7_amount_max,trx_category_7_amount_min,trx_category_7_amount_count,trx_category_1_amount_sum,trx_category_1_amount_mean,trx_category_1_amount_std,trx_category_1_amount_median,trx_category_1_amount_max,trx_category_1_amount_min,trx_category_1_amount_count,trx_category_4_amount_sum,trx_category_4_amount_mean,trx_category_4_amount_std,trx_category_4_amount_median,trx_category_4_amount_max,trx_category_4_amount_min,trx_category_4_amount_count,trx_category_3_amount_sum,trx_category_3_amount_mean,trx_category_3_amount_std,trx_category_3_amount_median,trx_category_3_amount_max,trx_category_3_amount_min,trx_category_3_amount_count,month,days_since2015
0,1,92,0.014712,0.392287,-0.185776,104,-116766.861687,0,-161766.861687,-1601.652096,6074.327093,-400.00,-6.00,-60000.00,101,95000.0,47500.000000,3535.533906,47500.0,50000.0,45000.0,2.0,-50000.0,-50000.000000,0.000000,-50000.0,-50000.0,-50000.0,1.0,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00,0.00,0.000000,0.00,0.00,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.951923,989.298077
1,5,92,0.014712,-0.152487,-0.185776,142,-261387.370511,1,-293498.990511,-2644.135050,5881.977894,-1250.00,-8.20,-56164.05,111,130000.0,32500.000000,5000.000000,30000.0,40000.0,30000.0,4.0,-66500.0,-4750.000000,4353.380649,-2750.0,-1000.0,-15000.0,14.0,-33878.38,-4234.797500,2762.985251,-4000.0,-1000.0,-10000.0,8.0,13990.00,13990.00,0.000000,13990.00,13990.00,13990.00,1.0,-6000.0,-3000.0,2828.427125,-3000.0,-1000.0,-5000.0,2.0,-5500.0,-2750.0,3181.980515,-2750.0,-500.0,-5000.0,2.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.661972,854.950704
2,9,89,0.014712,0.234269,-0.185776,39,-245415.090000,0,-12365.090000,-426.382414,1000.336429,-100.00,-39.00,-5000.00,29,5000.0,5000.000000,0.000000,5000.0,5000.0,5000.0,1.0,-435000.0,-87000.000000,14404.860291,-85000.0,-75000.0,-110000.0,5.0,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00,0.00,0.000000,0.00,0.00,0.00,0.0,-100000.0,-100000.0,0.000000,-100000.0,-100000.0,-100000.0,1.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,296950.0,98983.333333,16227.317503,92500.0,117450.0,87000.0,3.0,0.00,0.000000,0.000000,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.974359,896.666667
3,10,89,0.014712,-0.152487,-0.185776,463,-251743.990000,0,-200956.110000,-537.315802,988.278994,-230.75,-3.00,-11700.00,374,426300.0,17762.500000,11378.729680,15500.0,35000.0,1000.0,24.0,-422300.0,-10828.205128,33074.476952,-1000.0,-100.0,-150000.0,39.0,-21692.90,-1141.731579,3204.689372,-370.0,-100.0,-14342.9,19.0,0.00,0.00,0.000000,0.00,0.00,0.00,0.0,-5400.0,-1800.0,1915.724406,-900.0,-500.0,-4000.0,3.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,10000.0,10000.000000,0.000000,10000.0,10000.0,10000.0,1.0,-37694.98,-12564.993333,1356.542738,-13290.0,-11000.0,-13404.98,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.976242,956.423326
4,20,73,0.014712,-0.255908,-0.185776,77,-7308.090000,0,-222308.090000,-3004.163378,4762.677065,-1229.00,-68.00,-23583.00,74,215000.0,71666.666667,10408.329997,75000.0,80000.0,60000.0,3.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00,0.00,0.000000,0.00,0.00,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.363636,850.935065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,10166,68,0.014712,0.354610,-0.576404,70,-2406.111024,1,-197406.111024,-2946.359866,5875.350324,-839.00,-5.60,-28634.00,67,220000.0,110000.000000,14142.135624,110000.0,120000.0,100000.0,2.0,-25000.0,-25000.000000,0.000000,-25000.0,-25000.0,-25000.0,1.0,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00,0.00,0.000000,0.00,0.00,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.071429,721.657143
996,10181,89,0.014712,-0.152487,-0.576404,177,-95906.160962,0,-221286.110962,-1349.305555,2035.481252,-789.70,-4.57,-15819.00,164,130850.0,14538.888889,16747.047833,5000.0,50000.0,1000.0,9.0,-5500.0,-1833.333333,1892.969449,-1000.0,-500.0,-4000.0,3.0,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,29.95,29.95,0.000000,29.95,29.95,29.95,1.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.225989,963.423729
997,10184,90,0.014712,-0.255908,-0.576404,46,-797422.110000,1,-430956.960000,-14860.584828,17186.718752,-7065.80,-773.26,-66049.28,29,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,-445000.0,-37083.333333,34736.562695,-20000.0,-5000.0,-100000.0,12.0,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,78534.85,15706.97,22866.258863,3942.00,55311.05,1062.00,5.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.195652,869.630435
998,10190,2,0.289747,-0.006393,-0.576404,6,-941620.000000,0,-1620.000000,-1620.000000,0.000000,-1620.00,-1620.00,-1620.00,1,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,-940000.0,-188000.000000,21389.249636,-195000.0,-150000.0,-200000.0,5.0,0.00,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00,0.00,0.000000,0.00,0.00,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00,0.000000,0.000000,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.000000,846.000000


In [28]:
from sklearn.svm import SVC

In [29]:
poly_kernel_svm = SVC(kernel="poly")

In [30]:
poly_kernel_svm.fit(df_train.drop(['cl_id'], axis=1), df_train['cl_id'])

SVC(kernel='poly')

In [32]:
y_pred_poly_kernel = poly_kernel_svm.predict(df_test.drop(['cl_id'], axis=1))

In [33]:
y_pred_poly_kernel

array([ 7415,  1100,  6917,  9925,  6261,  9066,   722,  5650,  9565,
        7721,  5224,  9509,  2493,   789,  9619,   545,  8337,  1665,
        1441,  5082,  3366,  3956,  7862,  2260,    91,  4382,  2878,
        9780,  6483,  5781,  4292,  2752,  3487,   903,  9780,  7715,
        5729,  4260,  6984,  7345,  8552,  3228,  8553,  2285,  6709,
        3351,  5798,  3228,  4382,  9241,  3448,  4747,  2237,  6682,
        1772,  4428,   216,  9489,  5650,  4348,   878,   343,  3366,
         773,  2383,  1622,   699,  2383,  5439,  5438,  1408,  3366,
        3108,   461,  7861,  1806,  3094,  7014,  7596,  1539,   885,
        2100,  5439,    81,   789,  2956,   750,   773,  3366,  9477,
        4788,  7949,  2647,   635,  2143,  7014,  3851,  4659, 10041,
        6674,  2066,   635,  2540,  6389,  8119,  1517,  7285,   357,
        5439,  3094,  7596,  3094,  2676,  9977,  9977,  5547,  4149,
        4689,   722,  1622,  4456,  2216,   219,  4565,  5068,  6885,
        4479,   804,

#### LGBModel

In [411]:
from sklearn.model_selection import RepeatedStratifiedKFold
import lightgbm as lgb
skf = RepeatedStratifiedKFold(n_splits=8, n_repeats=5, random_state=201805)

In [412]:
lgb_data_train = lgb.Dataset(df_train.drop(['target_flag'],axis=1), 
                             df_train['target_flag'], 
                             free_raw_data=False
                            )

params = {
    'objective':'binary', 
    'metric': 'auc',
    'learning_rate': 0.01, 
    'random_state':4242442,
    'subsample':0.33,
    'class_weight':'balanced',
    'colsample_bytree':0.33,
    'reg_lambda':4
}

h = lgb.cv(params, 
           lgb_data_train, 
           num_boost_round=10000,
           early_stopping_rounds=50, 
           verbose_eval=10, 
           folds=skf.split(df_train, df_train['target_flag'])
          )

[LightGBM] [Info] Number of positive: 1918, number of negative: 1582
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13113
[LightGBM] [Info] Number of data points in the train set: 3500, number of used features: 72
[LightGBM] [Info] Number of positive: 1918, number of negative: 1582
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13113
[LightGBM] [Info] Number of data points in the train set: 3500, number of used features: 72
[LightGBM] [Info] Number of positive: 1918, number of negative: 1582
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13113
[LightGBM] [Info] Number of data points in the train set: 3500, number of used features: 72
[LightGBM] [Info] Number of positive: 1918, number of negative: 1582
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force

[LightGBM] [Info] Number of positive: 1918, number of negative: 1582
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13113
[LightGBM] [Info] Number of data points in the train set: 3500, number of used features: 72
[LightGBM] [Info] Number of positive: 1918, number of negative: 1582
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13113
[LightGBM] [Info] Number of data points in the train set: 3500, number of used features: 72
[LightGBM] [Info] Number of positive: 1918, number of negative: 1582
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13113
[LightGBM] [Info] Number of data points in the train set: 3500, number of used features: 72
[LightGBM] [Info] Number of positive: 1918, number of negative: 1582
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13113
[LightGBM] [

[LightGBM] [Info] Number of positive: 1918, number of negative: 1582
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13113
[LightGBM] [Info] Number of data points in the train set: 3500, number of used features: 72
[LightGBM] [Info] Number of positive: 1918, number of negative: 1582
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13113
[LightGBM] [Info] Number of data points in the train set: 3500, number of used features: 72
[LightGBM] [Info] Number of positive: 1918, number of negative: 1582
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13113
[LightGBM] [Info] Number of data points in the train set: 3500, number of used features: 72
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.548000 -> initscore=0.192593
[LightGBM] [Info] Start trainin

KeyboardInterrupt: 

## Домашняя работа 2

1. **Обязательная часть**: на том же наборе данных провести feature engineering:
    - создать Weights of Evidence (WOE) фичи на тех переменных, где категорий большое количество
    - cделать one hot encoding там, где необходимо
    - по желанию посчитать каунты (counts)
    
2. **Обязательная часть**: обучить linear SVM и SVM с нелинейным ядром - сравнить время обучения и результат метрики оценки качества (ROC AUC) двух алгоритмов между собой и сравнить с линейной моделью (лог.рег)