In [3]:
import pandas as pd
import numpy as np

In [4]:
import os

filelist = os.listdir()

src_applications = []
src_others = []
src_money = []
src_delays = []

for i in filelist:
    try:
        tmp = pd.read_csv(i)
    except:
        continue
    if 'applications' in i:
        src_applications.append(tmp)
    elif 'others' in i:
        src_others.append(tmp)
    elif 'money' in i:
        src_money.append(tmp)
    elif 'delays' in i:
        src_delays.append(tmp)

In [5]:
def compare_two_sets(df1, df2): #df1 - первый датасет, df2 - следующий за ним
    for row in range(len(df1)):
#         print(df1.loc[row, 'client_id'])
        next_pol = list(df2.client_id.values)
        if df1.loc[row, 'client_id'] not in next_pol:
            df1.drop(row, inplace=True)
    df1 = df1.reset_index(drop=True) 
    return df1

In [6]:
def drop_useless(df):
    try:
        del df['avg_ovd_amount_6m']
    except:
        pass
    
    try:
        del df['bank_ki_flg']
    except:
        pass
    
    try:
        del df['client_id']
    except:
        pass
    try:
        del df['outstanding']
    except:
        pass

In [7]:
def generate_data(src_df):
    final = []
    for i in range(len(src_df)-1):
        cur = src_df[i]
        nex = src_df[i+1]
        cols = list(nex.columns)
        cols.remove('cnt_opened_6m')
        cols.remove('client_id')

        temp_data = compare_two_sets(cur, nex)
        del temp_data['cnt_opened_6m']
        nex.drop(columns=cols, inplace=True)
        temp_data = temp_data.join(nex.set_index('client_id'), on='client_id')
        drop_useless(temp_data)
        final.append(temp_data)
        
    fmon = final[0]
    for i in range(len(final)):
        fmon = pd.concat([fmon, final[i]])
        
    fmon.dropna(how='any', inplace=True)
    return fmon

In [8]:
src_applications = []
src_others = []
src_money = []
src_delays = []

for i in filelist:
    try:
        tmp = pd.read_csv(i)
    except:
        continue
    if 'applications' in i:
        src_applications.append(tmp)
    elif 'others' in i:
        src_others.append(tmp)
    elif 'money' in i:
        src_money.append(tmp)
    elif 'delays' in i:
        src_delays.append(tmp)

In [9]:
money = generate_data(src_money)
others = generate_data(src_others)
delays = generate_data(src_delays)
applications = generate_data(src_applications)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor

In [11]:
def get_tree_model(df):
    X = df.drop(columns=['cnt_opened_6m'])
    y = df['cnt_opened_6m']
    df_x_train, df_x_test, df_y_train, df_y_test = train_test_split(X, y, test_size=0.05, random_state=42)
    
    model = DecisionTreeRegressor()
    model.fit(df_x_train, df_y_train)
    pred = model.predict(df_x_test)

    rmse = np.sqrt(mean_squared_error(df_y_test,pred))
    r2 = r2_score(df_y_test,pred)
    print(f'rmse = {rmse}')
    print(f'r2 = {r2}')
    return model

In [12]:
cols_for_money = [ 'avg_utilization_6m','sum_dep_income_6month','sum_dep_expense_6month','sum_sal_acc_inc_6month_amt_rur']
    
cols_for_others = ['mrtg_open', 'cl_open', 'auto_open', 'card_open', 'micro_open', 'other_open']
    
cols_for_applications = ['app_cc_cnt','app_potreb_cnt','app_mrtg_cnt']
    
cols_for_delays = ['bank_1_29_5y_debtor','bank_30_59_5y_debtor','bank_60_89_5y_debtor','bank_90_119_5y_debtor','bank_120plus_5y_debtor']

common = ['age','gender','cnt_closed','length_total', 'total_curr_payment_rkk','app_real_estate_ind','app_vehicle_ind','cnt_inquiry_last6m', 'ratio_cl_closed']

In [13]:
# эффективность svm = 20 %

# models = {
#     'money' : get_svm_model(money),
#     'others' : get_svm_model(others),
#     'delays' : get_svm_model(delays),
#     'applications' : get_svm_model(applications)
# }

In [14]:
models = {
    'money' : get_tree_model(money),
    'others' : get_tree_model(others),
    'delays' : get_tree_model(delays),
    'applications' : get_tree_model(applications)
}

rmse = 0.0
r2 = 1.0
rmse = 0.0
r2 = 1.0
rmse = 0.1767766952966369
r2 = 0.8929682017189688
rmse = 0.05892556509887896
r2 = 0.9881075779687744


In [15]:
def assess(line):
    taken = line['cnt_opened_6m']
#     print(f'taken = {taken}')
    line = line.to_dict()
    line.pop('client_id')
    line.pop('report_date')
    line.pop('cnt_opened_6m')
    line = pd.Series(line)
    measurements = []
    analysis_flag = False
    variants = dict()
    mcheck = line[cols_for_money+common].isna().sum()
    if mcheck == 0:
        msrc = line[cols_for_money+common]
        analysis_flag = True
        variants['money']=variants.get('money', msrc)
        
    ocheck = line[cols_for_others+common].isna().sum()
    if ocheck == 0:
        osrc = line[cols_for_others+common]
        analysis_flag = True
        variants['others']=variants.get('others', osrc)
        
    acheck = line[cols_for_applications+common].isna().sum()
    if acheck == 0:
        asrc = line[cols_for_applications+common]
        analysis_flag = True
        variants['applications']=variants.get('applications', asrc)
        
    dcheck = line[cols_for_delays+common].isna().sum()
    if dcheck == 0:
        dsrc = line[cols_for_delays+common]
        analysis_flag = True
        variants['delays']=variants.get('delays', dsrc)
    
    if variants == dict():
        return 0
    else:
        for i in list(variants.keys()):
            measurements.append(models[i].predict(np.array(variants[i]).reshape(1, -1)))
            break
    return np.mean(measurements)

In [16]:
df = pd.read_csv('test.csv')

In [17]:
df.dropna(how='any', inplace=True)
df.reset_index(inplace=True)
del df['index']
df

Unnamed: 0,client_id,report_date,avg_ovd_amount_6m,avg_utilization_6m,total_num_cards,cnt_active,last6m_opened,last6m1y_opened,sum_dep_income_1month,sum_dep_expense_1month,sum_dep_income_6month,sum_dep_expense_6month,gender,app_real_estate_ind,app_vehicle_ind,app_cc_cnt,app_potreb_cnt,app_mrtg_cnt,app_lst_request_m_term,sum_sal_acc_inc_6month_amt_rur,cnt_opened,mrtg_open,cl_open,auto_open,card_open,micro_open,other_open,cnt_opened_6m,cnt_opened_1y,cnt_closed,mrtg_closed,cl_closed,auto_closed,card_closed,micro_closed,other_closed,length_bank,length_bank_total,length_total,outstanding,cnt_inquiry_last6m,total_curr_payment_rkk,bank_ki_flg,age,ratio_cl_closed,ratio_cl_closed_gr100k,bank_1_29_5y_debtor,bank_30_59_5y_debtor,bank_60_89_5y_debtor,bank_90_119_5y_debtor,bank_120plus_5y_debtor
0,7808,2019-11-01,0.0,0.070,2.0,1.0,1.0,0.0,1.5402e+04,1.5402e+04,1.6092e+06,1.5938e+06,1.0,1.0,0.0,10.0,3.0,2.0,1.290,5.8634e+05,4.0,1.0,1.0,0.0,2.0,0.0,0.0,2.0,2.0,8.0,1.0,2.0,4.0,1.0,0.0,0.0,105.4292,105.4292,112.8542,1.6793e+06,35.0,34162.0994,1.0,42.0780,0.5000,1.0000,0.0,0.0,0.0,0.0,0.0
1,6760,2020-09-01,0.0,0.000,6.0,3.0,1.0,0.0,2.8346e+02,0.0000e+00,1.8436e+06,1.8125e+06,0.0,0.0,0.0,3.0,3.0,0.0,1.387,7.9877e+05,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,9.0,0.0,7.0,0.0,2.0,0.0,0.0,98.9897,92.3532,128.1971,1.5137e+06,3.0,41650.1014,1.0,31.6237,0.7143,1.0000,0.0,0.0,0.0,0.0,0.0
2,8750,2019-03-01,0.0,0.000,2.0,1.0,1.0,0.0,3.4155e+03,0.0000e+00,9.5746e+05,6.7599e+05,1.0,0.0,0.0,0.0,2.0,0.0,6.129,4.8854e+05,2.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,4.7310,4.7310,54.5708,1.1078e+06,0.0,29673.7819,1.0,29.0000,1.0000,1.0000,0.0,0.0,0.0,0.0,0.0
3,8192,2020-04-01,0.0,0.043,7.0,3.0,0.0,2.0,9.8332e+05,9.8214e+05,1.0594e+06,2.1242e+06,1.0,0.0,0.0,0.0,1.0,0.0,20.484,3.7412e+05,2.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,5.0,0.0,2.0,0.0,3.0,0.0,0.0,124.7803,119.4908,145.1828,5.5056e+03,1.0,275.2810,1.0,43.8763,0.5000,1.0000,0.0,0.0,0.0,0.0,0.0
4,8562,2020-04-01,0.0,0.133,7.0,3.0,2.0,1.0,1.5100e+04,1.5092e+04,9.0500e+04,9.0593e+04,0.0,0.0,0.0,0.0,2.0,0.0,24.097,3.3332e+05,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,7.0,0.0,2.0,1.0,4.0,0.0,0.0,82.9569,82.9569,98.8912,4.3968e+05,6.0,22776.4031,1.0,39.1801,0.0000,0.0000,13.0,0.0,0.0,0.0,0.0
5,8567,2020-01-01,0.0,0.017,3.0,2.0,0.0,1.0,9.4246e+03,9.4246e+03,5.7579e+04,5.7579e+04,0.0,1.0,0.0,0.0,1.0,2.0,13.323,1.3225e+06,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,12.0,1.0,4.0,1.0,6.0,0.0,0.0,117.2238,117.2238,146.8912,0.0000e+00,0.0,21300.0000,1.0,45.6183,0.5000,0.5000,1.0,0.0,0.0,0.0,0.0
6,2677,2019-02-01,0.0,0.164,5.0,3.0,1.0,2.0,0.0000e+00,1.0738e+05,2.4562e+03,1.0738e+05,1.0,0.0,0.0,0.0,3.0,1.0,5.581,9.4429e+05,2.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,9.0,1.0,2.0,0.0,6.0,0.0,0.0,108.3860,108.3860,108.3860,2.2943e+06,0.0,38655.6921,1.0,35.9301,1.0000,1.0000,1.0,0.0,0.0,0.0,0.0
7,5660,2019-02-01,0.0,0.040,3.0,2.0,0.0,1.0,1.2590e+05,1.2587e+05,2.3464e+06,2.3463e+06,0.0,0.0,0.0,0.0,2.0,2.0,4.226,8.4556e+05,3.0,1.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,8.0,1.0,2.0,2.0,3.0,0.0,0.0,170.3819,170.3819,170.3819,1.8544e+06,8.0,30373.8090,1.0,38.4140,1.0000,1.0000,1.0,0.0,0.0,0.0,0.0
8,7240,2019-04-01,0.0,0.002,3.0,2.0,2.0,0.0,6.0000e+05,6.0000e+05,6.0000e+05,6.0000e+05,0.0,0.0,0.0,0.0,3.0,0.0,0.645,3.1306e+05,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,5.0,0.0,4.0,0.0,1.0,0.0,0.0,90.8747,90.8747,94.8501,6.0000e+05,1.0,13206.8080,1.0,52.2124,0.5000,0.6667,0.0,0.0,0.0,0.0,0.0
9,6289,2019-06-01,0.0,0.000,3.0,2.0,1.0,2.0,1.8873e+05,1.8873e+05,1.9015e+06,1.9015e+06,0.0,1.0,0.0,0.0,1.0,0.0,83.161,6.2642e+05,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,8.0,0.0,5.0,1.0,2.0,0.0,0.0,107.3018,107.3018,107.3018,2.0000e+06,1.0,82349.5235,1.0,33.7016,0.6000,0.5000,0.0,0.0,0.0,0.0,0.0


In [18]:
ans = []

for i in range(len(df)):
    ans.append(assess(df.loc[i]))

results = 0
check = list(zip(ans, df['cnt_opened_6m']))
for i in check:
    my = i[0]
    real = i[1]
    if my!=0 and real != 0:
        results+=my/real
    elif my==0 and real == 0:
        results+=1
    elif my==0 and real != 0:
        results+=0
    elif my!=0 and real == 0:
        results+=0
    
results/=len(df)
results

0.6002559148532302