In [0]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from pathlib import Path
from sklearn.metrics import roc_auc_score, f1_score
from joblib import dump, load
np.random.seed(42)

In [0]:
# Загружаем данные
data_info_all = pd.read_csv(Path('drive/My Drive/S-Industrial/data/inn_info_public.csv'))
data_pays = pd.read_csv(Path('drive/My Drive/S-Industrial/data/pays.csv'))

### Features generation ###


In [0]:
# ОБЩАЯ СУММА ПЕРЕВОДОВ (kt)
data_pays['total_sum_kt'] = data_pays.groupby(['hash_inn_kt'])["sum"].transform("sum")

# ОБЩАЯ СУММА "ПОЛУЧЕНИЙ" (dt)
data_pays['total_sum_dt'] = data_pays.groupby(['hash_inn_dt'])["sum"].transform("sum")

# ОБЩЕЕ ЧИСЛО ПЕРЕВОДОВ 
data_pays['nums_of_kt'] = data_pays.groupby(['hash_inn_kt'])["count"].transform("count")

# ОБЩЕЕ ЧИСЛО ПОЛУЧЕНИЙ
data_pays['nums_of_dt'] = data_pays.groupby(['hash_inn_dt'])["count"].transform("count")

In [0]:
# Посчитаем средний размер транзакции 
data_pays['mean_pay'] = data_pays['sum']/ data_pays['count']

In [0]:
# СОЗДАЁМ dict, В КОТОРЫХ КЛЮЧ - ИНН, ЗНАЧЕНИЕ - ОБЩАЯ СУММА ПЕРЕВОДОВ (kt)
dict_total_sum_kt = pd.Series(data_pays.total_sum_kt.values,index=data_pays.hash_inn_kt).to_dict()
# СОЗДАЁМ dict, В КОТОРЫХ КЛЮЧ - ИНН, ЗНАЧЕНИЕ - ОБЩАЯ СУММА "ПОЛУЧЕНИЙ" (dt)
dict_total_sum_dt = pd.Series(data_pays.total_sum_dt.values,index=data_pays.hash_inn_dt).to_dict()

In [0]:
# СОЗДАЁМ dict, В КОТОРЫХ КЛЮЧ - ИНН, ЗНАЧЕНИЕ - ОБЩАЯ ЧИСЛО ПЕРЕВОДОВ 
dict_nums_of_kt = pd.Series(data_pays.nums_of_kt.values,index=data_pays.hash_inn_kt).to_dict()
# СОЗДАЁМ dict, В КОТОРЫХ КЛЮЧ - ИНН, ЗНАЧЕНИЕ - ОБЩeЯ ЧИСЛО ПОЛУЧЕНИЙ 
dict_nums_of_dt = pd.Series(data_pays.nums_of_dt.values,index=data_pays.hash_inn_dt).to_dict()

In [0]:
# СОЗДАЁМ dict, В КОТОРЫХ КЛЮЧ - ИНН, ЗНАЧЕНИЕ - СРЕДНИЙ РАЗМЕР ТРАНЗАКЦИИ  
dict_mean_pay = pd.Series(data_pays.mean_pay.values,index=data_pays.hash_inn_kt).to_dict()

In [0]:
# Перенесём полученные фичи в dataframe data_info_all

data_info_all['total_sum_kt'] = data_info_all['hash_inn'].map(dict_total_sum_kt)
data_info_all['total_sum_dt'] = data_info_all['hash_inn'].map(dict_total_sum_dt)
data_info_all['nums_of_kt'] = data_info_all['hash_inn'].map(dict_nums_of_kt)
data_info_all['nums_of_dt'] = data_info_all['hash_inn'].map(dict_nums_of_dt)
data_info_all['mean_pay'] = data_info_all['hash_inn'].map(dict_mean_pay)
data_info_all = pd.merge(data_pays[['hash_inn_kt','week']], data_info_all, left_on='hash_inn_kt', right_on='hash_inn', how='right')


In [0]:
# Разделим данные: есть таргет
data_info_train = data_info_all[data_info_all.is_public == True]
# /нет таргета
data_info_to_predict = data_info_all[data_info_all.is_public == False]

In [0]:
# Nan -> -1
data_info_train = data_info_train.fillna(-1)
data_info_to_predict = data_info_to_predict.fillna(-1)

In [0]:
X_tr = data_info_train[['total_sum_kt', 'total_sum_dt','mean_pay', 'nums_of_kt', 'nums_of_dt']]
y_tr = data_info_train[['okved2']].astype(int)

sc= StandardScaler()
X_tr = sc.fit_transform(X_tr)

X_train, X_test, y_train, y_test = train_test_split(X_tr, y_tr, test_size=0.2, random_state=1)

In [12]:
clf = RandomForestClassifier(8, random_state=42, verbose=3, n_jobs=-1)
clf.fit(X_train, y_train)

  
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 8building tree 2 of 8

building tree 3 of 8
building tree 4 of 8
building tree 5 of 8
building tree 6 of 8
building tree 7 of 8
building tree 8 of 8


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   53.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   53.7s finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=8, n_jobs=-1,
                       oob_score=False, random_state=42, verbose=3,
                       warm_start=False)

In [18]:
# Оцениваем 
f1_score(y_test, clf.predict(X_test), average='weighted')

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   8 out of   8 | elapsed:    3.0s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   8 out of   8 | elapsed:    3.0s finished


0.980977964636035

In [19]:
# Обучаем на всех данных 
clf = RandomForestClassifier(8, random_state=42, verbose=3, n_jobs=-1)
clf.fit(X_tr, y_tr)

  
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 8building tree 2 of 8

building tree 3 of 8
building tree 4 of 8
building tree 5 of 8
building tree 6 of 8
building tree 7 of 8
building tree 8 of 8


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   35.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   35.9s finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=8, n_jobs=-1,
                       oob_score=False, random_state=42, verbose=3,
                       warm_start=False)

### Create submission ###

In [20]:
# Предсказываем на тестовой выборке 
X_to_predict = data_info_to_predict[['total_sum_kt', 'total_sum_dt','mean_pay', 'nums_of_kt', 'nums_of_dt']]
X_to_predict = sc.fit_transform(X_to_predict)

predicted_okved2 = clf.predict(X_to_predict)


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   8 out of   8 | elapsed:    5.7s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   8 out of   8 | elapsed:    5.7s finished


In [0]:
submission = pd.DataFrame({'hash_inn':data_info_to_predict['hash_inn'],'y':predicted_okved2})

In [22]:
filename = 'drive/My Drive/S-Industrial/data/StephanPushkov-10022020-SberbankIndustry.csv'

submission.to_csv(filename,index=False)

print('Saved file: ', filename)

Saved file:  drive/My Drive/S-Industrial/data/StephanPushkov-10022020-SberbankIndustry.csv


In [23]:
submission

Unnamed: 0,hash_inn,y
0,0,55
1,0,55
2,0,55
3,0,55
4,0,55
...,...,...
5518048,69928,12
5518051,164291,12
5518053,212835,12
5518055,156954,12


In [0]:
names = ['chislo_vseh_perevodov', 'chislo_vseh_recive','mean_pay',
       'region', 'sum_perevodov', 'sum_recive',
       'week']
sorted(zip(clf.feature_importances_.tolist(), names))

[(0.008944682166640667, 'week'),
 (0.08687014664809739, 'region'),
 (0.1542722149341164, 'mean_pay'),
 (0.17654695602037163, 'sum_perevodov'),
 (0.18571890917729972, 'chislo_vseh_recive'),
 (0.19317360189112573, 'sum_recive'),
 (0.1944734891623485, 'chislo_vseh_perevodov')]

In [0]:
# data_info_all_2 = pd.merge(data_pays[['hash_inn_dt','week']], data_info_all, left_on='hash_inn_dt', right_on='hash_inn', how='right')
# data_info_all = pd.concat([data_info_all_1, data_info_all_2], ignore_index=True)

In [0]:
# Посчитаем количесвто уникальных инн для каждого региона 
# data_info_all['unique_inn_in_region'] = data_info_all.groupby(['region'])["hash_inn"].transform("count")