In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from pathlib import Path
from sklearn.metrics import roc_auc_score, f1_score
from joblib import dump, load
np.random.seed(42)

In [2]:
# import warnings
# warnings.filterwarnings('ignore')

In [3]:
# Загружаем данные
data_info_all = pd.read_csv('inn_info_public.csv')
data_pays = pd.read_csv('pays.csv')

### Features generation ###

In [4]:
# Посчитаем количесвто уникальных инн для каждого региона 
data_info_all['unique_inn_in_region'] = data_info_all.groupby(['region'])["hash_inn"].transform("count")
# Посчитаем средний размер транзакции 
data_pays['mean_pay'] = data_pays['sum']/ data_pays['count']


In [5]:
# ОБЩАЯ СУММА ПЕРЕВОДОВ (kt)
data_pays['total_sum_kt'] = data_pays.groupby(['hash_inn_kt'])["sum"].transform("sum")

# ОБЩАЯ СУММА "ПОЛУЧЕНИЙ" (dt)
data_pays['total_sum_dt'] = data_pays.groupby(['hash_inn_dt'])["sum"].transform("sum")

# ОБЩЕЕ ЧИСЛО ПЕРЕВОДОВ 
data_pays['nums_of_kt'] = data_pays.groupby(['hash_inn_kt'])["count"].transform("count")

# ОБЩЕЕ ЧИСЛО ПОЛУЧЕНИЙ
data_pays['nums_of_dt'] = data_pays.groupby(['hash_inn_dt'])["count"].transform("count")

In [6]:
# СОЗДАЁМ dict, В КОТОРЫХ КЛЮЧ - ИНН, ЗНАЧЕНИЕ - ОБЩАЯ СУММА ПЕРЕВОДОВ (kt)
dict_total_sum_kt = pd.Series(data_pays.total_sum_kt.values,index=data_pays.hash_inn_kt).to_dict()
# СОЗДАЁМ dict, В КОТОРЫХ КЛЮЧ - ИНН, ЗНАЧЕНИЕ - ОБЩАЯ СУММА "ПОЛУЧЕНИЙ" (dt)
dict_total_sum_dt = pd.Series(data_pays.total_sum_dt.values,index=data_pays.hash_inn_dt).to_dict()

In [7]:
# СОЗДАЁМ dict, В КОТОРЫХ КЛЮЧ - ИНН, ЗНАЧЕНИЕ - ОБЩАЯ ЧИСЛО ПЕРЕВОДОВ 
dict_nums_of_kt = pd.Series(data_pays.nums_of_kt.values,index=data_pays.hash_inn_kt).to_dict()
# СОЗДАЁМ dict, В КОТОРЫХ КЛЮЧ - ИНН, ЗНАЧЕНИЕ - ОБЩeЯ ЧИСЛО ПОЛУЧЕНИЙ 
dict_nums_of_dt = pd.Series(data_pays.nums_of_dt.values,index=data_pays.hash_inn_dt).to_dict()

In [8]:
# СОЗДАЁМ dict, В КОТОРЫХ КЛЮЧ - ИНН, ЗНАЧЕНИЕ - СРЕДНИЙ РАЗМЕР ТРАНЗАКЦИИ  
dict_mean_pay = pd.Series(data_pays.mean_pay.values,index=data_pays.hash_inn_kt).to_dict()

In [9]:
# Перенесём полученные фичи в dataframe data_info_all

data_info_all['total_sum_kt'] = data_info_all['hash_inn'].map(dict_total_sum_kt)
data_info_all['total_sum_dt'] = data_info_all['hash_inn'].map(dict_total_sum_dt)
data_info_all['nums_of_kt'] = data_info_all['hash_inn'].map(dict_nums_of_kt)
data_info_all['nums_of_dt'] = data_info_all['hash_inn'].map(dict_nums_of_dt)
data_info_all['mean_pay'] = data_info_all['hash_inn'].map(dict_mean_pay)
data_info_all = pd.merge(data_pays[['hash_inn_kt','week']], data_info_all, left_on='hash_inn_kt', right_on='hash_inn', how='right')
# data_info_all_2 = pd.merge(data_pays[['hash_inn_dt','week']], data_info_all, left_on='hash_inn_dt', right_on='hash_inn', how='right')
# data_info_all = pd.concat([data_info_all_1, data_info_all_2], ignore_index=True)

In [10]:
# Разделим данные: есть таргет
data_info_train = data_info_all[data_info_all.is_public == True]
# /нет таргета
data_info_to_predict = data_info_all[data_info_all.is_public == False]

In [51]:
data_info_to_predict.isnull().sum()

hash_inn_kt              28688
week                     28688
hash_inn                     0
okved2                       0
region                       0
is_public                    0
unique_inn_in_region         0
total_sum_kt             28688
total_sum_dt            107807
nums_of_kt               28688
nums_of_dt              107807
mean_pay                 28889
dtype: int64

In [52]:
data_info_to_predict.shape

(1740337, 12)

In [11]:
data_info_train = data_info_train.fillna(-1)
data_info_to_predict = data_info_to_predict.fillna(-1)

In [12]:
sc= StandardScaler()
X_tr = data_info_train[['total_sum_kt', 'total_sum_dt','mean_pay', 'nums_of_kt', 'nums_of_dt']]
y_tr = data_info_train[['okved2']].astype(int)

X_tr = sc.fit_transform(X_tr)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [13]:
clf = RandomForestClassifier(8, random_state=42, verbose=3, n_jobs=-1)
clf.fit(X_tr, y_tr)

  
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 1 of 8
building tree 2 of 8
building tree 3 of 8building tree 4 of 8
building tree 5 of 8
building tree 6 of 8building tree 7 of 8

building tree 8 of 8



[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   14.3s remaining:   43.0s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:   14.6s remaining:    8.7s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   14.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   14.9s finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=8, n_jobs=-1,
                       oob_score=False, random_state=42, verbose=3,
                       warm_start=False)

In [40]:
# f1_score(y_test, clf.predict(X_test), average='micro')

NameError: name 'y_test' is not defined

### Create Submission ###

In [21]:

X_to_predict = data_info_to_predict[['total_sum_kt', 'total_sum_dt','mean_pay', 'nums_of_kt', 'nums_of_dt']]

X_to_predict = sc.fit_transform(X_to_predict)


In [23]:
X_to_predict

array([[-0.06888471, -0.12521327,  1.06429271, -0.51221818,  0.28603577],
       [-0.06888471, -0.12521327,  1.06429271, -0.51221818,  0.28603577],
       [-0.06888471, -0.12521327,  1.06429271, -0.51221818,  0.28603577],
       ...,
       [-0.30876146, -0.28410902, -0.09370244, -0.53607096, -0.54974701],
       [-0.30876146, -0.28421174, -0.09370244, -0.53607096, -0.55351179],
       [-0.30876146, -0.28353401, -0.09370244, -0.53607096, -0.5516294 ]])

In [None]:
submission = pd.DataFrame({'hash_inn':data_info_to_predict['hash_inn'],'y':predicted_okved2})

In [None]:
filename = 'StephanPushkov-9022020-SberbankIndustry.csv'

submission.to_csv(filename,index=False)

print('Saved file: ', filename)