In [11]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from pathlib import Path
from sklearn.metrics import roc_auc_score, f1_score
from joblib import dump, load
# Seed NumPy's legacy global random generator so NumPy-based randomness repeats between runs.
np.random.seed(seed=42)

In [12]:
import warnings
warnings.filterwarnings('ignore')

In [13]:
# Load the two source tables from CSV files in the working directory:
# the company info table and the payments table.
data_info_all, data_pays = (
    pd.read_csv(csv_name) for csv_name in ('inn_info_public.csv', 'pays.csv')
)

### Feature generation ###

In [14]:
# Number of (non-null) INN rows in each region, broadcast back onto every row.
inn_by_region = data_info_all.groupby('region')['hash_inn']
data_info_all['unique_inn_in_region'] = inn_by_region.transform('count')

# Average payment size of each payments row: row amount divided by row payment count.
data_pays['mean_pay'] = data_pays['sum'].div(data_pays['count'])


In [15]:
# Per-INN aggregates, broadcast back onto every payments row.

# Total amount of transfers (kt side).
data_pays['total_sum_kt'] = data_pays.groupby(['hash_inn_kt'])["sum"].transform("sum")

# Total amount of "receipts" (dt side).
data_pays['total_sum_dt'] = data_pays.groupby(['hash_inn_dt'])["sum"].transform("sum")

# Total number of transfers (kt side). Each row's `count` is the number of
# payments that row aggregates (it is the divisor of `mean_pay`), so the total
# is the SUM of `count`; transform("count") would only count rows.
data_pays['nums_of_kt'] = data_pays.groupby(['hash_inn_kt'])["count"].transform("sum")

# Total number of receipts (dt side), summed the same way.
data_pays['nums_of_dt'] = data_pays.groupby(['hash_inn_dt'])["count"].transform("sum")

In [16]:
# Lookup tables keyed by INN. The aggregate is constant within an INN, so it
# does not matter that later rows overwrite earlier ones for the same key.

# INN (kt) -> total amount of transfers.
dict_total_sum_kt = dict(zip(data_pays['hash_inn_kt'], data_pays['total_sum_kt']))
# INN (dt) -> total amount of "receipts".
dict_total_sum_dt = dict(zip(data_pays['hash_inn_dt'], data_pays['total_sum_dt']))

In [17]:
# Lookup tables keyed by INN; the value repeats on every row of an INN, so the
# last row winning for a duplicate key is harmless.

# INN (kt) -> total number of transfers.
dict_nums_of_kt = dict(zip(data_pays['hash_inn_kt'], data_pays['nums_of_kt']))
# INN (dt) -> total number of receipts.
dict_nums_of_dt = dict(zip(data_pays['hash_inn_dt'], data_pays['nums_of_dt']))

In [18]:
# INN (kt) -> average payment size over ALL of that INN's payment rows:
# total amount divided by total number of payments.
# The row-level `mean_pay` column differs from row to row, so building the dict
# directly from it (non-unique keys) kept only the last row's value per INN.
totals_by_kt = data_pays.groupby('hash_inn_kt')[['sum', 'count']].sum()
dict_mean_pay = (totals_by_kt['sum'] / totals_by_kt['count']).to_dict()

In [19]:
# Copy the per-INN payment aggregates onto the company table via dict lookups.
inn_keys = data_info_all['hash_inn']
for feature_name, lookup in (
    ('total_sum_kt', dict_total_sum_kt),
    ('total_sum_dt', dict_total_sum_dt),
    ('nums_of_kt', dict_nums_of_kt),
    ('nums_of_dt', dict_nums_of_dt),
    ('mean_pay', dict_mean_pay),
):
    data_info_all[feature_name] = inn_keys.map(lookup)

# Right-join the (INN, week) pairs of the payments table onto the company
# table, once through the kt column and once through the dt column, then stack
# both results.
# NOTE(review): a company gets one output row per matching payments row, so the
# table grows far beyond one row per INN; rebinding `data_info_all` also makes
# this cell non-idempotent (re-running it repeats the expansion).
kt_weeks = data_pays[['hash_inn_kt', 'week']]
dt_weeks = data_pays[['hash_inn_dt', 'week']]
data_info_all_1 = kt_weeks.merge(data_info_all, how='right', left_on='hash_inn_kt', right_on='hash_inn')
data_info_all_2 = dt_weeks.merge(data_info_all, how='right', left_on='hash_inn_dt', right_on='hash_inn')
data_info_all = pd.concat([data_info_all_1, data_info_all_2], ignore_index=True)

In [20]:
# Split the rows by whether the target is available.
public_mask = data_info_all['is_public'].eq(True)
hidden_mask = data_info_all['is_public'].eq(False)

# Rows with a target: used for training.
data_info_train = data_info_all.loc[public_mask]
# Rows without a target: to be predicted.
data_info_to_predict = data_info_all.loc[hidden_mask]

In [28]:
# Missing-value count per column in the rows that still need a prediction.
data_info_to_predict.isna().sum()

hash_inn                      0
hash_inn_dt             1756947
hash_inn_kt             1804024
is_public                     0
mean_pay                 264708
nums_of_dt               124417
nums_of_kt               262459
okved2                        0
region                        0
total_sum_dt             124417
total_sum_kt             262459
unique_inn_in_region          0
week                      45298
dtype: int64

In [30]:
# (rows, columns) of the frame holding the rows without a target.
data_info_to_predict.shape

(3515673, 13)

In [24]:
# Replace every missing value in the training rows with the sentinel -1.
data_info_train = data_info_train.fillna(value=-1)

In [25]:
# Feature columns fed to the model. The submission section selects the same
# columns, so the two lists must stay in sync.
# NOTE(review): the identifier columns hash_inn / hash_inn_dt / hash_inn_kt are
# used as features, and one INN can own several rows, so rows of the same INN
# may land in both splits — confirm this is intended.
X = data_info_train[['total_sum_kt', 'total_sum_dt', 'hash_inn',
       'hash_inn_dt', 'hash_inn_kt', 'mean_pay',
       'region', 'nums_of_kt', 'nums_of_dt', 'unique_inn_in_region',
       'week']]
# Target as a 1-D Series: scikit-learn estimators expect y of shape
# (n_samples,), not a one-column DataFrame.
y = data_info_train['okved2'].astype(int)

# Split first (same seed and proportion as before) ...
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# ... then fit the scaler on the training split only, so that hold-out rows do
# not influence the scaling statistics, and apply it unchanged to the hold-out.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [27]:
# Random forest with 8 trees, built on all available cores, with verbose
# progress output.
# NOTE(review): the recorded run of this cell ended in a MemoryError; the
# training table size and the number of parallel jobs are the things to revisit.
clf = RandomForestClassifier(n_estimators=8, n_jobs=-1, random_state=42, verbose=3)
clf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 1 of 8
building tree 2 of 8
building tree 3 of 8building tree 4 of 8
building tree 5 of 8
building tree 6 of 8
building tree 7 of 8
building tree 8 of 8



MemoryError: Unable to allocate array with shape (5986266,) and data type int64

In [None]:
# Micro-averaged F1 of the fitted model on the hold-out split.
y_test_pred = clf.predict(X_test)
f1_score(y_test, y_test_pred, average='micro')

### Create Submission ###

In [None]:
# Replace missing values in the rows to predict with the same sentinel (-1)
# used for the training rows. The frame is named `data_info_to_predict`;
# `data_info_predict` is never defined and raised a NameError.
data_info_to_predict = data_info_to_predict.fillna(-1)

In [None]:
# Feature matrix for the rows WITHOUT a target, with the same columns in the
# same order as the training matrix. (Selecting from the training frame here
# left `X_to_predict` undefined.)
X_to_predict = data_info_to_predict[['total_sum_kt', 'total_sum_dt', 'hash_inn',
       'hash_inn_dt', 'hash_inn_kt', 'mean_pay',
       'region', 'nums_of_kt', 'nums_of_dt', 'unique_inn_in_region',
       'week']]

# Reuse the scaler `sc` that was already fitted on the training features.
# Creating and fitting a new scaler on the prediction rows would put them on a
# different scale than the one the model was trained on.
X_to_predict = sc.transform(X_to_predict)


In [None]:
# Predict the target for the unlabeled rows; `X` holds training features, the
# matrix prepared for prediction is `X_to_predict`.
predicted_okved2 = clf.predict(X_to_predict)

In [None]:
# Pair every predicted row's INN with its predicted class. The frame holding
# those rows is `data_info_to_predict` (`data_info_predict` is undefined).
# NOTE(review): that frame can contain several rows per INN, so the submission
# may repeat a hash_inn — confirm the expected submission format.
submission = pd.DataFrame({'hash_inn':data_info_to_predict['hash_inn'],'y':predicted_okved2})

In [None]:
# Write the submission table to CSV (without the index column) and report the
# file name.
filename = 'StephanPushkov-9022020-SberbankIndustry.csv'
submission.to_csv(path_or_buf=filename, index=False)
print('Saved file: ', filename)