In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import os
import math
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing


%matplotlib inline

In [2]:
tf.version.VERSION

'2.4.1'

In [3]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
types = {
    'ИНН': np.int64,
}
dfs = pd.read_csv(r'data\data.csv', dtype=types)
dfs.head()

Unnamed: 0,KBK,ОКВЭД2,"Вид деятельности, основной ТАСС","Вид деятельности, дополнительный ТАСС",Организационно правовая форма,Атрибуты предприятия,Отрасль,ОГРН,ИНН,КПП,Регион
0,02004111950168580812,,Торговля оптовая производственным электротехни...,"Производство прочих химических продуктов, не в...",НЕПУБЛИЧНЫЕ АКЦИОНЕРНЫЕ ОБЩЕСТВА [7026704],,Радиоэлектронная промышленность [258],1127847272629,7814536770,,Санкт-Петербург [3522]
1,02004121840168844452,Производство оружия и боеприпасов [27574],"Строительство кораблей, судов и плавучих конст...",Производство оружия и боеприпасов [27575] / Пр...,НЕПУБЛИЧНЫЕ АКЦИОНЕРНЫЕ ОБЩЕСТВА [7026704],Организация ОПК [8780665] / Системообразующее ...,Судостроительная промышленность [262],1079847085966,7838395215,783801001.0,Санкт-Петербург [3522]
2,73004122140062870451,Научные исследования и разработки в области ес...,Научные исследования и разработки в области ес...,Производство автоматических космических аппара...,НЕПУБЛИЧНЫЕ АКЦИОНЕРНЫЕ ОБЩЕСТВА [7026704],Системообразующее предприятие [8690579] / Орга...,Транспортное машиностроение [264],1082452000290,2452034898,245201001.0,Красноярский край [3496]
3,02006051610166750811,Производство автотранспортных средств [27850] ...,Производство автотранспортных средств [27850],"Производство двигателей и турбин, кроме авиаци...",ОБЩЕСТВА С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ [7026...,,Автомобильная промышленность [246],1023900987626,3906092856,,Калининградская область [3485]
4,02004121616713810242,Производство автотранспортных средств [27850],Производство автотранспортных средств [27850],Торговля оптовая легковыми автомобилями и легк...,НЕПУБЛИЧНЫЕ АКЦИОНЕРНЫЕ ОБЩЕСТВА [7026704],Системообразующее предприятие [8690579],Автомобильная промышленность [246],1027739178202,7709259743,772301001.0,Москва [3505]


In [5]:
with open(r'kbk.json', 'r') as f:
    index_to_kbk = json.load(f)
    kbk_to_index = {val: int(key) for key, val in index_to_kbk.items()}

In [6]:
with open(r'acting.json', 'r') as f:
    index_to_acting = json.load(f)
    acting_to_index = {val: int(key) for key, val in index_to_acting.items()}

In [7]:
with open(r'attr.json', 'r') as f:
    index_to_attr = json.load(f)
    attr_to_index = {val: int(key) for key, val in index_to_attr.items()}

In [8]:
with open(r'forma.json', 'r') as f:
    index_to_forma = json.load(f)
    forma_to_index = {val: int(key) for key, val in index_to_forma.items()}

In [9]:
with open(r'ocved.json', 'r') as f:
    index_to_ocved = json.load(f)
    ocved_to_index = {val: int(key) for key, val in index_to_ocved.items()}

In [10]:
with open(r'otrasl.json', 'r') as f:
    index_to_otrasl = json.load(f)
    otrasl_to_index = {val: int(key) for key, val in index_to_otrasl.items()}

In [11]:
with open(r'region.json', 'r') as f:
    index_to_region = json.load(f)
    region_to_index = {val: int(key) for key, val in index_to_region.items()}

In [12]:
def get_data(df, split=None):
    x = []
    y = []
    for _, line in df.iterrows():
        tmp = []
        
        # ОКВЭД2
        if type(line['ОКВЭД2']) != float:
            ocved = line['ОКВЭД2'].split(' / ')[:5]
            tmp += [ocved_to_index[i] for i in ocved]
            if len(ocved) < 5:
                tmp += [0] * (5-len(ocved))
        else:
            tmp += [ocved_to_index['-1']] + [0] * 4
        
        
        # основной ТАСС
        if type(line['Вид деятельности, основной ТАСС']) != float:
            tmp += [acting_to_index[line['Вид деятельности, основной ТАСС']]]
        else:
            tmp += [acting_to_index['-1']]
        
        # дополнительный ТАСС
        if type(line['Вид деятельности, дополнительный ТАСС']) != float:
            dop = line['Вид деятельности, дополнительный ТАСС'].split(' / ')[:80]
            tmp += [acting_to_index[i] for i in dop]
            if len(dop) < 80:
                tmp += [0] * (80-len(dop))
        else:
            tmp += [acting_to_index['-1']] + [0] * 79
            
        # Атрибуты предприятия
        if type(line['Атрибуты предприятия']) != float:
            attrs = line['Атрибуты предприятия'].split(' / ')[:3]
            tmp += [attr_to_index[i] for i in attrs]
            if len(attrs) < 3:
                tmp += [0] * (3-len(attrs))
        else:
            tmp += [attr_to_index['-1']] + [0] * 2
            
        # Отрасль
        if type(line['Отрасль']) != float:
            otr = line['Отрасль'].split(' / ')[:2]
            tmp += [otrasl_to_index[i] for i in otr]
            if len(otr) < 2:
                tmp += [0] * (2-len(otr))
        else:
            tmp += [otrasl_to_index['-1']] + [0]
        
        if type(line['Регион']) != float:
            tmp += [region_to_index[line['Регион']]]
        else:
            tmp += [region_to_index['-1']]
            
        if type(line['Организационно правовая форма']) != float:
            tmp += [forma_to_index[line['Организационно правовая форма']]]
        else:
            tmp += [forma_to_index['-1']]
            
        x.append(tmp)
        y.append(kbk_to_index[line['KBK']])
        
    if split is not None:
        return train_test_split(x, y, test_size=split, shuffle=True, stratify=y)
        
    return x, y

In [185]:
x_train, x_test, y_train, y_test = get_data(dfs, split=0.3)

In [186]:
np.shape(x_train), np.shape(y_train), np.shape(x_test), np.shape(y_test)

((9938, 93), (9938,), (4260, 93), (4260,))

In [187]:
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))

In [114]:
model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=5)
model.fit(x_train[:100], y_train[:100])

GradientBoostingClassifier(max_depth=5, n_estimators=200)

In [115]:
accuracy_score(y_train[:100], model.predict(x_train[:100]))

0.65

In [213]:
tf_model = tf.keras.Sequential([
    tf.keras.layers.InputLayer((tf.shape(x_train)[-1],)),
    tf.keras.layers.Dense(256, activation=tf.keras.activations.softsign),
    tf.keras.layers.GaussianDropout(0.3),
    tf.keras.layers.Dense(len(list(index_to_kbk.keys()))),
])

tf_model.summary()

Model: "sequential_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_38 (Dense)             (None, 256)               24064     
_________________________________________________________________
gaussian_dropout_12 (Gaussia (None, 256)               0         
_________________________________________________________________
dense_39 (Dense)             (None, 567)               145719    
Total params: 169,783
Trainable params: 169,783
Non-trainable params: 0
_________________________________________________________________


In [214]:
tf_model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

In [215]:
class PrintCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        if epoch == 0 or (epoch+1) % 10 == 0:
            tf.print('Epoch {:3d}: loss - {:.4f}, val_loss - {:.4f}, acc - {:.4f}, val_acc - {:.4f}'.format(
                epoch+1, logs['loss'], logs['val_loss'], logs['accuracy'], logs['val_accuracy']
            ))

In [217]:
tf_model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=100, verbose=0, callbacks=[PrintCallback()])

Epoch   1: loss - 2.6338, val_loss - 2.5848, acc - 0.3262, val_acc - 0.3547
Epoch  10: loss - 2.6002, val_loss - 2.5830, acc - 0.3340, val_acc - 0.3624
Epoch  20: loss - 2.6246, val_loss - 2.5846, acc - 0.3298, val_acc - 0.3617
Epoch  30: loss - 2.5568, val_loss - 2.5511, acc - 0.3435, val_acc - 0.3589
Epoch  40: loss - 2.5423, val_loss - 2.4992, acc - 0.3364, val_acc - 0.3782
Epoch  50: loss - 2.4822, val_loss - 2.5042, acc - 0.3491, val_acc - 0.3685
Epoch  60: loss - 2.4534, val_loss - 2.4339, acc - 0.3531, val_acc - 0.3958
Epoch  70: loss - 2.4893, val_loss - 2.4695, acc - 0.3496, val_acc - 0.3758
Epoch  80: loss - 2.4566, val_loss - 2.4667, acc - 0.3556, val_acc - 0.3829
Epoch  90: loss - 2.4201, val_loss - 2.4360, acc - 0.3621, val_acc - 0.3862
Epoch 100: loss - 2.3930, val_loss - 2.4067, acc - 0.3651, val_acc - 0.3883


<tensorflow.python.keras.callbacks.History at 0x25d8c65aa08>

In [183]:
dataset = pd.read_csv(r'data\dataset.csv')
y_train = dataset.pop('label')

est_dataset = tf.data.Dataset.from_tensor_slices((dict(dataset), y_train)).repeat(20).batch(len(y_train))

NUMERIC_COLUMNS = list(dataset.columns)
feature_columns = []

for feature_name in NUMERIC_COLUMNS:
    feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.int64))

In [184]:
est = tf.estimator.BoostedTreesClassifier(feature_columns, n_batches_per_layer=1, n_classes=len(y_train.unique()))
est.train(est_dataset, max_steps=100)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\TheDim0n\\AppData\\Local\\Temp\\tmp96qn3r5v', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


TypeError: unsupported callable

In [171]:
len(feature_columns)

93

In [174]:
len(y_train.unique())

567