In [89]:
import numpy as np
import pandas as pd
import sklearn
from importlib import reload

pd.set_option('display.max_columns', 500)

In [80]:
data = pd.read_csv('type_classifier_data.csv', dtype={'inn': str}).dropna(subset=['inn']).drop_duplicates(subset=['inn']).set_index('inn')
targets = pd.read_csv('type_classifier_target.csv', dtype={'startup_inn': str}).set_index('startup_inn').sort_index()

In [81]:
y_cols = list(targets.columns)
x_cols = list(data.columns)

In [82]:
U = targets.join(data)

In [84]:
','.join(x_cols)

'name,got_support_from,did_get_support,service,foundation_date,tech_focus,stage_of_development,market,technology,business_model,main_okved,okved_secondary,msp_category,is_export,inno_cluster_member,skolcovo_member,is_inno_company,is_startup,current_profit,current_profit_tax,current_revenue'

In [86]:
U.head(2)

Unnamed: 0,round,accelerator,business_angel,corp_fund,corp_investor,gov_fund,private_fund,name,got_support_from,did_get_support,service,foundation_date,tech_focus,stage_of_development,market,technology,business_model,main_okved,okved_secondary,msp_category,is_export,inno_cluster_member,skolcovo_member,is_inno_company,is_startup,current_profit,current_profit_tax,current_revenue
1615013084,seed,1.0,0.0,0.0,0.0,0.0,0.0,q-platform,московский акселератор,да,добавлено - московский акселератор,2016-10-14,,,,,,62.01,62.02_62.03_62.09,,нет,нет,нет,нет,нет,20425.0,0.0,295.0
1650350558,pre-seed,1.0,0.0,0.0,0.0,0.0,0.0,"ооо ""воплощение""",московский акселератор,нет,московский акселератор,2017-06-22,,,,,,28.96,22.21_22.29_22.29.9,юл микро,нет,нет,нет,нет,нет,0.0,0.0,0.0


In [90]:
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
import my_sklearn_transformers as trsfm
reload(trsfm)

preprocessor_X = make_union(
    make_pipeline(
        trsfm.DfSelector(('round', 'tech_focus', 'stage_of_development', 'market', 'technology', 'business_model', 'main_okved', 'okved_secondary', 'msp_category', 'is_export', 'inno_cluster_member', 'skolcovo_member', 'is_inno_company', 'is_startup')),
        SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=''),
        OneHotEncoder(sparse=False, handle_unknown='ignore')
    )
)

preprocessor_Y = make_union(
    make_pipeline(
        trsfm.DfSelector(('accelerator', 'business_angel', 'corp_fund', 'corp_investor', 'gov_fund', 'private_fund',)),
    )
)

In [91]:
X = preprocessor_X.fit_transform(U)
y = preprocessor_Y.fit_transform(U)

In [93]:
X.shape, y.shape

((371, 433), (371, 6))

In [116]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.losses import CategoricalCrossentropy

model = tf.keras.Sequential(layers=[
    layers.Dense(16, input_shape=(X.shape[1],), activation='sigmoid'),
    layers.Dense(y.shape[1], activation='sigmoid')
])

model.compile(optimizer='adam', loss=CategoricalCrossentropy())

model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_17 (Dense)             (None, 16)                6944      
_________________________________________________________________
dense_18 (Dense)             (None, 6)                 102       
Total params: 7,046
Trainable params: 7,046
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(X, y, epochs=100)

In [118]:
model.predict(X[:5])

array([[0.06254685, 0.06272125, 0.02092662, 0.0311116 , 0.01309484,
        0.04951277],
       [0.4403635 , 0.04912612, 0.01700526, 0.02400869, 0.0061506 ,
        0.03970525],
       [0.36445695, 0.04226571, 0.01691654, 0.02927184, 0.0069699 ,
        0.03824052],
       [0.02857369, 0.03332874, 0.01404771, 0.02659208, 0.02007496,
        0.12149206],
       [0.05115482, 0.15829754, 0.02215114, 0.02052039, 0.01036869,
        0.04951105]], dtype=float32)

In [119]:
U.head()

Unnamed: 0,round,accelerator,business_angel,corp_fund,corp_investor,gov_fund,private_fund,name,got_support_from,did_get_support,service,foundation_date,tech_focus,stage_of_development,market,technology,business_model,main_okved,okved_secondary,msp_category,is_export,inno_cluster_member,skolcovo_member,is_inno_company,is_startup,current_profit,current_profit_tax,current_revenue
1615013084,seed,1.0,0.0,0.0,0.0,0.0,0.0,q-platform,московский акселератор,да,добавлено - московский акселератор,2016-10-14,,,,,,62.01,62.02_62.03_62.09,,нет,нет,нет,нет,нет,20425.0,0.0,295.0
1650350558,pre-seed,1.0,0.0,0.0,0.0,0.0,0.0,"ооо ""воплощение""",московский акселератор,нет,московский акселератор,2017-06-22,,,,,,28.96,22.21_22.29_22.29.9,юл микро,нет,нет,нет,нет,нет,0.0,0.0,0.0
1650390180,pre-seed,1.0,0.0,0.0,0.0,0.0,0.0,"ооо ""веса систем""",московский акселератор,нет,московский акселератор,2020-04-09,,,,,,62.01,26.11_26.12_26.30,юл микро,нет,нет,нет,нет,нет,216.0,-97.0,3217.0
1655449455,seed,0.0,0.0,0.0,0.0,0.0,1.0,"ооо ""цифровое здоровье""",starthub.moscow,нет,starthub.moscow,2020-12-07,,,,,,72.19,26.20_26.30_62.01,юл микро,нет,нет,нет,нет,нет,0.0,0.0,0.0
1657238107,seed,0.0,1.0,0.0,0.0,0.0,0.0,"ооо ""лоутрип""",карта инновационных решений,да,добавлено - карта инновационных решений,2017-06-15,,,,,,63.11.1,,,нет,нет,нет,нет,нет,0.0,0.0,0.0
