In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive; the data
# files read by the following cells live under drive/MyDrive/data/laws.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from tqdm import tqdm_notebook

In [3]:
# List the files in the RIA reports data directory.
!ls drive/MyDrive/data/laws/ria_reports/ria_reports

regulations.csv			      ria_reports_kpi.csv
ria_reports_business_profit_loss.csv  ria_reports_main.csv
ria_reports_business_sizes_as_is.csv  ria_reports_necessary_measures.csv
ria_reports_business_sizes_to_be.csv  ria_reports_new_functions.csv
ria_reports_cancel_duties.csv	      ria_reports_notification_info.csv
ria_reports_expenses.csv	      ria_reports_public_discussion.csv
ria_reports_goals.csv		      ria_reports_risks.csv
ria_reports_group_changes.csv	      sample_submission.csv
ria_reports_group_expenses.csv	      train_answer.csv
ria_reports_groups.csv


In [4]:
# Load the regulations table (`reg`) and the training answers (`target`),
# both with pandas' default CSV settings.
reg, target = (
    pd.read_csv(csv_path)
    for csv_path in (
        "drive/MyDrive/data/laws/regulations.csv",
        "drive/MyDrive/data/laws/train_answer.csv",
    )
)

In [5]:
# Cleansing: keep only regulations that have a title and whose title is not
# the placeholder "Проект удален" (Russian for "Project deleted").
reg = reg.loc[
    lambda frame: frame['act_title'].notna() & (frame['act_title'] != "Проект удален")
]

In [6]:
import os

# Print the path of every file found under the RIA reports directory,
# walking sub-directories as well.
report_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('drive/MyDrive/data/laws/ria_reports/ria_reports')
    for name in names
)
for path in report_file_paths:
    print(path)

drive/MyDrive/data/laws/ria_reports/ria_reports/ria_reports_business_profit_loss.csv
drive/MyDrive/data/laws/ria_reports/ria_reports/ria_reports_business_sizes_as_is.csv
drive/MyDrive/data/laws/ria_reports/ria_reports/ria_reports_business_sizes_to_be.csv
drive/MyDrive/data/laws/ria_reports/ria_reports/ria_reports_cancel_duties.csv
drive/MyDrive/data/laws/ria_reports/ria_reports/ria_reports_expenses.csv
drive/MyDrive/data/laws/ria_reports/ria_reports/ria_reports_groups.csv
drive/MyDrive/data/laws/ria_reports/ria_reports/ria_reports_kpi.csv
drive/MyDrive/data/laws/ria_reports/ria_reports/ria_reports_main.csv
drive/MyDrive/data/laws/ria_reports/ria_reports/ria_reports_group_changes.csv
drive/MyDrive/data/laws/ria_reports/ria_reports/ria_reports_goals.csv
drive/MyDrive/data/laws/ria_reports/ria_reports/ria_reports_group_expenses.csv
drive/MyDrive/data/laws/ria_reports/ria_reports/ria_reports_new_functions.csv
drive/MyDrive/data/laws/ria_reports/ria_reports/ria_reports_public_discussion.csv

In [7]:
import glob
# `tqdm.tqdm_notebook` is deprecated and emits a warning when called;
# `tqdm.notebook.tqdm` is the replacement named by that warning.
from tqdm.notebook import tqdm

# Join every RIA report table onto `reg`, matching each report's
# `regulation_project_id` against `reg.id` (DataFrame.join keeps all rows of
# `reg`, i.e. a left join).
# NOTE(review): glob returns paths in filesystem order, so the order in which
# report columns are appended can differ between environments — confirm it
# matches the column order the downstream model expects.
for report_path in tqdm(glob.glob('drive/MyDrive/data/laws/ria_reports/ria_reports/ria_reports_*.csv*')):
    report = pd.read_csv(report_path, sep=';')
    # Keep a single row (the first) per regulation so the join index is
    # unique and cannot multiply rows of `reg`.
    report_by_regulation = (
        report
        .drop_duplicates(subset=['regulation_project_id'])
        .set_index('regulation_project_id')
    )
    # Column names present on both sides get '_l' (already in `reg`) and
    # '_r' (incoming report) suffixes.
    reg = reg.join(report_by_regulation, on='id', lsuffix='_l', rsuffix='_r')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/16 [00:00<?, ?it/s]

In [8]:
# Cleansing and feature engineering on the joined `reg` table.

# 1) Fill missing values in every column with that column's most frequent
#    value, then lower-case all text (object-dtype) columns.
for col in reg.columns:
    value_counts = reg[col].dropna().value_counts()
    # A column without any non-missing value has no most-frequent value;
    # leave it unfilled instead of failing on `index[0]`.
    if not value_counts.empty:
        reg[col] = reg[col].fillna(value_counts.index[0])
    if reg[col].dtype == 'object':
        reg[col] = reg[col].apply(lambda x: str(x).lower())

# 2) Replace `publication_date` with integer `year`, `month` and `date`
#    columns taken from fixed character positions of the string
#    (assumes a 'YYYY-MM-DD...' layout — TODO confirm against the data).
reg['year'] = reg['publication_date'].str[:4].astype(int)
reg['month'] = reg['publication_date'].str[5:7].astype(int)
reg['date'] = reg['publication_date'].str[8:10].astype(int)
reg = reg.drop(columns=['publication_date'])

# 3) Encode `okved_list` (codes separated by '; ') as OKVED_DIM integer
#    columns `okved_list_0` .. `okved_list_{OKVED_DIM - 1}`.
OKVED_DIM = 4
okved_lists = reg['okved_list'].str.split('; ').fillna("").apply(list)
# Category ids start at 1 so that 0 can be used as the padding value.
okved_categories = okved_lists.explode().dropna().unique()
okved_mapping = {category: code for code, category in enumerate(okved_categories, 1)}
# Map each code to its id, then pad with zeros / truncate to exactly
# OKVED_DIM entries per row.
okved_codes = okved_lists.apply(
    lambda cats: ([okved_mapping[cat] for cat in cats] + [0] * OKVED_DIM)[:OKVED_DIM]
)
# Iterate over OKVED_DIM (not a literal 4) so the number of generated columns
# always matches the padded list length.
for i in range(OKVED_DIM):
    reg[f'okved_list_{i}'] = okved_codes.apply(lambda codes, pos=i: codes[pos])
reg = reg.drop(columns=['okved_list'])

In [10]:
# Attach the `passed` label to every regulation by id, then split the rows:
# those with a label stay in `X`, those without one become `X_test`.
# Both frames keep the `passed` column (it is all-NaN in `X_test`).
X = reg.set_index('id').join(target.set_index('id'))
has_label = X['passed'].notna()
X_test, X = X.loc[~has_label], X.loc[has_label]

In [12]:
# Use the %pip magic (not `!pip`) so the package is installed into the
# environment of the running kernel.
# NOTE(review): the recorded run resolved this to catboost 1.0.3; pin a version
# here once one compatible with the target runtime has been confirmed.
%pip install catboost

Collecting catboost
  Downloading catboost-1.0.3-cp37-none-manylinux1_x86_64.whl (76.3 MB)
[K     |████████████████████████████████| 76.3 MB 34 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.3


In [14]:
from catboost import CatBoostClassifier
# Create an empty classifier and load previously saved model state from Drive.
model = CatBoostClassifier()
# Left as the cell's last expression, so the notebook displays its return value.
model.load_model("drive/MyDrive/data/laws/brute")

<catboost.core.CatBoostClassifier at 0x7fd82773af10>

In [16]:
# Predict class probabilities for the unlabelled rows.
# NOTE(review): X_test still contains the `passed` column (all NaN) left over
# from the join with `target` — confirm the loaded model expects or ignores it.
result = model.predict_proba(X_test)

In [17]:
# Build the submission table, using the sample submission as the template
# for the expected columns.
submission = pd.read_csv('drive/MyDrive/data/laws/sample_submission.csv')
submission['id'] = X_test.index
# `predict_proba` returns one probability column per class, so a single
# column has to be selected before it can be stored in `passed`; assigning
# the whole 2-D array to one column fails.
# The value kept is 1 - P(first class), which for a binary model equals the
# probability of the second class.
# NOTE(review): presumably the second class is passed == 1 — verify against
# model.classes_.
submission['passed'] = 1 - result[:, 0]

In [19]:
# Write the submission to the current working directory, without the
# DataFrame index column.
submission.to_csv('my_submission.csv', index=False)