In [1]:
# Load IPython's autoreload extension; the reload mode itself is only selected
# later in the notebook by the `%autoreload 2` line.
%load_ext autoreload
import os
import sys
import pandas as pd
from IPython.display import display
# Make the project sources importable; both entries must be on sys.path before
# `import streamlit_funcs` runs. The paths are relative to the working directory.
sys.path.append('../src/utils')
sys.path.append('../src/')
import streamlit_funcs

In [2]:
# Lift pandas' display limits: show every column and never truncate cell text.
pd.set_option(
    'display.max_columns', None,
    'display.max_colwidth', None,
)

In [3]:
# Input folders for the two databases, relative to the notebook's working directory.
DB1_FOLDER, DB2_FOLDER = '../data/db1', '../data/db2'

In [4]:
# Load the first database (DB1) from DB1_FOLDER with the project loader.
# Later cells use it as a DataFrame with 'catch_date', 'id_ves', 'id_fish' and 'catch_volume'.
db1 = streamlit_funcs.load_db1(DB1_FOLDER)

In [5]:
# load_db2 returns two objects read from DB2_FOLDER; they are only used as inputs
# to streamlit_funcs.get_db2 (presumably two separate extracts of DB2 — TODO confirm).
ext1, ext2 = streamlit_funcs.load_db2(DB2_FOLDER)

In [6]:
# Build the single DB2 frame from the two loaded parts.
# Later cells read its 'id_ves', 'date', 'id_fish', 'fish' and 'volume' columns.
db2 = streamlit_funcs.get_db2(ext1, ext2)

# DB1

In [7]:
# Rename 'catch_date' to 'date' so DB1 uses the same date column name that the
# DB2 aggregation and the later merge rely on. Note: this edits db1 in place.
db1.columns = ['date' if col == 'catch_date' else col for col in db1.columns]
# Keep the calendar date only.
db1['date'] = pd.to_datetime(db1['date']).dt.date
# Total catch volume per vessel, day and fish id.
db1_aggregated = db1.groupby(['id_ves', 'date', 'id_fish'], as_index=False)['catch_volume'].sum()
display(db1_aggregated.head())

Unnamed: 0,id_ves,date,id_fish,catch_volume
0,2,2022-01-10,848,17.0
1,2,2022-01-11,848,20.0
2,2,2022-01-22,849,1342.0
3,2,2022-01-23,849,2078.0
4,2,2022-01-24,849,97.0


# DB2

In [8]:
# Total DB2 volume per vessel, day and fish id; both id columns are cast to int
# (presumably so they match the DB1 key dtypes in the merge below — TODO confirm).
db2_aggregated = (
    db2
    .groupby(['id_ves', 'date', 'id_fish'])['volume']
    .sum()
    .reset_index()
    .astype({'id_ves': int, 'id_fish': int})
)

In [9]:
# Lookup table of distinct (id_fish, fish) pairs taken from DB2.
# NOTE(review): if one id_fish appears with several names, the left merge on
# 'id_fish' further down will duplicate rows — verify the pairs are one-to-one.
fishes = db2.loc[:, ['id_fish', 'fish']].drop_duplicates()

In [10]:
# Rescaled copies of the DB2 volume (1/1000 and 1/100), kept next to the raw
# value; the display header further down marks the raw value's unit as unknown.
db2_aggregated['volume_div_1000'] = db2_aggregated['volume'].div(1000)
db2_aggregated['volume_div_100'] = db2_aggregated['volume'].div(100)

# Keep only vessel / fish / date combinations present in both databases,
# then attach the fish name.
joined_bases = (
    db1_aggregated
    .merge(db2_aggregated, how='inner', on=['id_ves', 'id_fish', 'date'])
    .merge(fishes, how='left', on='id_fish')
)

# calc_mismatch is a project helper; the code below relies on it providing a
# 'mismatch, %' column.
joined_bases = streamlit_funcs.calc_mismatch(joined_bases)

# Flag rows whose mismatch exceeds the threshold; the flag is stored as 0/1.
threshold = 25
joined_bases = joined_bases.assign(
    threshold_volume=threshold,
    is_abnormal=lambda frame: (frame['mismatch, %'] > frame['threshold_volume']).astype(int),
)

# Rows returned by find_good_by_shift replace the rows whose index labels are
# listed in its 'index' column (presumably records that match once the date is
# shifted by up to `shift` days — TODO confirm against the helper).
new_good = streamlit_funcs.find_good_by_shift(joined_bases, db2_aggregated, shift=3)
new_good_index = new_good['index'].tolist()
new_good = new_good.drop(columns='index')
joined_bases = pd.concat([joined_bases.drop(index=new_good_index), new_good]).sort_index()

# Output columns in display order, mapped to the Russian headers of the report.
# NOTE: the header for 'fish' contains a typo; it is kept verbatim so the labels
# keep matching the output of aggregate_db1_db2_table, which is compared
# cell by cell with this frame later in the notebook.
colnames_map = {
    'id_ves': 'id судна (id_ves)',
    'date': 'дата',
    'id_fish': 'id рыбы',
    'fish': 'назавание рыбы',
    'catch_volume': 'улов',
    'volume': 'внесено в базу (размерность неизвестна)',
    'volume_div_1000': 'внесено в базу (коррекция 1/1000)',
    'volume_div_100': 'внесено в базу (коррекция 1/100)',
    'mismatch, %': 'отклонение внесенного от выловленного, %',
    'threshold_volume': 'порог отклонения, %',
    'is_abnormal': 'является ли подозрительным',
}
col_order = list(colnames_map)
col_names = list(colnames_map.values())

joined_bases = joined_bases[col_order].rename(columns=colnames_map)

# Keep only the flagged rows, largest mismatch first.
joined_bases = joined_bases[joined_bases['является ли подозрительным'] == 1].sort_values(
    by='отклонение внесенного от выловленного, %',
    ascending=False,
)

In [11]:
# Autoreload mode 2: re-import modules before running code, so edits to
# streamlit_funcs are picked up without restarting the kernel.
%autoreload 2
# Run the packaged version of the pipeline with the same threshold (25) and
# shift (3) that the inline cell above uses, to compare the two results.
test_joined = streamlit_funcs.aggregate_db1_db2_table(db1, db2, threshold=25, shift=3)

In [12]:
# Show the flagged rows produced by the inline pipeline (sorted by mismatch, descending).
joined_bases

Unnamed: 0,id судна (id_ves),дата,id рыбы,назавание рыбы,улов,внесено в базу (размерность неизвестна),внесено в базу (коррекция 1/1000),внесено в базу (коррекция 1/100),"отклонение внесенного от выловленного, %","порог отклонения, %",является ли подозрительным
29117,1510,2022-01-11,292,треска,670.0,0,0.000,0.00,100.00,25,1
29114,1510,2022-01-08,403,навага,30.0,0,0.000,0.00,100.00,25,1
29126,1510,2022-01-13,677,бычки,200.0,0,0.000,0.00,100.00,25,1
29125,1510,2022-01-13,451,камбалы дальневосточные,50.0,0,0.000,0.00,100.00,25,1
29115,1510,2022-01-08,451,камбалы дальневосточные,300.0,0,0.000,0.00,100.00,25,1
...,...,...,...,...,...,...,...,...,...,...,...
18563,1424,2022-02-26,692,терпуг,539.0,376,0.376,3.76,30.24,25,1
13754,1383,2022-01-24,617,зубатка пестрая,228.0,163,0.163,1.63,28.51,25,1
18615,1424,2022-03-19,112,палтус белокорый,1316.0,947,0.947,9.47,28.04,25,1
23413,1460,2022-02-04,147,окунь золотистый,2786.0,3494,3.494,34.94,25.41,25,1


In [13]:
# Count the cells where the packaged function and the inline pipeline disagree;
# 0 means both produce the same table. `!=` is used on purpose: it raises if
# the two frames are not identically labelled.
diff_mask = test_joined != joined_bases
diff_mask.sum().sum()

0