In [7]:
import matplotlib.pyplot as plt  # plots
import numpy as np  # vectors and matrices
import pandas as pd  # tables and data manipulations
import seaborn as sns  # more plots
from sklearn.metrics import (mean_absolute_error, mean_squared_error,
                             mean_squared_log_error, median_absolute_error,
                             r2_score)
import warnings  
from itertools import product  # some useful functions
import scipy.stats as scs
import statsmodels.api as sm
import statsmodels.formula.api as smf  # statistics and econometrics
import statsmodels.tsa.api as smt
from dateutil.relativedelta import relativedelta  # working with dates with style
from scipy.optimize import minimize  # for function minimization
from tqdm.notebook import tqdm

In [8]:
# Apply seaborn's plot styling to every figure drawn in this notebook.
sns.set()
warnings.filterwarnings("ignore") # "do not disturb" mode: silences ALL warnings, including deprecation notices

# IPython magics: draw figures inline and request the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [9]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Computes ``mean(|(y_true - y_pred) / y_true|) * 100``.

    Parameters
    ----------
    y_true : array-like
        Observed values. Any zero entry makes the corresponding term
        infinite or NaN, because it is used as the divisor.
    y_pred : array-like
        Predicted values, element-wise comparable with ``y_true``.

    Returns
    -------
    float
        The MAPE in percent (e.g. ``10.0`` means a 10 % average error).
    """
    # Plain Python sequences do not support element-wise arithmetic
    # (``list - list`` raises TypeError), so convert them. NumPy arrays and
    # pandas objects are passed through untouched so that their own
    # semantics (e.g. index alignment of Series) are preserved.
    if isinstance(y_true, (list, tuple)):
        y_true = np.asarray(y_true)
    if isinstance(y_pred, (list, tuple)):
        y_pred = np.asarray(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [10]:
def moving_average(series, n):
    """Return the average of the trailing ``n`` items of ``series``.

    ``series`` is sliced with ``series[-n:]``, so when ``n`` exceeds the
    length the whole sequence is averaged; ``n == 0`` also selects the whole
    sequence because ``-0`` is ``0``.
    """
    trailing = series[-n:]
    return np.average(trailing)

In [11]:
def anomalies_bonds(series, window=4, scale=1.96):
    """Build a rolling-mean confidence band and pick out values outside it.

    Parameters
    ----------
    series : pandas.DataFrame
        Observations to analyse. It must expose ``.columns`` and ``.index``;
        presumably a single-column frame, since each derived frame is stored
        as one column of the result -- TODO confirm against callers.
    window : int, default 4
        Size of the rolling window used for the mean. The first ``window``
        rows are also left out of the error statistics.
    scale : float, default 1.96
        Multiplier applied to the standard deviation of the residuals.

    Returns
    -------
    pandas.DataFrame
        Columns ``lower_bond`` and ``upper_bond`` (rolling mean minus/plus
        ``mae + scale * deviation``) and ``anomalies``, which holds the
        original value where it falls outside the band and is empty elsewhere.
    """
    smoothed = series.rolling(window=window).mean()

    # Residual statistics are taken only from row `window` onwards.
    observed_tail = series[window:]
    smoothed_tail = smoothed[window:]
    mae = mean_absolute_error(observed_tail, smoothed_tail)
    deviation = np.std(observed_tail - smoothed_tail)

    # Half-width of the band around the rolling mean.
    margin = mae + scale * deviation
    lower_bond = smoothed - margin
    upper_bond = smoothed + margin

    # Copy over only the observations that escape the band.
    below_band = series < lower_bond
    above_band = series > upper_bond
    outliers = pd.DataFrame(index=series.index, columns=series.columns)
    outliers[below_band] = series[below_band]
    outliers[above_band] = series[above_band]

    result = pd.DataFrame()
    result['lower_bond'] = lower_bond
    result['upper_bond'] = upper_bond
    result['anomalies'] = outliers

    return result

In [20]:
def sarima_anomalies_detect(load_path):
    """Flag anomalous per-date catch totals for each vessel and save them as CSV.

    The table at ``load_path`` is read, ``catch.volume`` is summed per
    ``date`` for every vessel (``id.ves``), and vessels with more than 20
    dated totals are passed to ``anomalies_bonds`` to obtain the rolling-mean
    band and the out-of-band values. Note that, despite the function name,
    the detection itself is whatever ``anomalies_bonds`` implements.

    The combined result is written to
    ``<load_path up to the first ".csv">_sarima_predict.csv`` with the
    columns ``date``, ``catch.volume``, ``id.ves``, ``id.own``,
    ``lower_bond``, ``upper_bond`` and ``anomalies`` (1 = anomaly, 0 = normal).

    Parameters
    ----------
    load_path : str or os.PathLike
        Path of the input CSV. It must contain the columns ``id.ves``,
        ``id.own``, ``date`` and ``catch.volume``.

    Returns
    -------
    None
        The result is only written to disk.
    """
    expected_columns = ['date', 'catch.volume', 'id.ves', 'id.own',
                        'lower_bond', 'upper_bond', 'anomalies']

    dataframe = pd.read_csv(load_path)

    # Collect the per-vessel tables and concatenate once at the end instead of
    # growing a DataFrame inside the loop (which copies all rows every time).
    per_vessel = []
    # sort=False keeps vessels in order of first appearance, and a single
    # groupby avoids re-filtering the whole table for every vessel.
    for ves, vessel_rows in dataframe.groupby('id.ves', sort=False):
        aggr_df = vessel_rows.groupby("date")[["catch.volume"]].sum()
        # Too few dated totals to build a meaningful rolling band.
        if aggr_df.size <= 20:
            continue
        anomalies = anomalies_bonds(aggr_df)
        aggr_df['id.ves'] = ves
        aggr_df['id.own'] = vessel_rows['id.own'].max()
        aggr_df = pd.concat([aggr_df, anomalies], axis=1)
        per_vessel.append(aggr_df.reset_index())

    if per_vessel:
        rate_ves = pd.concat(per_vessel, axis=0)
    else:
        # No vessel qualified: still emit a well-formed (empty) result rather
        # than failing on a missing 'anomalies' column.
        rate_ves = pd.DataFrame(columns=expected_columns)

    # anomalies_bonds stores the raw out-of-band value and leaves the rest
    # empty, so "is a value present" is the anomaly flag. Testing `> 0` would
    # miss anomalous totals that are zero or negative.
    rate_ves['anomalies'] = rate_ves['anomalies'].notna().astype(int)

    # str() lets pathlib.Path inputs work as well as plain strings.
    path_stem = str(load_path).split(".csv")[0]
    save_path = f'{path_stem}_sarima_predict.csv'
    rate_ves.to_csv(save_path)

In [21]:
# NOTE(review): absolute, machine-specific path -- this cell will fail on any
# other machine until it is pointed at a local copy of catch.csv.
load_path = '/home/savin/Documents/DEV/dataset_fish/Датасет/test_data/db1/catch.csv'
# Writes its output alongside the input, with the "_sarima_predict.csv" suffix.
sarima_anomalies_detect(load_path)