In [84]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

In [None]:
from utils import UtilsKy

In [None]:
np.random.seed(1)

In [4]:
df_operations = pd.read_csv(UtilsKy.GRAFANA_OPERATIONS, dtype=str, 
                            usecols = ['id', 'created', 'amount_usd', 'project_id', 'type', 'pan'])

In [7]:
df = df_operations[df_operations.type == 'charge'].copy()

In [8]:
df['pan'] = df['pan'].str[:6]

In [16]:
min_date = min(df.created)

In [17]:
max_date = max(df.created)

In [18]:
from datetime import timedelta, datetime

In [19]:
date_start_test = datetime.strptime(max_date, '%Y-%m-%d %H:%M:%S') - timedelta(days=14)

In [20]:
start_test = date_start_test.strftime('%Y-%m-%d %H:%M:%S')

In [35]:
teach =  df[(df.created < start_test)].copy()
test = df[(df.created >= start_test)].copy()

In [36]:
teach = teach.sample(n=100000, random_state=1)
test = test.sample(n=100000, random_state=1)

In [37]:
col_factors =['pan','amount_usd']
teach[col_factors] = teach[col_factors].apply(pd.to_numeric, errors="coerce")
test[col_factors] = test[col_factors].apply(pd.to_numeric, errors="coerce")

In [38]:
st_pid = teach.groupby('project_id').amount_usd.agg('mean').to_frame()

In [39]:
st_pid.columns

Index(['amount_usd'], dtype='object')

In [40]:
st_pid.reset_index(level=0, inplace=True)

In [41]:
st_pid.columns = ["project_id", 'mean_amount_usd_in_pid']

In [42]:
teach = pd.merge(teach, st_pid, how = 'left', on = ['project_id']).copy()
test = pd.merge(test, st_pid, how = 'left', on = ['project_id']).copy()

In [45]:
teach['avg_amount_usd_in_pid'] = teach.amount_usd/ teach.mean_amount_usd_in_pid
test['avg_amount_usd_in_pid'] = test.amount_usd/ test.mean_amount_usd_in_pid

In [46]:
teach.head()

Unnamed: 0,id,type,project_id,created,pan,amount_usd,mean_amount_usd_in_pid,avg_amount_usd_in_pid
0,134254234,charge,224f3d3120309f994e744af6ed64dddc18a82888,2020-03-06 10:25:52,414949,5.217,5.803871,0.898883
1,130036244,charge,51dfe45c685dbb9ac1b650122838facfb4f209f1,2020-02-07 02:41:35,489504,200.0,141.358674,1.414841
2,125652873,charge,8901f0b3f6f78a91dd7837199883454456f2ae37,2020-01-12 02:21:09,402944,26.48,94.430482,0.280418
3,124716124,charge,224f3d3120309f994e744af6ed64dddc18a82888,2020-01-07 06:43:07,411997,1.147,5.803871,0.197627
4,128324238,charge,224f3d3120309f994e744af6ed64dddc18a82888,2020-01-26 15:33:34,516922,3.422,5.803871,0.589607


In [52]:
col_factors = ['mean_amount_usd_in_pid', 'amount_usd']

In [86]:
replace_val = -9999

db_teach = teach[col_factors].fillna(replace_val)
db_test = test[col_factors].fillna(replace_val)

teach_d = db_teach.values
test_d = db_test.values

In [87]:
#db_teach.head()

In [57]:
#----------------------------------------------------------------------------------IsolationForest

In [88]:
model=IsolationForest(n_estimators=50, max_samples='auto', contamination=float(0.1),max_features=1.0)

In [89]:
model.fit(teach_d)

IsolationForest(contamination=0.1, n_estimators=50)

In [90]:
model.fit(test_d)

IsolationForest(contamination=0.1, n_estimators=50)

In [100]:
teach['score'] = model.decision_function(teach_d)
teach['anomaly'] = model.predict(teach_d)

In [101]:
test['score'] = model.decision_function(test_d)
test['anomaly'] = model.predict(test_d)

In [102]:
teach.head()

Unnamed: 0,mean_amount_usd_in_pid,amount_usd,preds,score,anomaly
0,5.803871,5.217,1,0.185061,1
1,141.358674,200.0,-1,-0.111062,-1
2,94.430482,26.48,-1,0.068805,1
3,5.803871,1.147,1,0.18537,1
4,5.803871,3.422,1,0.190996,1


In [104]:
teach.anomaly.value_counts()

 1    93414
-1     6586
Name: anomaly, dtype: int64

In [105]:
teach_anomaly = teach[teach.anomaly == -1]
test_anomaly = test[test.anomaly == -1]

In [106]:
min(test_anomaly.score)

-0.3402100728142933

In [107]:
max(test_anomaly.score)

-1.1347983426390762e-05

In [71]:
preds_teach = clf.fit_predict(teach_d)
preds_test = clf.fit_predict(test_d)

In [79]:
teach['preds'] = preds_teach
test['preds'] = preds_test

In [80]:
teach_anomaly = teach[teach.preds == -1]
test_anomaly = test[test.preds == -1]

In [82]:
teach_anomaly.preds.value_counts()

-1    24242
Name: preds, dtype: int64

In [83]:
test_anomaly.preds.value_counts()

-1    18246
Name: preds, dtype: int64