In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

In [5]:
from utils import UtilsKy

In [6]:
np.random.seed(1)

In [8]:
df_operations = pd.read_csv(UtilsKy.GRAFANA_OPERATIONS, dtype=str, 
                            usecols = ['id', 'created', 'amount_usd', 'project_id', 'type', 'pan'])

In [9]:
df = df_operations[df_operations.type == 'charge'].copy()

In [10]:
df['pan'] = df['pan'].str[:6]

In [11]:
min_date = min(df.created)

In [12]:
max_date = max(df.created)

In [13]:
from datetime import timedelta, datetime

In [14]:
date_start_test = datetime.strptime(max_date, '%Y-%m-%d %H:%M:%S') - timedelta(days=14)

In [15]:
start_test = date_start_test.strftime('%Y-%m-%d %H:%M:%S')

In [16]:
teach =  df[(df.created < start_test)].copy()
test = df[(df.created >= start_test)].copy()

In [17]:
teach = teach.sample(n=100000, random_state=1)
test = test.sample(n=100000, random_state=1)

In [18]:
col_factors =['pan','amount_usd']
teach[col_factors] = teach[col_factors].apply(pd.to_numeric, errors="coerce")
test[col_factors] = test[col_factors].apply(pd.to_numeric, errors="coerce")

In [19]:
st_pid = teach.groupby('project_id').amount_usd.agg('mean').to_frame()

In [20]:
st_pid.columns

Index(['amount_usd'], dtype='object')

In [21]:
st_pid.reset_index(level=0, inplace=True)

In [22]:
st_pid.columns = ["project_id", 'mean_amount_usd_in_pid']

In [23]:
teach = pd.merge(teach, st_pid, how = 'left', on = ['project_id']).copy()
test = pd.merge(test, st_pid, how = 'left', on = ['project_id']).copy()

In [24]:
teach['avg_amount_usd_in_pid'] = teach.amount_usd/ teach.mean_amount_usd_in_pid
test['avg_amount_usd_in_pid'] = test.amount_usd/ test.mean_amount_usd_in_pid

In [25]:
teach.head()

Unnamed: 0,id,type,project_id,created,pan,amount_usd,mean_amount_usd_in_pid,avg_amount_usd_in_pid
0,134254234,charge,224f3d3120309f994e744af6ed64dddc18a82888,2020-03-06 10:25:52,414949,5.217,5.803871,0.898883
1,130036244,charge,51dfe45c685dbb9ac1b650122838facfb4f209f1,2020-02-07 02:41:35,489504,200.0,141.358674,1.414841
2,125652873,charge,8901f0b3f6f78a91dd7837199883454456f2ae37,2020-01-12 02:21:09,402944,26.48,94.430482,0.280418
3,124716124,charge,224f3d3120309f994e744af6ed64dddc18a82888,2020-01-07 06:43:07,411997,1.147,5.803871,0.197627
4,128324238,charge,224f3d3120309f994e744af6ed64dddc18a82888,2020-01-26 15:33:34,516922,3.422,5.803871,0.589607


In [26]:
col_factors = ['mean_amount_usd_in_pid', 'amount_usd']

In [27]:
replace_val = -9999

db_teach = teach[col_factors].fillna(replace_val)
db_test = test[col_factors].fillna(replace_val)

teach_d = db_teach.values
test_d = db_test.values

In [87]:
#db_teach.head()

In [57]:
#----------------------------------------------------------------------------------IsolationForest

In [28]:
model=IsolationForest(n_estimators=50, max_samples='auto', contamination=float(0.1),max_features=1.0)

In [29]:
model.fit(teach_d)

IsolationForest(contamination=0.1, n_estimators=50)

In [30]:
model.fit(test_d)

IsolationForest(contamination=0.1, n_estimators=50)

In [31]:
teach['score'] = model.decision_function(teach_d)
teach['anomaly'] = model.predict(teach_d)

In [32]:
test['score'] = model.decision_function(test_d)
test['anomaly'] = model.predict(test_d)

In [33]:
teach.head()

Unnamed: 0,id,type,project_id,created,pan,amount_usd,mean_amount_usd_in_pid,avg_amount_usd_in_pid,score,anomaly
0,134254234,charge,224f3d3120309f994e744af6ed64dddc18a82888,2020-03-06 10:25:52,414949,5.217,5.803871,0.898883,0.180493,1
1,130036244,charge,51dfe45c685dbb9ac1b650122838facfb4f209f1,2020-02-07 02:41:35,489504,200.0,141.358674,1.414841,-0.121223,-1
2,125652873,charge,8901f0b3f6f78a91dd7837199883454456f2ae37,2020-01-12 02:21:09,402944,26.48,94.430482,0.280418,0.074856,1
3,124716124,charge,224f3d3120309f994e744af6ed64dddc18a82888,2020-01-07 06:43:07,411997,1.147,5.803871,0.197627,0.181233,1
4,128324238,charge,224f3d3120309f994e744af6ed64dddc18a82888,2020-01-26 15:33:34,516922,3.422,5.803871,0.589607,0.181336,1


In [34]:
teach.anomaly.value_counts()

 1    93349
-1     6651
Name: anomaly, dtype: int64

In [35]:
teach_anomaly = teach[teach.anomaly == -1]
test_anomaly = test[test.anomaly == -1]

In [66]:
min(teach_anomaly.score)

-0.35713925387970646

In [68]:
max(teach_anomaly.score)

-4.3584614744296424e-05

In [40]:
teach_neg_score = teach[teach.score < 0]
test_neg_score = test[test.score < 0]

In [85]:
teach_neg_3 = teach[teach.score < -0.3]

In [87]:
#teach_neg_3.score.value_counts()

In [88]:
teach_neg_3

Unnamed: 0,id,type,project_id,created,pan,amount_usd,mean_amount_usd_in_pid,avg_amount_usd_in_pid,score,anomaly
3132,133672243,charge,f45a5bbc33fd242e9dc499bf05cbc959de0806a9,2020-03-02 11:36:10,427666,1800.00000,649.012308,2.773445,-0.322626,-1
3718,127447607,charge,d816262b4b2aa834b6b4c6d989b48c03b056f2b4,2020-01-21 14:42:10,546969,1000.00000,2309.765806,0.432944,-0.320375,-1
3908,128479128,charge,8e7ca09b25c7e959b55c2790b6a2bdcb5d56d2cb,2020-01-27 14:53:46,516949,2508.00000,906.300000,2.767296,-0.335119,-1
4279,130191551,charge,3b51cee05a380029cab7a06e876d2c006b500752,2020-02-07 23:21:20,490638,1673.50860,524.451450,3.190970,-0.322626,-1
4711,126292493,charge,f0660cc875a7f0cfa90cab60dda8eef9f4121e9c,2020-01-15 17:44:11,427901,2100.00000,359.884054,5.835213,-0.321500,-1
5068,134113420,charge,d816262b4b2aa834b6b4c6d989b48c03b056f2b4,2020-03-05 12:51:36,427616,1001.00000,2309.765806,0.433377,-0.320375,-1
5610,133372906,charge,f5213d04f0f312c47554f1c23d2bfdbae686c95c,2020-02-29 16:11:25,406587,1100.00000,651.079484,1.689502,-0.306993,-1
6344,127921577,charge,51dfe45c685dbb9ac1b650122838facfb4f209f1,2020-01-24 12:21:55,454314,3000.00000,141.358674,21.222610,-0.318129,-1
6359,133801577,charge,f0660cc875a7f0cfa90cab60dda8eef9f4121e9c,2020-03-03 10:38:51,553691,2694.61000,359.884054,7.487439,-0.331693,-1
6592,134732145,charge,f0660cc875a7f0cfa90cab60dda8eef9f4121e9c,2020-03-09 12:32:58,470127,3360.00000,359.884054,9.336340,-0.331693,-1
