In [1]:
import numpy as np
import pandas as pd

In [3]:
from pyod.models.knn import KNN

In [4]:
from pyod.utils.data import generate_data

In [5]:
from pyod.utils.data import evaluate_print

In [6]:
from utils import UtilsKy

In [7]:
# TODO: re-enable once the import stops failing with "cannot import name 'njit'":
# from pyod.utils.example import visualize

In [7]:
# Load only the columns used below; dtype=str keeps every field as text at load time.
df_operations = pd.read_csv(
    UtilsKy.GRAFANA_OPERATIONS,
    usecols=['id', 'created', 'amount_usd', 'project_id', 'type', 'pan'],
    dtype=str,
)

In [8]:
# Show the loaded column names as a plain list.
list(df_operations.columns)

['id', 'type', 'project_id', 'created', 'pan', 'amount_usd']

In [9]:
# Keep only rows whose 'type' is 'charge'; copy so later edits do not touch df_operations.
df = df_operations.loc[df_operations['type'] == 'charge'].copy()

In [10]:
# Truncate 'pan' to its first six characters.
df['pan'] = df['pan'].str.slice(stop=6)

In [11]:
# Earliest 'created' value. Use the pandas reduction on the non-null values
# instead of iterating the Series with the builtin min(), which fails when a
# missing value is compared with a string.
min_date = df['created'].dropna().min()
min_date

'2020-01-01 00:00:10'

In [13]:
# Latest 'created' value. Use the pandas reduction on the non-null values
# instead of iterating the Series with the builtin max(), which fails when a
# missing value is compared with a string.
max_date = df['created'].dropna().max()
max_date

'2020-03-25 23:59:57'

In [14]:
from datetime import timedelta, datetime

In [15]:
# The test window starts 14 days before the latest observed timestamp.
date_start_test = (
    datetime.strptime(max_date, '%Y-%m-%d %H:%M:%S')
    - timedelta(days=14)
)

In [16]:
# Render the cut-off back to the same text layout as the 'created' column so
# it can be compared against those strings.
start_test = format(date_start_test, '%Y-%m-%d %H:%M:%S')
start_test

'2020-03-11 23:59:57'

In [17]:
# Chronological split on the 'created' strings: rows before the cut-off go to
# the training frame, rows at or after it go to the test frame.
teach = df.loc[df['created'] < start_test].copy()
test = df.loc[df['created'] >= start_test].copy()

In [18]:
# Down-sample both frames to at most SAMPLE_SIZE rows with a fixed seed.
# Capping at len(frame) avoids the ValueError that DataFrame.sample raises
# (without replacement) when a split holds fewer rows than requested.
SAMPLE_SIZE = 100000
teach = teach.sample(n=min(SAMPLE_SIZE, len(teach)), random_state=1)
test = test.sample(n=min(SAMPLE_SIZE, len(test)), random_state=1)

In [19]:
# Feature columns, converted from text to numbers; values that cannot be
# parsed become NaN (errors="coerce").
col_factors = ['amount_usd', 'pan']
teach[col_factors] = teach[col_factors].apply(lambda col: pd.to_numeric(col, errors="coerce"))
test[col_factors] = test[col_factors].apply(lambda col: pd.to_numeric(col, errors="coerce"))

In [None]:
# amount_avg_in_pid ----------------------------------------------------------------------------------------------

In [31]:
# Mean amount_usd per project_id, computed on the training frame only.
st_pid = teach.groupby('project_id')[['amount_usd']].mean()

In [32]:
# Inspect the column labels of the per-project aggregate.
st_pid.columns

Index(['amount_usd'], dtype='object')

In [33]:
# Move the group key out of the index into a regular column.
st_pid = st_pid.reset_index(level=0)

In [34]:
# Give the aggregated column its feature name; the key column is already
# called 'project_id' after the index reset.
st_pid = st_pid.rename(columns={'amount_usd': 'mean_amount_usd_in_pid'})

In [35]:
# Attach the training-set per-project mean to both frames (left join keeps
# every operation row).
teach = teach.merge(st_pid, how='left', on='project_id')
test = test.merge(st_pid, how='left', on='project_id')

In [36]:
# Ratio of each amount to the mean amount of its project.
teach['average_amount_usd_in_pid'] = teach['amount_usd'].div(teach['mean_amount_usd_in_pid'])
test['average_amount_usd_in_pid'] = test['amount_usd'].div(test['mean_amount_usd_in_pid'])

In [150]:
#test.head()

In [38]:
#teach.head()

In [39]:
# Register the per-project features. Names already present are skipped so that
# re-running this cell does not duplicate columns in the feature matrix.
col_factors.extend([
    name
    for name in ('mean_amount_usd_in_pid', 'average_amount_usd_in_pid')
    if name not in col_factors
])

In [51]:
# Display the current feature list.
# NOTE(review): the recorded output lists the *_in_pan features, which matches a
# different col_factors assignment than the cells directly above — likely
# out-of-order execution; confirm with Restart & Run All.
col_factors

['avg_amount_usd_in_pan', 'mean_amount_usd_in_pan', 'amount_usd']

In [41]:
# Build the plain numeric matrices handed to the models. Missing values are
# replaced by a sentinel first.
# NOTE(review): a sentinel of -9999 sits far from the real values, which may
# dominate distance-based models — confirm this is intended.
replace_val = -9999

db_teach = teach.loc[:, col_factors].fillna(value=replace_val)
teach_d = db_teach.values

db_test = test.loc[:, col_factors].fillna(value=replace_val)
test_d = db_test.values

In [42]:
# Train the pyod kNN outlier detector on the training feature matrix.
clf_name = 'KNN'
clf = KNN()  # library defaults; the recorded repr below shows the resolved parameters
# clf = KNN(n_neighbors=5, method='largest')
# Last expression of the cell, so the value returned by fit() is displayed.
clf.fit(teach_d)

KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
  metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2,
  radius=1.0)

In [43]:
# Score the test matrix with the fitted detector:
#   pred   - outlier labels (0 or 1)
#   scores - raw outlier scores
test = test.assign(
    pred=clf.predict(test_d),
    scores=clf.decision_function(test_d),
)

In [44]:
# Rows the detector labelled 1.
test_anomaly = test.loc[test['pred'].eq(1)]

In [45]:
# test_anomaly

In [54]:
from sklearn.cluster import KMeans

In [110]:
# Fit a seeded 10-cluster k-means model on the training matrix.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(teach_d);  # trailing ';' keeps the cell output empty, as before

In [111]:
# Cluster label assigned to each training row during fit.
kmeans.labels_

array([0, 5, 0, ..., 0, 5, 0], dtype=int32)

In [112]:
# Store the k-means cluster id of every training row in 'pred'.
teach = teach.assign(pred=kmeans.predict(teach_d))

In [113]:
# Cluster sizes on the training frame.
teach['pred'].value_counts()

0    88606
5     9854
4     1060
2      274
6       94
8       71
1       24
7       11
9        4
3        2
Name: pred, dtype: int64

In [101]:
#pd.crosstab(index=teach.pan, columns=teach.pred, margins=True)

In [116]:
# Rows assigned to cluster 9. Take an explicit copy: a later cell adds a column
# to this frame, and writing into a filtered slice of `teach` triggers pandas'
# SettingWithCopyWarning.
teach_pred = teach.loc[teach['pred'] == 9].copy()

In [67]:
#teach_pred

In [56]:
# 'fd' = first character of the textual form of 'pan'.
# assign() returns a new frame, so nothing is written into a slice of `teach`
# (avoids SettingWithCopyWarning), and the vectorised string ops replace the
# per-element Python lambda.
teach_pred = teach_pred.assign(fd=teach_pred['pan'].astype(str).str[:1])

In [56]:
#teach_pred

In [57]:
# Distribution of the first 'pan' character within the selected cluster.
teach_pred['fd'].value_counts()

4    6193
Name: fd, dtype: int64

In [58]:
from matplotlib import pyplot as plt

In [59]:
from sklearn.cluster import KMeans

# Re-fit k-means on the training matrix. random_state is fixed (matching the
# other KMeans fits in this notebook) so that cluster ids are reproducible
# between runs; the initialisation is otherwise random.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(teach_d)
y_kmeans = kmeans.predict(teach_d)

In [60]:
#plt.scatter(teach_d[:, 0], teach_d[:, 1], c=y_kmeans, s=25, cmap='viridis')

In [61]:
# Centroid coordinates of the fitted k-means model (one row per cluster).
centers = kmeans.cluster_centers_

In [62]:
#plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)

In [63]:
# Assign every test row to its nearest fitted cluster.
test = test.assign(pred=kmeans.predict(test_d))

In [64]:
# KMeans.score() returns a single scalar for the whole matrix, so assigning it
# to a column broadcast the same number to every row. Store a per-row value
# instead: the negated squared distance to the nearest centroid. Summed over
# all rows this equals kmeans.score(test_d) up to floating-point error.
test['score'] = -(kmeans.transform(test_d).min(axis=1) ** 2)

In [65]:
#test.pred.value_counts()

In [None]:
# amount_average_in_pan ---------------------------------------------------------------------------------------

In [20]:
# Mean amount_usd per 'pan' value, computed on the training frame only.
st_pan = teach.groupby('pan')[['amount_usd']].mean()

In [21]:
# Move the group key out of the index into a regular column.
st_pan = st_pan.reset_index(level=0)

In [22]:
# Give the aggregated column its feature name; the key column is already
# called 'pan' after the index reset.
st_pan = st_pan.rename(columns={'amount_usd': 'mean_amount_usd_in_pan'})

In [23]:
# Attach the training-set per-pan mean to both frames (left join keeps every
# operation row).
teach = teach.merge(st_pan, how='left', on='pan')
test = test.merge(st_pan, how='left', on='pan')

In [24]:
# Ratio of each amount to the mean amount of its 'pan' group.
teach['avg_amount_usd_in_pan'] = teach['amount_usd'].div(teach['mean_amount_usd_in_pan'])
test['avg_amount_usd_in_pan'] = test['amount_usd'].div(test['mean_amount_usd_in_pan'])

In [33]:
# Preview the first five training rows.
teach.head(n=5)

Unnamed: 0,id,type,project_id,created,pan,amount_usd
0,134254234,charge,224f3d3120309f994e744af6ed64dddc18a82888,2020-03-06 10:25:52,414949,5.217
1,130036244,charge,51dfe45c685dbb9ac1b650122838facfb4f209f1,2020-02-07 02:41:35,489504,200.0
2,125652873,charge,8901f0b3f6f78a91dd7837199883454456f2ae37,2020-01-12 02:21:09,402944,26.48
3,124716124,charge,224f3d3120309f994e744af6ed64dddc18a82888,2020-01-07 06:43:07,411997,1.147
4,128324238,charge,224f3d3120309f994e744af6ed64dddc18a82888,2020-01-26 15:33:34,516922,3.422


In [25]:
# Register the per-pan features. Names already present are skipped so that
# re-running this cell does not duplicate columns in the feature matrix.
col_factors.extend([
    name
    for name in ('mean_amount_usd_in_pan', 'avg_amount_usd_in_pan')
    if name not in col_factors
])

In [32]:
# Remove the per-pan features from the training frame. errors='ignore' makes
# the cell safe to re-run once the columns are already gone (a plain drop
# raises KeyError on the second run).
# NOTE(review): no matching drop on `test` is visible in this notebook —
# confirm whether the two frames are meant to diverge here.
teach = teach.drop(
    columns=['mean_amount_usd_in_pan', 'avg_amount_usd_in_pan'],
    errors='ignore',
)

In [26]:
# Feature set for this experiment; replaces whatever col_factors held before.
col_factors = [
    'avg_amount_usd_in_pan',
    'mean_amount_usd_in_pan',
    'amount_usd',
    # 'pan' is commented out of this feature set
]

In [36]:
#teach.amount_usd.value_counts()

In [38]:
# Inspect the column dtypes of the training frame.
#mean(teach.amount_usd)
teach.dtypes

id             object
type           object
project_id     object
created        object
pan             int64
amount_usd    float64
dtype: object

In [41]:
# Total training amount. Series.sum() skips NaN (amounts that failed numeric
# coercion), consistent with the .mean() call used in the next cell, whereas
# the builtin sum() returns NaN as soon as one value is missing and iterates
# the Series element by element.
teach['amount_usd'].sum()

2819453.2374983616

In [44]:
# Mean training amount.
teach['amount_usd'].mean()

28.19453237498

In [45]:
# Mean of the numeric 'pan' column on the training frame.
teach['pan'].mean()

483116.5287

In [47]:
# Scale factor that brings the mean of 'pan' down to the mean of 'amount_usd'.
k = teach['pan'].mean() / teach['amount_usd'].mean()
k

17135.114080796764

In [48]:
# Rescale 'pan' on the training frame by k.
# NOTE(review): this overwrites the column, so running the cell twice divides twice.
teach['pan'] = teach['pan'].div(k)

In [49]:
# Rescale 'pan' on the test frame by the same training-derived factor k.
# NOTE(review): this overwrites the column, so running the cell twice divides twice.
test['pan'] = test['pan'].div(k)

In [79]:
#test.drop(['pred'], axis =1)

In [126]:
# Feature set for the pan/amount clustering experiment.
col_factors = [
    'pan',
    'amount_usd',
]

In [127]:
# Rebuild the numeric matrices for the current feature set. Missing values are
# replaced by a sentinel first.
# NOTE(review): a sentinel of -9999 sits far from the real values, which may
# dominate distance-based models — confirm this is intended.
replace_val = -9999

db_teach = teach.loc[:, col_factors].fillna(value=replace_val)
teach_d = db_teach.values

db_test = test.loc[:, col_factors].fillna(value=replace_val)
test_d = db_test.values

In [128]:
# Fit a seeded 10-cluster k-means model on the pan/amount training matrix.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(teach_d);  # trailing ';' keeps the cell output empty, as before

In [129]:
# Cluster label assigned to each training row during fit.
kmeans.labels_

array([0, 2, 7, ..., 0, 6, 0], dtype=int32)

In [131]:
# Store the cluster id of every training row in 'pred'.
# assign() builds a new frame instead of writing into `teach` in place; the
# in-place write emitted SettingWithCopyWarning (see the recorded output)
# because `teach` was a filtered slice at that point.
teach = teach.assign(pred=kmeans.predict(teach_d))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [133]:
# Store the cluster id of every test row in 'pred'.
test = test.assign(pred=kmeans.predict(test_d))

In [134]:
# Cluster sizes on the training frame.
teach['pred'].value_counts()

0    73580
7    18334
6     4474
2     2122
9      639
4      444
5      245
1       84
8       40
3       24
Name: pred, dtype: int64

In [135]:
# Cluster sizes on the test frame.
test['pred'].value_counts()

0    68123
7    21780
6     5589
2     2260
9      861
4      630
5      435
1      149
8       81
3       47
Name: pred, dtype: int64

In [137]:
#pd.crosstab(index=teach.project_id, columns=teach.pred, margins=True)

In [143]:
# Smallest (rescaled) 'pan' value on the training frame.
teach['pan'].min()

13.454243660879095

In [148]:
# teach_pred = teach[teach.pred == 4] 
# teach_pred

In [91]:
# test_pred = test[test.pred == 4] 
# test_pred

In [124]:
#teach.amount_usd.sort_values()

In [121]:
# Keep training rows with amount_usd below 5000 (rows with NaN amounts are
# dropped too, since the comparison is False for them).
# .copy() gives an independent frame so that later column assignments on
# `teach` do not raise pandas' SettingWithCopyWarning.
teach = teach.loc[teach['amount_usd'] < 5000].copy()

In [125]:
# Keep test rows with amount_usd below 5000 (rows with NaN amounts are dropped
# too, since the comparison is False for them).
# .copy() gives an independent frame so that later column assignments on
# `test` do not raise pandas' SettingWithCopyWarning.
test = test.loc[test['amount_usd'] < 5000].copy()

In [164]:
# Training rows whose (rescaled) 'pan' value is at most 25.
pan = teach.loc[teach['pan'] <= 25]