In [4]:
import numpy as np
import pandas as pd

In [5]:
# Read the raw order log; the file is expected next to the notebook.
order_data = pd.read_csv(filepath_or_buffer='order_brush_order.csv')
# Preview the first rows to check the columns were parsed as expected.
order_data.head(5)

Unnamed: 0,orderid,shopid,userid,event_time
0,31076582227611,93950878,30530270,2019-12-27 00:23:03
1,31118059853484,156423439,46057927,2019-12-27 11:54:20
2,31123355095755,173699291,67341739,2019-12-27 13:22:35
3,31122059872723,63674025,149380322,2019-12-27 13:01:00
4,31117075665123,127249066,149493217,2019-12-27 11:37:55


In [6]:
# Parse the event_time strings into datetime64 so time arithmetic works later.
parsed_event_time = pd.to_datetime(order_data['event_time'])
order_data['event_time'] = parsed_event_time
# Show the column dtypes to confirm the conversion.
order_data.dtypes

orderid                int64
shopid                 int64
userid                 int64
event_time    datetime64[ns]
dtype: object

In [7]:
def get_suspicious_buyer(df):
    """Return the most suspicious buyer(s) among one shop's orders.

    An order is flagged when it lies in a run of time-ordered orders that
    starts at some order, spans at most 3600 seconds, and contains at least
    three orders per distinct user (order count >= 3 * unique users).

    Parameters
    ----------
    df : pandas.DataFrame
        Orders with at least the columns ``orderid``, ``userid`` and
        ``event_time`` (datetime-like). The frame is NOT modified.

    Returns
    -------
    str
        The user id(s) with the highest number of flagged orders, sorted
        ascending and joined with ``'&'``; ``'0'`` when no order is flagged.
    """
    # Sort into a new frame. Sorting in place mutated the caller's data
    # (a groupby slice in this notebook), which raised SettingWithCopyWarning.
    orders = df.sort_values(by='event_time')

    n = len(orders.index)
    # Pull the columns out once; per-element .iloc access inside the nested
    # loops is far slower than indexing plain Python lists.
    event_times = orders['event_time'].tolist()
    user_ids = orders['userid'].tolist()
    is_suspicious = [False] * n

    for i in range(n):
        max_j = -1  # last index of the longest qualifying window starting at i
        userid_set = set()
        for j in range(i, n):
            delta_second = (event_times[j] - event_times[i]).total_seconds()
            if delta_second > 3600:
                break
            userid_set.add(user_ids[j])
            # Window i..j holds j-i+1 orders from len(userid_set) users.
            if j - i + 1 >= len(userid_set) * 3:
                max_j = j
        for j in range(i, max_j + 1):
            is_suspicious[j] = True

    brush_df = orders.loc[is_suspicious]
    if brush_df.empty:
        return '0'

    user_count = brush_df.groupby('userid').orderid.count()
    most_suspicious_users = sorted(user_count[user_count == user_count.max()].index)

    return '&'.join(str(x) for x in most_suspicious_users)

In [8]:
# Split the orders by shop and evaluate each shop independently.
shop_groups = order_data.groupby('shopid')

# One result string per shop, in the same order the groups are iterated.
suspicious_users = [
    get_suspicious_buyer(shop_orders) for _, shop_orders in shop_groups
]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [9]:
# Collect the shop ids in group-iteration order so they line up with suspicious_users.
shop_ids = [shop_id for shop_id, _ in shop_groups]

output = pd.DataFrame({'shopid': shop_ids, 'userid': suspicious_users})
output.head(10)
# output.to_csv('submission.csv', index=False)


Unnamed: 0,shopid,userid
0,10009,0
1,10051,0
2,10061,0
3,10084,0
4,10100,0
5,10107,0
6,10108,0
7,10110,0
8,10132,0
9,10133,0


In [12]:
# Show only the shops where at least one buyer was flagged ("0" means none).
output.loc[output['userid'] != "0"].head(100)

Unnamed: 0,shopid,userid
13,10159,214988798
40,10402,77819
57,10536,672345
111,42472,740844
114,42818,170385453
...,...,...
3133,50682734,214365114
3135,50713918,172106152
3165,50970067,179171579
3189,51134277,29857724&212200633


# Solution 2 98%

In [3]:
# Load the order log, parse event_time as datetime and index the rows by orderid.
df = (
    pd.read_csv('order_brush_order.csv')
    .assign(event_time=lambda orders: pd.to_datetime(orders['event_time']))
    .set_index('orderid')
)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
# Order rows by shop, then user, then time, so consecutive rows within a
# (shopid, userid) pair are chronologically adjacent.
sort_keys = ['shopid', 'userid', 'event_time']
df = df.sort_values(by=sort_keys)

Getting time difference between orders

This computes the time difference between consecutive orders placed by the same user at the same shop.

In [None]:
# Gap between each order and the previous order of the same (shopid, userid)
# pair. Rows are diffed in their current order, so df must already be sorted
# by event_time within each pair; the first order of a pair has no
# predecessor and gets a missing gap.
# The previous trailing merge/groupby expression was never assigned, so it
# has been dropped; event_time is selected explicitly so only it is diffed.
time_df = (
    df.groupby(by=['shopid', 'userid'])['event_time']
    .diff()
    .to_frame(name='time_diff')
)

In [None]:
# Preview the per-order gaps (indexed by orderid).
time_df.head(n=5)

Filtering by time difference

This keeps only the time differences between consecutive orders from the same user at the same shop that are <= 1 hour, then counts them per shop and user. The (shop, user) pairs with more than one such gap are the ones we suspect of order brushing.

In [None]:
# Keep only the gaps of at most one hour.
short_gap_rows = time_df[time_df['time_diff'] <= '1 hour']
# Re-attach shopid/userid (joined on the orderid index) so the gaps can be grouped.
short_gaps_with_keys = short_gap_rows.merge(df, left_index=True, right_index=True)
# Number of short gaps per (shopid, userid) pair.
shorter_than_an_hour_df = short_gaps_with_keys.groupby(by=['shopid', 'userid']).count()

# A pair is flagged when it has more than one short gap.
has_multiple_short_gaps = shorter_than_an_hour_df['time_diff'] > 1
brushed_orders_df = shorter_than_an_hour_df[has_multiple_short_gaps].reset_index()

In [None]:
# Preview the flagged (shopid, userid) pairs.
brushed_orders_df.head(n=5)

In [None]:
# Output the filtered results: one row per shop. userid is 0 when no buyer was
# flagged for the shop, otherwise the flagged user ids joined with '&'.

# Index the flagged users by shop once instead of re-filtering
# brushed_orders_df for every shop.
flagged_users_by_shop = {}
for shop, user in zip(brushed_orders_df['shopid'], brushed_orders_df['userid']):
    flagged_users_by_shop.setdefault(shop, []).append(str(user))

# Build all rows first and construct the frame once: DataFrame.append in a
# loop is quadratic and is no longer available in pandas 2.x.
records = [
    {
        'shopid': shop,
        'userid': '&'.join(flagged_users_by_shop[shop]) if shop in flagged_users_by_shop else 0,
    }
    for shop in df['shopid'].unique()
]
result = pd.DataFrame(records, columns=['shopid', 'userid'])

result