In [1]:
import numpy as np
import pandas as pd
import datetime
import time
import numba
import os,sys

def LoadData(InputDir):
    """Read the raw train and test tables stored under ``InputDir``.

    Both files are parsed with ``pd.read_csv`` using a single space as the
    field separator.

    Parameters
    ----------
    InputDir : str
        Directory that contains the two round-1 ``.txt`` files.

    Returns
    -------
    dict
        ``{'train': DataFrame, 'test': DataFrame}``.
    """
    ## load raw data
    raw_files = (
        ('train', '%s/round1_ijcai_18_train_20180301.txt'),
        ('test', '%s/round1_ijcai_18_test_a_20180301.txt'),
    )
    return {part: pd.read_csv(pattern % InputDir, sep=' ') for part, pattern in raw_files}

# Root of the data tree; the raw files are read from <DataBaseDir>/raw.
DataBaseDir = '../../data'
DataSet = LoadData('%s/raw' % DataBaseDir)
for mod in ('train', 'test'):
    frame = DataSet[mod]
    # Turn the numeric timestamps into datetime values, then order the rows by time.
    DataSet[mod] = frame.assign(
        context_timestamp=frame['context_timestamp'].apply(datetime.datetime.fromtimestamp)
    ).sort_values(by='context_timestamp')

In [2]:
##########
# check #
#########
## column dtypes of the training frame
print(DataSet['train'].dtypes)
## sample size
print('\n')
print('train size %s, test size %s' % tuple(len(DataSet[part]) for part in ('train', 'test')))
## time range: first and last five timestamps of each frame
print('\n')
for mod in ('train', 'test'):
    stamps = DataSet[mod]['context_timestamp']
    print('---- %s part ----' % mod)
    print(stamps.head(5))
    print('\n')
    print(stamps.tail(5))
## unique objects: number of distinct ids per entity column
print('\n')
for mod in ('train', 'test'):
    print('---- %s part ----' % mod)
    for ob in ('item_id', 'user_id', 'context_id', 'shop_id'):
        print('%s unique length %s' % (ob, len(pd.unique(DataSet[mod][ob]))))
## remove repeated context
# Every context_id that occurs more than once is removed entirely (all of its
# rows, not only the later duplicates).
context_ids = DataSet['train']['context_id']
# One entry per 2nd, 3rd, ... occurrence of an id, in row order.
RepeatedContext = context_ids[context_ids.duplicated(keep='first')].tolist()
UniqueContext = set(context_ids.unique())
print('\n')
print('repeated context size %s' % len(RepeatedContext))
# .copy() detaches the filtered rows from the frame they were selected from, so
# columns assigned to DataSet['train'] later on do not raise pandas'
# SettingWithCopyWarning.
DataSet['train'] = DataSet['train'][~context_ids.isin(RepeatedContext)].copy()
print('after removing repeated context, size %s' % len(DataSet['train']))
## null size
# The check counts -1 as the "null" marker and only looks at int64/float64
# columns of the training frame; columns without any -1 are not reported.
print('\n')
train_frame = DataSet['train']
for col in train_frame.columns:
    if train_frame[col].dtype.name not in ('int64', 'float64'):
        continue
    nullsize = int((train_frame[col] == -1).sum())
    if nullsize > 0:
        print('%s null size %s' % (col, nullsize))
## target rate
print('\n')
for mod in ['train', 'test']:
    DataSet[mod]['hour'] = DataSet[mod]['context_timestamp'].dt.hour ## new column, feature
    DataSet[mod]['date'] = DataSet[mod]['context_timestamp'].dt.date ## new column
## positive rate for each of the seven days starting at first_day
first_day = datetime.date(2018, 9, 18)
is_positive = DataSet['train']['is_trade'] == 1
for i in range(7):
    day_mask = DataSet['train']['date'] == (first_day + datetime.timedelta(days= i))
    # The mean of a boolean mask is the share of positives; for a day without
    # rows it is NaN instead of raising ZeroDivisionError.
    print('day %s positive rate %.2f' % (i, is_positive[day_mask].mean()))
print('\n')
## positive rate for each hour of the first day
daydf = DataSet['train'][DataSet['train']['date'] == first_day]
for i in range(24):
    hourdf = daydf[daydf['hour'] == i]
    # Same NaN-on-empty behaviour as above for hours without any sample.
    print('hour %s(2018/9/18) positive rate %.2f' % (i, (hourdf['is_trade'] == 1).mean()))

instance_id                           int64
item_id                               int64
item_category_list                   object
item_property_list                   object
item_brand_id                         int64
item_city_id                          int64
item_price_level                      int64
item_sales_level                      int64
item_collected_level                  int64
item_pv_level                         int64
user_id                               int64
user_gender_id                        int64
user_age_level                        int64
user_occupation_id                    int64
user_star_level                       int64
context_id                            int64
context_timestamp            datetime64[ns]
context_page_id                       int64
predict_category_property            object
shop_id                               int64
shop_review_num_level                 int64
shop_review_positive_rate           float64
shop_star_level                 

In [3]:
#### item part
## item_category_list: the ';'-joined category levels depend on each other, so treat the list as a single entity
def ApplySplitCategory(colvals):
    """Split ';'-joined strings into exactly three columns.

    Parameters
    ----------
    colvals : sequence of str
        One ';'-separated string per row (e.g. the values of a DataFrame column).

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(len(colvals), 3)``. Column ``j`` holds the
        ``j``-th piece of each string; missing pieces are filled with ``''``
        and pieces beyond the third are discarded.
    """
    # Plain Python on purpose: the loop works on Python strings stored in an
    # object array, which numba cannot compile in nopython mode, so wrapping it
    # in @numba.jit brings no speed-up and fails on numba versions without the
    # object-mode fallback.
    n = len(colvals)
    result = np.empty((n, 3), dtype= 'object')
    for i in range(n):
        levels = colvals[i].split(';')
        for j in range(3):
            result[i, j] = levels[j] if j < len(levels) else ''
    return result

category_cols = ['category_0', 'category_1', 'category_2']
for mod in ['train', 'test']:
    print('\n')
    print('---- %s ----' % mod)
    tmp = ApplySplitCategory(DataSet[mod]['item_category_list'].values)
    tmpdf = pd.DataFrame(data= tmp, index= DataSet[mod].index, columns= category_cols)
    # Drop category_* columns left over from an earlier run of this cell, so
    # that executing it again does not append duplicate columns.
    base = DataSet[mod].drop(columns= category_cols, errors= 'ignore')
    DataSet[mod] = pd.concat([base, tmpdf], axis= 1, ignore_index= False)
    # Frequency table of each category level.
    for col in category_cols:
        print('\n')
        print(DataSet[mod][col].value_counts())
# ## item_property_list, independently joined, treat it separately
# for mod in ['train', 'test']:
#     print('\n')
#     print('---- %s ----' % mod)
#     tmp = ApplySplitCategory(DataSet[mod]['item_property_list'].values)
#     tmpdf = pd.DataFrame(data= tmp, index= range(len(tmp)), columns=['property_0', 'property_1', 'property_2'])
#     df = pd.concat([DataSet[mod], tmpdf], axis= 1)
#     for col in ['property_0', 'property_1', 'property_2']:
#         print('\n')
#         print(df[col].value_counts())

## item-side categorical features: report the number of distinct values per
## column, then store each column with object dtype
item_cate_feat = ['item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level']
for mod in ('train', 'test'):
    print('\n')
    print('---- %s ----' % mod)
    frame = DataSet[mod]
    for col in item_cate_feat:
        n_unique = len(pd.unique(frame[col]))
        print('%s unique size %s' % (col, n_unique))
        frame[col] = frame[col].astype('object')



---- train ----


7908382889764677758    478086
Name: category_0, dtype: int64


8277336076276184272    150771
5755694407684602296    102492
509660095530134768      75416
5799347067982556520     72009
7258015885215914736     53029
2011981573061447208      9563
8710739180200009128      7084
3203673979138763595      2246
2642175453151805566      2029
2436715285093487584      1966
4879721024980945592      1207
1968056100269760729       186
22731265849056483          88
Name: category_1, dtype: int64


                       476057
8868887661186419229      1906
6233669177166538628       123
Name: category_2, dtype: int64


---- test ----


7908382889764677758    18371
Name: category_0, dtype: int64


8277336076276184272    6098
5755694407684602296    4033
509660095530134768     2932
5799347067982556520    2726
7258015885215914736    1464
2011981573061447208     373
8710739180200009128     304
3203673979138763595     223
2642175453151805566      79
2436715285093487584      72
487972102498

In [4]:
#### user part
## user-side categorical features: report the number of distinct values per
## column, then store each column with object dtype
user_cate_feat = ['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']
for mod in ('train', 'test'):
    print('\n')
    print('---- %s ----' % mod)
    frame = DataSet[mod]
    for col in user_cate_feat:
        n_unique = len(pd.unique(frame[col]))
        print('%s unique size %s' % (col, n_unique))
        frame[col] = frame[col].astype('object')



---- train ----
user_gender_id unique size 4
user_age_level unique size 9
user_occupation_id unique size 5
user_star_level unique size 12


---- test ----
user_gender_id unique size 4
user_age_level unique size 9
user_occupation_id unique size 5
user_star_level unique size 12


In [5]:
#### shop part
## shop-side features: the level columns are treated as categorical, the
## rate/score columns are only listed here as numeric features
shop_cate_feat = ['shop_review_num_level', 'shop_star_level']
shop_num_feat = ['shop_review_positive_rate', 'shop_score_service', 'shop_score_delivery', 'shop_score_description']
for mod in ('train', 'test'):
    print('\n')
    print('---- %s ----' % mod)
    frame = DataSet[mod]
    for col in shop_cate_feat:
        n_unique = len(pd.unique(frame[col]))
        print('%s unique size %s' % (col, n_unique))
        frame[col] = frame[col].astype('object')
#print(DataSet['train'][user_cate_feat].head(20))



---- train ----
shop_review_num_level unique size 25
shop_star_level unique size 22


---- test ----
shop_review_num_level unique size 21
shop_star_level unique size 20


In [10]:
#### context part
# Empty accumulators for distinct predicted categories / properties. Nothing in
# the active code of this cell adds to them; the only visible uses are in the
# commented-out exploration code near the end of the cell.
predict_cate = set()
predict_prop = set()
def ApplyPredictCategory(colvals):
    """Extract the category ids from 'cate:props;cate:props;...' strings.

    Parameters
    ----------
    colvals : sequence of str
        One string per row. Entries are separated by ';' and the text before
        the first ':' of each entry is taken as its category. The literal
        ``'-1'`` is passed through unchanged.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(len(colvals), 1)`` holding the categories of
        each row joined by ',' (or ``'-1'``).
    """
    # Plain Python on purpose: the loop works on Python strings stored in an
    # object array, which numba cannot compile in nopython mode, so wrapping it
    # in @numba.jit brings no speed-up and fails on numba versions without the
    # object-mode fallback.
    n = len(colvals)
    result = np.empty((n, 1), dtype= 'object')
    for i in range(n):
        raw = colvals[i]
        if raw == '-1':
            result[i, 0] = '-1'
        else:
            result[i, 0] = ','.join(entry.split(':')[0] for entry in raw.split(';'))
    return result

def ApplyHitCategory(cate0, cate1, cate2, colvals):
    """Flag rows whose item category appears in the predicted category list.

    Parameters
    ----------
    cate0, cate1, cate2 : sequence of str
        The three category levels of each row, aligned with ``colvals``.
    colvals : sequence of str
        ','-joined predicted categories per row, or ``'-1'``.

    Returns
    -------
    numpy.ndarray
        ``int8`` array of shape ``(len(colvals), 1)``: 1 when any of the three
        levels is contained in the row's predicted list, or when the
        prediction is ``'-1'`` (such rows are counted as hits); 0 otherwise.
    """
    # Plain Python on purpose: the loop works on Python strings stored in
    # object arrays, which numba cannot compile in nopython mode, so wrapping it
    # in @numba.jit brings no speed-up and fails on numba versions without the
    # object-mode fallback.
    n = len(colvals)
    result = np.zeros((n, 1), dtype= 'int8')
    for i in range(n):
        predicted = colvals[i]
        if predicted == '-1':
            result[i, 0] = 1
            continue
        cate_list = predicted.split(',')
        if (cate0[i] in cate_list) or (cate1[i] in cate_list) or (cate2[i] in cate_list):
            result[i, 0] = 1
    return result

# Derive the predicted-category list of every training row, then flag the rows
# whose own category levels occur in that list.
DataSet['train']['predict_category_list'] = ApplyPredictCategory(DataSet['train']['predict_category_property'].values)
DataSet['train']['hit_category'] = ApplyHitCategory(DataSet['train']['category_0'].values, 
                                                    DataSet['train']['category_1'].values, 
                                                    DataSet['train']['category_2'].values, 
                                                    DataSet['train']['predict_category_list'].values)
# Share of hits among the positive (is_trade == 1) rows. The counts are read
# from posdf; .get(1, 0) keeps the cell running when there is no hit at all.
posdf = DataSet['train'][DataSet['train']['is_trade'] == 1]
posvc = posdf['hit_category'].value_counts()
print(posvc.get(1, 0)/len(posdf))
# Share of hits over the whole training frame, for comparison.
totalvc = DataSet['train']['hit_category'].value_counts()
print(totalvc.get(1, 0)/len(DataSet['train']))
#tmpdf1 = tmpdf[[tmpdf['hit'] == 1]]
#print(len(tmpdf1)/len(tmpdf))
# 7908382889764677758
#         try:
#             prop = tmp[1]
#             for p in prop.split(','):
#                 predict_prop.add(p)
#                 if(p in tmpset):
#                     print(cate)
#                 else:
#                     tmpset.add(p)
#         except:
#             print(tmp)
#         predict_cate.add(cate)
# print(len(predict_cate))
# print(len(predict_prop))
# sys.exit(1)
# context_cate_feat = ['context_page_id']
# for mod in ['train', 'test']:
#     print('\n')
#     print('---- %s ----' % mod)
#     for col in context_cate_feat:
#         print('%s unique size %s' % (col, len(DataSet[mod][col].unique())))
#         DataSet[mod][col] = DataSet[mod][col].astype('object')
        
# pcp = DataSet['train']['item_category_list'][:20].values
# print(pcp)
#print('------------------')
#print(DataSet['train'][['item_category_list', 'item_property_list']].head(20))

0.99921953395
0.998632045281
