In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import time,os
import lightgbm as lgb
from tqdm import tqdm
from geohash import encode
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ParameterGrid
from sklearn.feature_selection import VarianceThreshold

%config InlineBackend.figure_format = 'retina' # set 'png' here when working on notebook
%matplotlib inline

## 加载数据

In [2]:
# 将wifi_infos转成dict
def make_wifi_dict(df):
    def _split(x):
        x=x.split('|')[:2]
        x[1]=int(x[1])
        return x
    df.wifi_infos=df.wifi_infos.apply(lambda x:dict([_split(i) for i in x.split(';')]))
    return df

# 提取时间结构信息
def make_struct_time(df):
    time_infos=[]
    for t in tqdm(df.time_stamp):
        _struct_time=time.strptime(t,'%Y-%m-%d %H:%M')
        time_infos.append([_struct_time.tm_hour+_struct_time.tm_min/60.,_struct_time.tm_hour,_struct_time.tm_wday])
    time_infos=pd.DataFrame(time_infos,columns=['time','hour','wday'])
    df=pd.concat((df,time_infos),axis=1)
    return df

def load_data(file_path):
    result=pd.read_csv(file_path)
    result=make_wifi_dict(result)
    result=make_struct_time(result)
    return result

meta_df=pd.read_csv('data/训练数据-ccf_first_round_shop_info.csv')
train_df=load_data('data/训练数据-ccf_first_round_user_shop_behavior.csv')
test_df=load_data('data/AB榜测试集-evaluation_public.csv')
# 加入商店信息
train_df=pd.merge(train_df,meta_df[['shop_id','mall_id']],on='shop_id',how='left')

100%|██████████| 1138015/1138015 [00:11<00:00, 102994.63it/s]
100%|██████████| 483931/483931 [00:04<00:00, 100900.53it/s]


In [3]:
x_train=train_df

In [3]:
# 构造线下验证集
x_train=train_df[train_df.time_stamp<'2017-08-30 00:00']
x_val=train_df[train_df.time_stamp>='2017-08-30 00:00']

In [4]:
x_train.head()

Unnamed: 0,user_id,shop_id,time_stamp,longitude,latitude,wifi_infos,time,hour,wday,mall_id
0,u_376,s_2871718,2017-08-06 21:20,122.308291,32.08804,"{'b_5857370': -68, 'b_56326644': -89, 'b_28723...",21.333333,21,6,m_1409
1,u_376,s_2871718,2017-08-06 21:20,122.308162,32.08797,"{'b_6396479': -57, 'b_5857370': -68, 'b_563281...",21.333333,21,6,m_1409
2,u_1041,s_181637,2017-08-02 13:10,117.365255,40.638214,"{'b_8006442': -71, 'b_33503892': -56, 'b_80063...",13.166667,13,2,m_4079
3,u_1158,s_609470,2017-08-13 12:30,121.134451,31.197416,"{'b_26250581': -64, 'b_52934996': -79, 'b_2625...",12.5,12,6,m_6587
4,u_1654,s_3816766,2017-08-25 19:50,122.255867,31.35132,"{'b_54466444': -86, 'b_21685901': -91, 'b_3900...",19.833333,19,4,m_3005


## 添加特征

In [5]:
# 计算两点之间距离
def cal_distance(lat1, lon1, lat2, lon2):
    dx = np.abs(lon1 - lon2)  # 经度差
    dy = np.abs(lat1 - lat2)  # 维度差
    b = (lat1 + lat2) / 2.0
    Lx = 6371004.0 * (dx / 57.2958) * np.cos(b / 57.2958)
    Ly = 6371004.0 * (dy / 57.2958)
    L = (Lx ** 2 + Ly ** 2) ** 0.5
    return L

# 计算wifi稀疏矩阵
def add_one_hot_wifi(df1,df2):
    vec = DictVectorizer()
    vec.fit(df1.wifi_infos)
    result=vec.transform(df2.wifi_infos).toarray()
    return result

# 计算geo稀疏矩阵/定位经纬度
def add_one_hot_geo(df):
    return df[['longitude','latitude']].values

# 增加时间特征
def add_time(df):
    return df[['time','wday']].values

def add_feats(df1,df2):
    feats=[]
    feats.append(add_one_hot_wifi(df1,df2))
    feats.append(add_one_hot_geo(df2))
#     feats.append(add_time(df2))
#     feats=np.concatenate(feats,axis=1)
    return feats

## 模型训练

In [6]:
result=[]
precision=[]

# for mall_id in tqdm(['m_7800','m_690','m_7168','m_6337','m_1377']):
for mall_id in tqdm(meta_df.mall_id.unique()):
    _x_train=x_train[x_train.mall_id==mall_id]
#     _x_val=x_val[x_val.mall_id==mall_id] #
    _test_df=test_df[test_df.mall_id==mall_id]
    _meta_df=meta_df[meta_df.mall_id==mall_id]
    # 构造特征
    train_feat=add_feats(_x_train,_x_train)
#     val_feat=add_feats(_x_train,_x_val) #
    test_feat=add_feats(_x_train,_test_df)
    # 降维
    nb_wifi_column=train_feat[0].shape[1]
    sel = VarianceThreshold(threshold=1)
    train_feat[0]=sel.fit_transform(train_feat[0])
#     val_feat[0]=sel.transform(val_feat[0]) #
    test_feat[0]=sel.transform(test_feat[0])
    train_feat=np.concatenate(train_feat,axis=1)
#     val_feat=np.concatenate(val_feat,axis=1) #
    test_feat=np.concatenate(test_feat,axis=1)
    print('wifi nb_columns:{}->{}'.format(nb_wifi_column,train_feat.shape[1]))
    # 添加标签
    encoder=LabelEncoder()
    encoder.fit(_meta_df.shop_id)
    train_label=encoder.transform(_x_train.shop_id)
#     val_label=encoder.transform(_x_val.shop_id) #
    # 训练
    nb_class=_meta_df.shape[0]
    params = {
        'num_class':[nb_class],
        'objective': ['multiclass'],
        'metric':['multi_error'],
        'learning_rate':[0.05],
        'feature_fraction': [0.6],
        'max_depth': [12],# 14
        'num_leaves':[180],# 256
        'bagging_fraction': [0.6],
        'bagging_freq':[5],
        'min_data_in_leaf':[10],
        'min_gain_to_split':[0],
        'lambda_l1':[1],
        'lambda_l2':[1],
        'verbose':[0],
        'is_unbalance':[True]
    }
    params=list(ParameterGrid(params))
    lgb_train=lgb.Dataset(train_feat,train_label)
    model = lgb.train(params[0], lgb_train, num_boost_round=150)
    # 模型验证
#     proba=model.predict(val_feat)
#     predict_label=np.argmax(proba,axis=1)
#     ans=pd.DataFrame({'pred':predict_label,'true':val_label})
#     score={mall_id:ans[ans.pred==ans.true].shape[0]/ans.shape[0]}
#     precision.append(score)
#     print(score)
    # 预测
#     proba=model.predict(test_feat)
#     predict_label=np.argmax(proba,axis=1)
#     ans=pd.DataFrame({'row_id':_test_df.row_id,'shop_id':encoder.inverse_transform(predict_label)})
#     result.append(ans)
    # 保存候选集
    nb_candidates=7
    proba=model.predict(train_feat)
    candidates=encoder.inverse_transform(proba.argsort(axis=1)[:,-nb_candidates:])
    ans=pd.DataFrame(candidates,index=_x_train.index)
    ans.to_hdf('cache/multi/{}_train_full.hdf'.format(mall_id),'data')
    proba.sort(axis=1)
    ans=pd.DataFrame(proba[:,-nb_candidates:],index=_x_train.index)
    ans.to_hdf('cache/multi/{}_train_full.hdf'.format(mall_id),'proba')
    ###
#     proba=model.predict(val_feat)
#     candidates=encoder.inverse_transform(proba.argsort(axis=1)[:,-nb_candidates:])
#     ans=pd.DataFrame(candidates,index=_x_val.index)
#     ans.to_hdf('cache/multi/{}_val.hdf'.format(mall_id),'data')
#     proba.sort(axis=1)
#     ans=pd.DataFrame(proba[:,-nb_candidates:],index=_x_val.index)
#     ans.to_hdf('cache/multi/{}_val.hdf'.format(mall_id),'proba')
    ###
    proba=model.predict(test_feat)
    candidates=encoder.inverse_transform(proba.argsort(axis=1)[:,-nb_candidates:])
    ans=pd.DataFrame(candidates,index=_test_df.row_id)
    ans.to_hdf('cache/multi/{}_test.hdf'.format(mall_id),'data')
    proba.sort(axis=1)
    ans=pd.DataFrame(proba[:,-nb_candidates:],index=_test_df.row_id)
    ans.to_hdf('cache/multi/{}_test.hdf'.format(mall_id),'proba')

  0%|          | 0/97 [00:00<?, ?it/s]

wifi nb_columns:13294->1772


  1%|          | 1/97 [00:44<1:10:39, 44.16s/it]

wifi nb_columns:2701->1475


  2%|▏         | 2/97 [00:59<56:27, 35.65s/it]  

wifi nb_columns:4461->1603


  3%|▎         | 3/97 [01:18<47:55, 30.59s/it]

wifi nb_columns:5217->2282


  4%|▍         | 4/97 [01:32<39:22, 25.40s/it]

wifi nb_columns:5871->1680


  5%|▌         | 5/97 [02:00<40:31, 26.43s/it]

wifi nb_columns:4485->1748


  6%|▌         | 6/97 [02:19<36:19, 23.96s/it]

wifi nb_columns:9398->2541


  7%|▋         | 7/97 [02:55<41:43, 27.81s/it]

wifi nb_columns:5483->1478


  8%|▊         | 8/97 [03:27<43:06, 29.06s/it]

wifi nb_columns:4242->1745


  9%|▉         | 9/97 [03:48<38:57, 26.56s/it]

wifi nb_columns:1552->1474


 10%|█         | 10/97 [03:52<28:47, 19.85s/it]

wifi nb_columns:6298->2061


 11%|█▏        | 11/97 [08:48<2:27:13, 102.72s/it]

wifi nb_columns:10134->2118


 12%|█▏        | 12/97 [09:26<1:57:43, 83.10s/it] 

wifi nb_columns:4566->2179


 13%|█▎        | 13/97 [09:41<1:27:40, 62.63s/it]

wifi nb_columns:7852->1483


 14%|█▍        | 14/97 [10:05<1:10:41, 51.10s/it]

wifi nb_columns:2759->1564


 15%|█▌        | 15/97 [10:12<52:00, 38.06s/it]  

wifi nb_columns:9505->2868


 16%|█▋        | 16/97 [10:34<44:46, 33.17s/it]

wifi nb_columns:2187->1712


 18%|█▊        | 17/97 [10:42<34:07, 25.59s/it]

wifi nb_columns:7978->3143


 19%|█▊        | 18/97 [11:22<39:19, 29.87s/it]

wifi nb_columns:3280->1952


 20%|█▉        | 19/97 [11:32<31:10, 23.98s/it]

wifi nb_columns:4606->1620


 21%|██        | 20/97 [12:03<33:27, 26.07s/it]

wifi nb_columns:4543->1422


 22%|██▏       | 21/97 [12:18<28:56, 22.85s/it]

wifi nb_columns:5159->2235


 23%|██▎       | 22/97 [12:33<25:30, 20.41s/it]

wifi nb_columns:4110->1968


 24%|██▎       | 23/97 [12:46<22:24, 18.17s/it]

wifi nb_columns:6699->1671


 25%|██▍       | 24/97 [13:32<32:05, 26.38s/it]

wifi nb_columns:1973->1837


 26%|██▌       | 25/97 [13:37<24:12, 20.18s/it]

wifi nb_columns:4298->2256


 27%|██▋       | 26/97 [13:54<22:32, 19.05s/it]

wifi nb_columns:4375->1486


 28%|██▊       | 27/97 [14:08<20:25, 17.51s/it]

wifi nb_columns:1865->1728


 29%|██▉       | 28/97 [14:13<16:00, 13.92s/it]

wifi nb_columns:3510->2349


 30%|██▉       | 29/97 [14:22<13:54, 12.28s/it]

wifi nb_columns:5708->2098


 31%|███       | 30/97 [14:39<15:34, 13.94s/it]

wifi nb_columns:4203->1862


 32%|███▏      | 31/97 [14:50<14:16, 12.97s/it]

wifi nb_columns:3250->1674


 33%|███▎      | 32/97 [15:04<14:11, 13.11s/it]

wifi nb_columns:3983->1728


 34%|███▍      | 33/97 [15:14<13:15, 12.42s/it]

wifi nb_columns:2224->1555


 35%|███▌      | 34/97 [15:20<11:00, 10.48s/it]

wifi nb_columns:4405->2195


 36%|███▌      | 35/97 [15:29<10:11,  9.86s/it]

wifi nb_columns:3218->1732


 37%|███▋      | 36/97 [15:37<09:25,  9.27s/it]

wifi nb_columns:2212->2036


 38%|███▊      | 37/97 [15:41<07:55,  7.92s/it]

wifi nb_columns:2583->1790


 39%|███▉      | 38/97 [15:50<07:57,  8.09s/it]

wifi nb_columns:3422->1806


 40%|████      | 39/97 [16:02<08:51,  9.16s/it]

wifi nb_columns:3819->1736


 41%|████      | 40/97 [16:12<09:07,  9.60s/it]

wifi nb_columns:5369->1757


 42%|████▏     | 41/97 [16:34<12:15, 13.13s/it]

wifi nb_columns:3126->2258


 43%|████▎     | 42/97 [16:46<11:47, 12.86s/it]

wifi nb_columns:4592->2119


 44%|████▍     | 43/97 [16:57<11:01, 12.25s/it]

wifi nb_columns:3078->1838


 45%|████▌     | 44/97 [17:09<10:45, 12.18s/it]

wifi nb_columns:3780->1739


 46%|████▋     | 45/97 [17:20<10:19, 11.91s/it]

wifi nb_columns:5410->1693


 47%|████▋     | 46/97 [17:55<16:07, 18.97s/it]

wifi nb_columns:4399->1804


 48%|████▊     | 47/97 [18:11<15:01, 18.04s/it]

wifi nb_columns:3067->1561


 49%|████▉     | 48/97 [18:22<12:51, 15.74s/it]

wifi nb_columns:2646->2398


 51%|█████     | 49/97 [18:27<10:12, 12.75s/it]

wifi nb_columns:4025->2226


 52%|█████▏    | 50/97 [18:36<08:57, 11.44s/it]

wifi nb_columns:2926->1957


 53%|█████▎    | 51/97 [18:46<08:28, 11.05s/it]

wifi nb_columns:5207->2115


 54%|█████▎    | 52/97 [19:04<09:46, 13.04s/it]

wifi nb_columns:3017->1897


 55%|█████▍    | 53/97 [19:11<08:13, 11.22s/it]

wifi nb_columns:1839->1375


 56%|█████▌    | 54/97 [19:17<07:04,  9.88s/it]

wifi nb_columns:2719->2240


 57%|█████▋    | 55/97 [19:25<06:27,  9.22s/it]

wifi nb_columns:3368->1610


 58%|█████▊    | 56/97 [19:35<06:29,  9.51s/it]

wifi nb_columns:4040->1644


 59%|█████▉    | 57/97 [19:49<07:16, 10.92s/it]

wifi nb_columns:4922->1791


 60%|█████▉    | 58/97 [20:25<11:51, 18.25s/it]

wifi nb_columns:5378->2689


 61%|██████    | 59/97 [20:42<11:22, 17.95s/it]

wifi nb_columns:3585->1735


 62%|██████▏   | 60/97 [20:57<10:32, 17.09s/it]

wifi nb_columns:4723->2557


 63%|██████▎   | 61/97 [21:13<09:58, 16.62s/it]

wifi nb_columns:4467->1954


 64%|██████▍   | 62/97 [21:22<08:22, 14.35s/it]

wifi nb_columns:4415->1633


 65%|██████▍   | 63/97 [21:37<08:15, 14.56s/it]

wifi nb_columns:3693->2757


 66%|██████▌   | 64/97 [21:45<07:02, 12.79s/it]

wifi nb_columns:4723->2049


 67%|██████▋   | 65/97 [21:59<06:53, 12.92s/it]

wifi nb_columns:3863->1867


 68%|██████▊   | 66/97 [22:07<06:01, 11.67s/it]

wifi nb_columns:6189->1672


 69%|██████▉   | 67/97 [22:48<10:10, 20.34s/it]

wifi nb_columns:2043->1479


 70%|███████   | 68/97 [22:56<08:04, 16.70s/it]

wifi nb_columns:2764->2003


 71%|███████   | 69/97 [23:03<06:21, 13.64s/it]

wifi nb_columns:4068->2222


 72%|███████▏  | 70/97 [23:11<05:24, 12.01s/it]

wifi nb_columns:1934->1578


 73%|███████▎  | 71/97 [23:19<04:42, 10.85s/it]

wifi nb_columns:2712->1924


 74%|███████▍  | 72/97 [23:28<04:14, 10.17s/it]

wifi nb_columns:2986->1874


 75%|███████▌  | 73/97 [23:36<03:50,  9.61s/it]

wifi nb_columns:4416->2127


 76%|███████▋  | 74/97 [23:52<04:27, 11.64s/it]

wifi nb_columns:4312->1750


 77%|███████▋  | 75/97 [24:03<04:08, 11.28s/it]

wifi nb_columns:2514->1566


 78%|███████▊  | 76/97 [24:12<03:45, 10.72s/it]

wifi nb_columns:2966->1431


 79%|███████▉  | 77/97 [24:22<03:28, 10.43s/it]

wifi nb_columns:3248->2106


 80%|████████  | 78/97 [24:31<03:10, 10.03s/it]

wifi nb_columns:5299->2069


 81%|████████▏ | 79/97 [24:50<03:51, 12.88s/it]

wifi nb_columns:5859->1626


 82%|████████▏ | 80/97 [25:13<04:26, 15.69s/it]

wifi nb_columns:1013->1002


 84%|████████▎ | 81/97 [25:24<03:50, 14.43s/it]

wifi nb_columns:3798->1575


 85%|████████▍ | 82/97 [25:37<03:31, 14.09s/it]

wifi nb_columns:2326->1802


 86%|████████▌ | 83/97 [25:45<02:49, 12.11s/it]

wifi nb_columns:3441->2477


 87%|████████▋ | 84/97 [25:53<02:21, 10.90s/it]

wifi nb_columns:4394->2046


 88%|████████▊ | 85/97 [26:04<02:10, 10.89s/it]

wifi nb_columns:5331->1845


 89%|████████▊ | 86/97 [26:29<02:47, 15.27s/it]

wifi nb_columns:2334->1346


 90%|████████▉ | 87/97 [26:45<02:34, 15.50s/it]

wifi nb_columns:8261->2344


 91%|█████████ | 88/97 [27:12<02:50, 18.97s/it]

wifi nb_columns:4274->1992


 92%|█████████▏| 89/97 [27:25<02:16, 17.12s/it]

wifi nb_columns:1739->1415


 93%|█████████▎| 90/97 [27:33<01:40, 14.40s/it]

wifi nb_columns:2356->2140


 94%|█████████▍| 91/97 [27:40<01:11, 11.97s/it]

wifi nb_columns:2923->1939


 95%|█████████▍| 92/97 [27:50<00:58, 11.60s/it]

wifi nb_columns:2115->1526


 96%|█████████▌| 93/97 [27:58<00:41, 10.30s/it]

wifi nb_columns:3456->1363


 97%|█████████▋| 94/97 [28:18<00:40, 13.35s/it]

wifi nb_columns:2731->2116


 98%|█████████▊| 95/97 [28:24<00:22, 11.23s/it]

wifi nb_columns:4779->1648


 99%|█████████▉| 96/97 [28:40<00:12, 12.50s/it]

wifi nb_columns:3654->1530


100%|██████████| 97/97 [29:15<00:00, 19.35s/it]


In [8]:
# 线下验证
tmp=pd.concat([pd.Series(i) for i in precision])
tmp=pd.DataFrame({'mall_id':tmp.index,'precision':tmp.values})
t2=x_val[['mall_id','shop_id']].groupby('mall_id',as_index=False)['shop_id'].agg({'count':'count'})
t=pd.merge(tmp,t2,on='mall_id',how='inner')
sum(t.precision*t['count'])/sum(t['count']) 
#150：0.90364919322725024
#early stopping：0.90777464519520235

0.90463675315780256

In [62]:
# 保存结果
tmp=pd.concat(result)
print(tmp.shape)
tmp.to_csv('result.csv',index=False,sep=',')

(483931, 2)


ideas:

1.early_stopping多分类能否提高（貌似没有改善）

2.增大候选集到7个（貌似没有改善）/减少到3个

3.用户历史

4.二分类加入多分类概率信息

## EDA
1.很多错误答案和正确答案的shop经纬度一样

2.很多错误答案的置信度分布并不极端，分类器并不自信

In [138]:
train_proba=model.predict(train_feat)
val_proba=model.predict(val_feat)

In [140]:
pd.DataFrame(encoder.inverse_transform(val_proba.argsort(axis=1)[:,-5:]))

Unnamed: 0,0,1,2,3,4
0,s_679110,s_479529,s_712117,s_521874,s_472478
1,s_685227,s_992598,s_683217,s_682074,s_684555
2,s_682074,s_683674,s_683671,s_683053,s_684547
3,s_698556,s_473743,s_539726,s_681760,s_685058
4,s_809873,s_539726,s_3382317,s_696563,s_461325
5,s_539726,s_683832,s_1462898,s_685058,s_1114306
6,s_679110,s_683678,s_685058,s_466068,s_774995
7,s_683671,s_683821,s_557771,s_2248290,s_635423
8,s_3382317,s_809873,s_683217,s_696563,s_461325
9,s_683985,s_465792,s_698556,s_539726,s_685058


In [146]:
val_proba.sort(axis=1)
pd.DataFrame(val_proba[:,-5:])

Unnamed: 0,0,1,2,3,4
0,0.003684,0.004195,0.004405,0.009055,0.950170
1,0.020446,0.031947,0.125492,0.317416,0.410895
2,0.063578,0.069018,0.123614,0.140901,0.298582
3,0.007020,0.007237,0.007717,0.013129,0.887729
4,0.000812,0.001812,0.002056,0.002456,0.975247
5,0.000688,0.000830,0.000881,0.001080,0.987465
6,0.000191,0.000213,0.000249,0.000264,0.993952
7,0.000455,0.000478,0.000830,0.005798,0.983089
8,0.004922,0.007463,0.007641,0.007736,0.922186
9,0.000608,0.001486,0.002664,0.011097,0.976144
