In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import gc
import time
from tqdm import tqdm
import sys
import requests
import json
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the five raw CSV files. The paths are relative, so they resolve against
# the directory the notebook kernel is running in.
app_data = pd.read_csv('../data/app.csv')
sample_data = pd.read_csv('../data/sample.csv')
test_data = pd.read_csv('../data/test.csv')
train_data = pd.read_csv('../data/train.csv')
user_data = pd.read_csv('../data/user.csv')

In [3]:
# Directory that receives the cached DataFrames written by the cells below.
pickle_path = "../pickle"
# makedirs(exist_ok=True) creates any missing parent directories and is a no-op
# when the directory already exists, so there is no check-then-create race.
os.makedirs(pickle_path, exist_ok=True)

# 处理app_data

In [4]:
# app.csv: 114,584 rows, 2 columns (deviceid, applist).
#   deviceid - the user's device id
#   applist  - the apps the user has
# App names have already been replaced by identifiers of the form
# app_1, app_2, ...
app_data.head()

Unnamed: 0,deviceid,applist
0,832aaa33cdf4a0938ba2c795eb3ffefd,[app_1 app_2 app_3 app_4 app_5 app_6 app_7 app...
1,67dd9dac18cce1a6d79e8f20eefd98ab,[app_84 app_85 app_4 app_5 app_86 app_87 app_8...
2,ddaa88b573f0ec579486de4df7852871,[app_133 ]
3,132cc4746b2ca645b37d64717bf2ccbd,[app_133 ]
4,19ffd9b567a0a0863a72aee342d2ce9d,[app_1 app_2 app_3 app_4 app_5 app_6 app_7 app...


In [5]:
app_data.isnull().any()

deviceid    False
applist     False
dtype: bool

In [6]:
def flatten_app(df, id_col='deviceid', list_col='applist'):
    """Expand a frame holding one list per row into one row per list item.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id_col`` and ``list_col``. Every ``list_col`` cell is an
        iterable of items (for example a list of app names).
    id_col : str, default 'deviceid'
        Column whose value is repeated once for every item of the row.
    list_col : str, default 'applist'
        Column holding the per-row iterable to flatten.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``id_col`` and ``list_col``, with a fresh 0..n-1 index and
        one row per item, in the original row and item order.
    """
    # Extract the arrays once instead of re-evaluating df[...].values on every
    # loop iteration.
    ids = df[id_col].values
    item_lists = df[list_col].values

    flat_ids = []
    flat_items = []
    for device_id, items in tqdm(zip(ids, item_lists), total=len(ids)):
        items = list(items)
        # Repeat the id by the actual list length rather than by a separately
        # stored length column, so the two output columns can never end up
        # with different lengths.
        flat_ids.extend([device_id] * len(items))
        flat_items.extend(items)

    new_df = pd.DataFrame()
    new_df[id_col] = flat_ids
    new_df[list_col] = flat_items

    return new_df

In [49]:
# One row per device: the raw applist strings of all rows that share a deviceid
# are concatenated, in row order, into a single string.
new_app_data = app_data.groupby('deviceid')[['applist']].agg(''.join)

In [50]:
new_app_data.head()

Unnamed: 0_level_0,applist
deviceid,Unnamed: 1_level_1
000046581b8a28c431be90c278674925,[app_133 ][app_1 ]
00016381ab699d4e76dc99291e79e7a1,[app_133 ]
0001c7e6a85a3a4498fe0c5f29f3a379,[app_133 ]
000207c515d01c00e9144c6866b546a7,[app_133 ][app_1 ]
000355d66e3fe127c8c2dd1ef60322a3,[app_84 app_85 app_4 app_5 app_86 app_87 app_8...


In [51]:
# After the groupby concatenation each value is one or more bracketed,
# space-separated lists glued together, e.g. "[app_133 ][app_1 ]".
# Turning both brackets into whitespace and calling split() with no argument
# tokenises this regardless of whether a space precedes "]", never merges the
# last token of one list with the first of the next, and yields [] (not [''])
# for an empty list.
new_app_data['applist'] = new_app_data['applist'].map(
    lambda raw: raw.replace('[', ' ').replace(']', ' ').split()
)
# Number of apps per device, used by flatten_app to repeat the deviceid.
new_app_data['app_len'] = new_app_data['applist'].map(len)
# deviceid is the index after the groupby; turn it back into a regular column.
new_app_data.reset_index(inplace = True)
deal_new_app_data = flatten_app(new_app_data)

100%|██████████| 114584/114584 [00:00<00:00, 126233.81it/s]


In [52]:
new_app_data['deviceid'].unique().shape

(114584,)

In [53]:
new_app_data.shape

(114584, 3)

In [54]:
deal_new_app_data

Unnamed: 0,deviceid,applist
0,000046581b8a28c431be90c278674925,app_133
1,000046581b8a28c431be90c278674925,app_1
2,00016381ab699d4e76dc99291e79e7a1,app_133
3,0001c7e6a85a3a4498fe0c5f29f3a379,app_133
4,000207c515d01c00e9144c6866b546a7,app_133
...,...,...
2163211,ffff31901148627b225fbb434e19ab06,app_83
2163212,ffff31901148627b225fbb434e19ab06,app_1
2163213,ffffcb1db65dd1fdc4d09f8c3bbab2ea,app_133
2163214,ffffe5944124a7b7656faa8610ba58d9,app_133


In [56]:
# Cache the per-device app table; the write is skipped when the file already
# exists on disk.
device_app_pickle = "{}/device_new_app.pickle".format(pickle_path)
if not os.path.exists(device_app_pickle):
    started = time.time()
    new_app_data.to_pickle(device_app_pickle)
    print('ACTIVE TO PICKLE: ', time.time() - started)

ACTIVE TO PICKLE:  0.5183968544006348


In [55]:
# Cache the flattened (deviceid, app) table; the write is skipped when the file
# already exists on disk.
flat_app_pickle = "{}/deal_device_new_app.pickle".format(pickle_path)
if not os.path.exists(flat_app_pickle):
    started = time.time()
    deal_new_app_data.to_pickle(flat_app_pickle)
    print('ACTIVE TO PICKLE: ', time.time() - started)

ACTIVE TO PICKLE:  0.5924808979034424


# sample_data 提交示例

In [4]:
# sample.csv shows the submission format.
#   id     - corresponds to the id column of test.csv
#   target - your prediction of whether the video will be clicked
#            (1 = clicked, 0 = not clicked)
sample_data.head()

Unnamed: 0,id,target
0,test_1,0
1,test_2,1
2,test_3,0
3,test_4,1
4,test_5,0


# test data

In [4]:
# test.csv: 3,653,592 rows, 13 columns:
#   id, deviceid, newsid, guid, pos, app_version, device_vendor,
#   netmodel, osversion, lng, lat, device_version, ts
#
#   id             - row label, from test_1 to test_3653592
#   deviceid       - the user's device id
#   newsid         - the video id
#   guid           - the user's registration id
#   pos            - position at which the video was recommended
#   app_version    - app version
#   device_vendor  - device manufacturer
#   netmodel       - network type
#   osversion      - operating-system version
#   lng            - longitude
#   lat            - latitude
#   device_version - device version
#   ts             - timestamp at which the video was shown to the user
test_data.head()

Unnamed: 0,id,deviceid,newsid,guid,pos,app_version,device_vendor,netmodel,osversion,lng,lat,device_version,ts
0,test_1,67dd9dac18cce1a6d79e8f20eefd98ab,1154231839239097584,625dc45744f59ddbc3ec8df161217188,1,2.1.1,xiaomi,w,9,116.750876,36.56831,Redmi Note 7,1573421928698
1,test_2,67dd9dac18cce1a6d79e8f20eefd98ab,1703600900425186754,625dc45744f59ddbc3ec8df161217188,0,2.1.1,xiaomi,o,9,116.750867,36.56832,Redmi Note 7,1573421928703
2,test_3,67dd9dac18cce1a6d79e8f20eefd98ab,5061374784471997927,625dc45744f59ddbc3ec8df161217188,2,2.1.1,xiaomi,w,9,116.750866,36.568292,Redmi Note 7,1573423322908
3,test_4,67dd9dac18cce1a6d79e8f20eefd98ab,5678985365124870302,625dc45744f59ddbc3ec8df161217188,1,2.1.1,xiaomi,w,9,116.750866,36.568292,Redmi Note 7,1573423323068
4,test_5,67dd9dac18cce1a6d79e8f20eefd98ab,7231570878174745072,625dc45744f59ddbc3ec8df161217188,0,2.1.1,xiaomi,o,9,116.75088,36.5683,Redmi Note 7,1573423196460


In [None]:
test_data['deviceid'].unique().shape

In [97]:
test_data['newsid'].unique().shape

(626908,)

In [99]:
# Cache the raw test frame as a pickle; the write is skipped when the file
# already exists on disk.
test_pickle = "{}/test_data.pickle".format(pickle_path)
if not os.path.exists(test_pickle):
    started = time.time()
    test_data.to_pickle(test_pickle)
    print('ACTIVE TO PICKLE: ', time.time() - started)

ACTIVE TO PICKLE:  12.503127574920654


# train data

In [6]:
# Compared with test.csv, train.csv has two extra columns: target and timestamp.
train_data.head()

Unnamed: 0,id,target,timestamp,deviceid,newsid,guid,pos,app_version,device_vendor,netmodel,osversion,lng,lat,device_version,ts
0,1,0,,8b2d7f2aed47ab32e9c6ae4f5ae00147,8008333091915950969,9a2c909ebc47aec49d9c160cdb4a6572,1,2.1.5,HONOR,g4,9,112.5385,37.83793,STF-AL00,1573298086436
1,2,0,,8b2d7f2aed47ab32e9c6ae4f5ae00147,8008333091915950969,9a2c909ebc47aec49d9c160cdb4a6572,1,2.1.5,HONOR,w,9,111.7312,35.62274,STF-AL00,1573298087570
2,3,0,,832aaa33cdf4a0938ba2c795eb3ffefd,4941885624885390992,d51a157d2b1e0e9aed4dd7f9900b85b2,2,1.9.9,vivo,w,8.1.0,5e-324,5e-324,V1818T,1573377075934
3,4,0,,832aaa33cdf4a0938ba2c795eb3ffefd,6088376349846612406,d51a157d2b1e0e9aed4dd7f9900b85b2,1,1.9.9,vivo,w,8.1.0,5e-324,5e-324,V1818T,1573377044359
4,5,0,,67dd9dac18cce1a6d79e8f20eefd98ab,5343094189765291622,625dc45744f59ddbc3ec8df161217188,0,2.1.1,xiaomi,w,9,116.7509,36.56831,Redmi Note 7,1573380989662


In [100]:
train_data.shape

(11376681, 15)

In [101]:
train_data['deviceid'].unique().shape

(104736,)

In [102]:
train_data['newsid'].unique().shape

(1152851,)

In [103]:
train_data['guid'].unique().shape

(104333,)

In [106]:
# Cache the raw train frame as a pickle; the write is skipped when the file
# already exists on disk.
train_pickle = "{}/train_data.pickle".format(pickle_path)
if not os.path.exists(train_pickle):
    started = time.time()
    train_data.to_pickle(train_pickle)
    print('ACTIVE TO PICKLE: ', time.time() - started)

ACTIVE TO PICKLE:  31.221115827560425


# 处理user数据

In [57]:
# user.csv: 128,573 rows, 9 columns:
#   deviceid, guid, outertag, tag, level, personidentification,
#   followscore, personalscore, gender
#
#   deviceid             - the user's device id
#   guid                 - the user's registration id
#   outertag             - user-profile tags separated by '|'; the number after
#                          the colon says how well the tag fits the user
#                          (higher score = better fit)
#   tag                  - same format as outertag
#   level                - user level
#   personidentification - 1 = low-quality user, 0 = normal user
#   followscore          - "apprentice" (friend) score
#   personalscore        - personal score
#   gender               - gender
user_data.head(5)

Unnamed: 0,deviceid,guid,outertag,tag,level,personidentification,followscore,personalscore,gender
0,dd4f4cbcc9733f8de667a99b7f375b99,,,,,,,,
1,e9b1196a3fc0603c55614caf35c26ce5,,,天文_cs:7.456377740584219|地球_cs:6.98583086684807...,,,,,
2,db10a6fa198cb4d0aa7121642b8370b9,,穿秀_cs:1.2165785692152642|社会热点_cs:0.40078686267...,一起来看流星雨_cs:6.040606247799313|都市_cs:4.602140126...,,,,,
3,c52c040529eafef29b60c7b2b28cdf6f,,,,,,,,
4,bb08d2ff7f124e7fd6d00fd1a1a3eb36,,,,,,,,


In [58]:
user_data.isnull().any()

deviceid                False
guid                     True
outertag                 True
tag                      True
level                    True
personidentification     True
followscore              True
personalscore            True
gender                   True
dtype: bool

In [59]:
user_data['level'].unique()

array([nan,  1.,  2., 11., 13., 12., 21., 51., 23., 33., 22., 31., 41.,
       43., 53.])

In [60]:
# Missing level -> 0. 0 is not among the level values listed by the previous
# cell, so it works as a distinct "unknown" marker.
user_data['level'] = user_data['level'].fillna(0)

In [61]:
user_data['followscore'].unique()

array([ nan, 10. ,  9.5, 12.3, 11.8, 10.9,  9.2, 11.6, 12.7, 12.5,  9.4,
       12.4, 10.8, 10.5, 13. , 12.2, 11.1, 11.3, 17.7,  9.3, 11.9, 11.2,
       12.8, 17.3, 11.5, 17.2, 10.1, 16.9, 16.3, 14.4, 11.4, 10.6, 12.1,
       14.3, 10.2, 11. , 12. , 19. , 10.4, 18. , 13.6, 15.8, 20.6, 10.7,
       11.7, 15.4,  9.1, 14.6, 14.8, 16.5, 10.3,  2.3, 15.9, 16.7, 14.2,
       15.5,  9.7, 15.7, 14.1, 14. , 17.6,  7.3, 12.9, 13.4,  5.6, 13.7,
        9. , 17.5, 17.1, 16.1, 18.1, 15.2, 15. , 19.5, 14.7, 13.3, 13.9,
       17.8,  9.6, 22.3, 12.6, 18.2,  8.3,  8.7, 17. , 16.4, 16.8,  7.6,
       21.5, 13.5,  8.2, 15.6, 16.6,  6.4, 19.1, 19.9,  4.6, 15.3,  9.9,
       16.2, 22.1, 20.4, 15.1, 20.9, 18.6,  3.5, 18.4, 19.7, 13.1, 14.9,
       14.5,  7.1, 20.5, 13.2,  8.9, 17.9,  4.3, 19.4, 18.3,  9.8,  8. ,
        7.8, 19.6, 22. , 22.8, 19.8,  4.7, 21.6,  3.8, 13.8, 21.4, 19.2,
        5.3, 16. ,  6.7,  7.9, 18.9,  7.4,  6. ,  7.5,  8.5, 18.7, 20.1,
       20.3, 20.2, 18.8,  7.7,  6.3, 21.1,  5.2, 17

In [62]:
# Missing followscore -> 0.
# NOTE(review): confirm that 0 cannot also be a genuine score; otherwise
# "missing" and "zero" become indistinguishable after this fill.
user_data['followscore'] = user_data['followscore'].fillna(0)

In [63]:
user_data['personalscore'].unique()

array([ nan, 20.5, 10. , 18. ,  7.5,  8. , 19. , 13.5, 15. , 28. , 12. ,
       24.5, 22.5, 20. , 23.5, 16. , 11. , 14. , 14.5,  8.5,  4. ,  9. ,
       19.5, 23. , 25. ,  1.5, 21. , 26. ,  0. , 13. , -1.5, 18.5,  6. ,
       22. , 17. ,  3.5, 15.5, -1. , 12.5,  4.5, 26.5, 11.5, 24. ,  3. ,
        1. ,  2. , 17.5,  5. , 16.5, 27. ,  6.5,  9.5,  2.5, 21.5,  0.5,
       10.5, 25.5,  5.5, 27.5, -0.5,  7. , -2. , -2.5, -3. , -4. ])

In [64]:
user_data['guid'].isnull().sum() / user_data['guid'].shape[0]

0.3431902498969457

In [65]:
user_data['gender'].unique()

array([nan,  0.,  2.,  1.])

In [66]:
user_data['gender'].value_counts()

0.0    36427
2.0     9619
1.0     9514
Name: gender, dtype: int64

In [67]:
user_data['gender'].isnull().sum() / user_data['gender'].shape[0]

0.5678719482317438

In [68]:
# Missing gender -> -1, which keeps it apart from the observed codes 0, 1 and 2.
user_data['gender'] = user_data['gender'].fillna(-1)

In [69]:
user_data.isnull().any()

deviceid                False
guid                     True
outertag                 True
tag                      True
level                   False
personidentification     True
followscore             False
personalscore            True
gender                  False
dtype: bool

In [107]:
# Cache the user frame (after the fillna steps above) as a pickle; the write is
# skipped when the file already exists on disk.
user_pickle = "{}/user_data.pickle".format(pickle_path)
if not os.path.exists(user_pickle):
    started = time.time()
    user_data.to_pickle(user_pickle)
    print('ACTIVE TO PICKLE: ', time.time() - started)

ACTIVE TO PICKLE:  3.95857310295105


In [4]:
# Take an explicit copy: the following cells add columns to tag_data, and doing
# that on a selection from user_data is what pandas flags with
# SettingWithCopyWarning (silenced in this notebook by warnings.filterwarnings).
tag_data = user_data[['deviceid','outertag','tag']].copy()

In [5]:
# Split the '|'-separated outertag string into a list of "name:score" items.
# Missing values are passed through unchanged; pd.isna is used instead of an
# identity test against np.nan, which only recognises the np.nan singleton and
# would turn any other NaN object into the list ['nan'].
tag_data['new_tag'] = tag_data['outertag'].apply(lambda x: x if pd.isna(x) else str(x).split('|'))

In [6]:
# Split the '|'-separated tag string into a list of "name:score" items.
# Missing values are passed through unchanged; pd.isna is used instead of an
# identity test against np.nan, which only recognises the np.nan singleton and
# would turn any other NaN object into the list ['nan'].
tag_data['new_tag_2'] = tag_data['tag'].apply(lambda x: x if pd.isna(x) else str(x).split('|'))

In [7]:
tag_data

Unnamed: 0,deviceid,outertag,tag,new_tag,new_tag_2
0,dd4f4cbcc9733f8de667a99b7f375b99,,,,
1,e9b1196a3fc0603c55614caf35c26ce5,,天文_cs:7.456377740584219|地球_cs:6.98583086684807...,,"[天文_cs:7.456377740584219, 地球_cs:6.985830866848..."
2,db10a6fa198cb4d0aa7121642b8370b9,穿秀_cs:1.2165785692152642|社会热点_cs:0.40078686267...,一起来看流星雨_cs:6.040606247799313|都市_cs:4.602140126...,"[穿秀_cs:1.2165785692152642, 社会热点_cs:0.400786862...","[一起来看流星雨_cs:6.040606247799313, 都市_cs:4.6021401..."
3,c52c040529eafef29b60c7b2b28cdf6f,,,,
4,bb08d2ff7f124e7fd6d00fd1a1a3eb36,,,,
...,...,...,...,...,...
128568,894d831564b80393dc17759fe5de6ac2,,内衣_cs:13.680843133280433|泳装_cs:12.111698872653...,,"[内衣_cs:13.680843133280433, 泳装_cs:12.1116988726..."
128569,7672e8b6b35fd86fdba6adc642a155fa,,,,
128570,c141551bb220b831a7e3a96afcc3da0e,萌娃_cs:0.4405755620303141|手工_cs:0.4367766119612...,女性时尚_cs:6.143540257568882|性感_cs:5.317837258451...,"[萌娃_cs:0.4405755620303141, 手工_cs:0.43677661196...","[女性时尚_cs:6.143540257568882, 性感_cs:5.3178372584..."
128571,575c7bc6ba73a6da9f50bd1098d28b51,,老虎_cs:54.30925671036804|美女_cs:5.5503764860383855,,"[老虎_cs:54.30925671036804, 美女_cs:5.550376486038..."


In [8]:
def _format_all_tag(outer_items, inner_items):
    """Return str(list) of the outertag items followed by the tag items.

    A cell that is not a list (i.e. a missing value) contributes nothing.
    When neither cell is a list the string 'nan' is returned, which is what the
    downstream cells expect for "no tags".
    """
    merged = []
    for cell in (outer_items, inner_items):
        if isinstance(cell, list):
            merged.extend(cell)
    return str(merged) if merged else 'nan'


# Merge the two lists first and stringify once. This gives the same text as
# concatenating the two list reprs and patching the seam with str.replace, but
# it cannot alter tag text that happens to contain "],[" or "nan,[", and it
# avoids a replace() pass over every column of the frame.
tag_data['all_tag'] = [
    _format_all_tag(outer_items, inner_items)
    for outer_items, inner_items in zip(tag_data['new_tag'], tag_data['new_tag_2'])
]

In [9]:
tag_data.loc[2,'all_tag']

"['穿秀_cs:1.2165785692152642', '社会热点_cs:0.4007868626782047', '美妆_cs:0.3865336300642309', '美食_cs:0.26986132658795936', '一起来看流星雨_cs:6.040606247799313', '都市_cs:4.602140126675309', '张世_cs:3.9795760090872885', '食谱_cs:3.7462326434377453', '插翅难逃_cs:3.6745923528757576', '郑爽_cs:3.431694067701842', '情感_cs:3.247987477051344', '排骨_cs:3.0198338075184705', '撩妹_cs:2.342867174862778', '街采_cs:2.259804270985078', '云海_cs:2.0950671633453832', '学霸_cs:2.089995498740284', '张翰_cs:2.017445052672142', '免费_cs:2.010700610834794', '老照片_cs:1.9958837135628291', '隐婚_cs:1.942033782102077', '好戏_cs:1.9060189818210826', '绑架_cs:1.8848316244947665', '手段_cs:1.8639853540934332', '菜谱_cs:1.8448356310485905', '影视_cs:1.8175078682633468', '极品_cs:1.8069768667020591', '小保姆_cs:1.7767926127848894', '腹肌_cs:1.768053095576293', '黑老大_cs:1.697710765213656', '原来是这样_cs:1.688611985853882', '解酒_cs:1.6775192120459403', '早恋_cs:1.6667995589519407', '木瓜_cs:1.6168895816527447', '饭店_cs:1.5609737757491737', '老婆_cs:1.5280594734389132', '流星雨_cs:1.46536

In [10]:
# Raw string: the pattern contains backslash escapes such as \! and \% that are
# not valid Python string escapes; in a normal string literal they raise an
# invalid-escape warning. The compiled pattern is unchanged.
# It deletes ASCII letters, digits and the listed punctuation, which removes the
# "_cs:<score>" part and the list brackets/commas from the str(list) text.
# NOTE(review): ASCII letters/digits inside a tag name are deleted as well, so a
# tag written purely in ASCII collapses to an empty token - verify this is intended.
_NON_TAG_CHARS = re.compile(r"[A-Za-z0-9\!\%\[\]\,\。\_\:\.]")
tag_data['all_tag_word'] = tag_data['all_tag'].map(lambda x: _NON_TAG_CHARS.sub("", x))
# Rows with no tags are empty at this point; label them '未知'.
tag_data['all_tag_word'] = tag_data['all_tag_word'].replace("", '未知')
# Drop the quotes left over from the list repr and split on single spaces.
tag_data['all_tag_word'] = tag_data['all_tag_word'].map(lambda x: x.replace("'", "").split(' '))

In [11]:
# Capture only the number that directly follows a ':' (the score in
# "name_cs:<score>"). Matching every digit run in the text would also pick up
# digits that are part of a tag name and would split a value written in
# scientific notation into two numbers, leaving the weight list out of step
# with the word list. Raw string avoids invalid-escape warnings for \d and \. .
# The weights stay strings, as before.
_TAG_WEIGHT = re.compile(r":(-?\d+\.?\d*(?:[eE][-+]?\d+)?)")
tag_data['all_tag_weight'] = tag_data['all_tag'].map(_TAG_WEIGHT.findall)
# Rows without any score get the single placeholder weight 0.
tag_data['all_tag_weight'] = tag_data['all_tag_weight'].map(lambda x: [0] if len(x) == 0 else x)

In [12]:
# tag_data['all_tag_weight']= tag_data['all_tag_weight'].replace("",0)

In [13]:
tag_data = tag_data[['deviceid','all_tag_word','all_tag_weight']]

In [14]:
tag_data.loc[21,'all_tag_word']

['娱乐资讯', '梅艳芳', '张学友', '豪车', '赵本山', '刘德华', '音乐', '明星', '港台娱乐']

In [15]:
tag_data.loc[21,'all_tag_weight']

['1.7289681671404824',
 '17.12037108613975',
 '16.844249659324227',
 '15.163125851055444',
 '11.76896689897077',
 '11.509515101601869',
 '6.006569478851135',
 '0.5520065602510347',
 '0.30967708613082306']

In [16]:
# Merge rows that share a deviceid. The two remaining columns hold Python lists,
# so "summing" them concatenates the lists: each device ends up with one
# combined word list and one combined weight list. deviceid becomes the index.
tag_data = tag_data.groupby(by='deviceid').sum() 

In [17]:
tag_data.reset_index(inplace = True)

In [18]:
tag_data.head()

Unnamed: 0,deviceid,all_tag_word,all_tag_weight
0,000046581b8a28c431be90c278674925,"[美食, --其他, 美食攻略, 花絮片段, 玩具, 吃秀, 社会热点, 中医, 片段, 大...","[0.4171913341996304, 0.36140167938226964, 0.35..."
1,00016381ab699d4e76dc99291e79e7a1,[未知],[0]
2,0001c7e6a85a3a4498fe0c5f29f3a379,"[社会热点, --其他, 古代, 范冰冰, 台湾, 李治廷, 彦希, 灰姑娘, 清朝, 总裁...","[0.8310844893612963, 0.3135020218516166, 6.367..."
3,000207c515d01c00e9144c6866b546a7,"[海军, 航母, 导弹, 武器, 武器, 导弹, 洲际导弹, 大妈, 海军, 航母, 网游,...","[17.15805189101101, 13.780793638746603, 13.220..."
4,000355d66e3fe127c8c2dd1ef60322a3,"[东北, 大盘, 菜谱]","[37.141856323864594, 35.747926949211916, 4.949..."


In [19]:
# Remove empty-string tokens from every word list.
# NOTE(review): the matching weight is not removed along with the word, so the
# word and weight lists can end up with different lengths - verify alignment.
tag_data['all_tag_word'] = tag_data['all_tag_word'].map(
    lambda words: [word for word in words if word != '']
)

In [20]:
# Remove empty-string entries from every weight list.
# NOTE(review): the weights come from a regex that requires at least one digit
# (or are the placeholder 0), so this filter looks like a no-op - confirm.
tag_data['all_tag_weight'] = tag_data['all_tag_weight'].map(
    lambda weights: [weight for weight in weights if weight != '']
)

In [21]:
tag_data.shape

(114584, 3)

In [22]:
tag_data['deviceid'].unique().shape

(114584,)

In [23]:
tag_data[['deviceid','all_tag_word','all_tag_weight']]

Unnamed: 0,deviceid,all_tag_word,all_tag_weight
0,000046581b8a28c431be90c278674925,"[美食, --其他, 美食攻略, 花絮片段, 玩具, 吃秀, 社会热点, 中医, 片段, 大...","[0.4171913341996304, 0.36140167938226964, 0.35..."
1,00016381ab699d4e76dc99291e79e7a1,[未知],[0]
2,0001c7e6a85a3a4498fe0c5f29f3a379,"[社会热点, --其他, 古代, 范冰冰, 台湾, 李治廷, 彦希, 灰姑娘, 清朝, 总裁...","[0.8310844893612963, 0.3135020218516166, 6.367..."
3,000207c515d01c00e9144c6866b546a7,"[海军, 航母, 导弹, 武器, 武器, 导弹, 洲际导弹, 大妈, 海军, 航母, 网游,...","[17.15805189101101, 13.780793638746603, 13.220..."
4,000355d66e3fe127c8c2dd1ef60322a3,"[东北, 大盘, 菜谱]","[37.141856323864594, 35.747926949211916, 4.949..."
...,...,...,...
114579,fffe93df08d65513d293b7c376ca9349,"[未知, 未知]","[0, 0]"
114580,fffecf99aa7d5bdd2155c1f4093154d6,"[郑伊健, 陈小春, 香港, 演唱会, 音乐, 怀孕, 港台娱乐]","[30.232701285055935, 17.267250194945827, 12.15..."
114581,ffff31901148627b225fbb434e19ab06,"[萌宠, 花絮片段, 宠物狗, 蟒蛇, 龙珠, 赵刚, 自然地理, 巨蟒, 宠物, 中国历史...","[0.9336857225544823, 0.2993210383565363, 5.557..."
114582,ffffcb1db65dd1fdc4d09f8c3bbab2ea,[未知],[0]


In [24]:
# Cache the per-device tag words and weights.
# The guard now tests the same file that is written. It used to test
# "tag_weight_data.pickle" while writing "tag_weight_new_data.pickle", so the
# check never reflected whether the output was already on disk.
tag_weight_pickle = "{}/tag_weight_new_data.pickle".format(pickle_path)
if not os.path.exists(tag_weight_pickle):
    t1 = time.time()
    tag_data[['deviceid','all_tag_word','all_tag_weight']].to_pickle(tag_weight_pickle)
    print('ACTIVE TO PICKLE: ',time.time()-t1)

ACTIVE TO PICKLE:  2.9672656059265137


In [27]:
def flatten_app(df, id_col='deviceid', list_col='all_tag_word'):
    """Expand a frame holding one list per row into one row per list item.

    NOTE(review): this rebinds the name ``flatten_app`` that an earlier cell
    defines for the app lists; after this cell runs, the default list column
    is 'all_tag_word'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id_col`` and ``list_col``. Every ``list_col`` cell is an
        iterable of items (here: a list of tag words).
    id_col : str, default 'deviceid'
        Column whose value is repeated once for every item of the row.
    list_col : str, default 'all_tag_word'
        Column holding the per-row iterable to flatten.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``id_col`` and ``list_col``, with a fresh 0..n-1 index and
        one row per item, in the original row and item order.
    """
    # Extract the arrays once instead of re-evaluating df[...].values on every
    # loop iteration.
    ids = df[id_col].values
    item_lists = df[list_col].values

    flat_ids = []
    flat_items = []
    for device_id, items in tqdm(zip(ids, item_lists), total=len(ids)):
        items = list(items)
        # Repeat the id by the actual list length, so no precomputed 'tag_len'
        # column is needed and the two output columns always line up.
        flat_ids.extend([device_id] * len(items))
        flat_items.extend(items)

    new_df = pd.DataFrame()
    new_df[id_col] = flat_ids
    new_df[list_col] = flat_items

    return new_df

In [25]:
# Number of tag words per device.
tag_data['tag_len'] = tag_data['all_tag_word'].map(len)

In [33]:
deal_tag_data = flatten_app(tag_data)

100%|██████████| 114584/114584 [00:00<00:00, 120284.51it/s]


In [34]:
deal_tag_data.shape

(4673627, 2)

In [35]:
# Keep only the first occurrence of every (deviceid, tag word) pair.
deal_tag_data = deal_tag_data.drop_duplicates(subset=['deviceid', 'all_tag_word'])

In [36]:
deal_tag_data.shape

(4321790, 2)

In [37]:
# Cache the flattened, de-duplicated (deviceid, tag word) table.
# The guard now tests the same file that is written. It used to test
# "del_tag_data.pickle" while writing "deal_tag_new_data.pickle", so the check
# never reflected whether the output was already on disk.
deal_tag_pickle = "{}/deal_tag_new_data.pickle".format(pickle_path)
if not os.path.exists(deal_tag_pickle):
    t1 = time.time()
    deal_tag_data.to_pickle(deal_tag_pickle)
    print('ACTIVE TO PICKLE: ',time.time()-t1)

ACTIVE TO PICKLE:  1.6340346336364746
