# 用户和活动关联关系处理


events.csv 中的活动数量非常大，因此下面的处理只保留在训练集（train.csv）和测试集（test.csv）中出现过的用户和活动，并为它们重新编制索引。

In [33]:
#保存数据
import pickle

import itertools

#处理事件字符串
import datetime
import pandas as pd
import numpy as np
import scipy.io as sio
import scipy.sparse as ss

#相似度/距离
import scipy.spatial.distance as ssd

from collections import defaultdict
from sklearn.preprocessing import normalize

In [1]:
 """
我们只关心 train.csv 和 test.csv 中出现的 user 和 event，因此重点处理这部分关联数据。

train.csv 有6列：
user：用户ID
event：活动ID
invited：是否被邀请（0/1）
timestamp：ISO-8601 UTC格式时间字符串，表示用户看到该活动的时间
interested：用户是否对该活动感兴趣（0/1）
not_interested：用户是否明确表示对该活动不感兴趣（0/1）

test.csv 除了没有 interested 和 not_interested 两列，其余列与 train.csv 相同。
 """
    
# Collect the distinct users and events that appear in train.csv / test.csv.
# IDs are kept exactly as read from the files (raw bytes, not decoded).
uniqueUsers = set()
uniqueEvents = set()

# Inverted indexes (user -> events, event -> users). They are only created
# here; this loop does not fill them.
eventsForUser = defaultdict(set)
usersForEvent = defaultdict(set)

for filename in ["train.csv", "test.csv"]:
    # `with` closes the handle even if a row fails to parse.
    with open(filename, 'rb') as f:
        f.readline()  # skip the header row (column names)

        for line in f:
            cols = line.strip().split(b",")
            if len(cols) < 2:
                # Blank or truncated line (e.g. a trailing newline at the end
                # of the file): there is no user/event pair to record.
                continue
            uniqueUsers.add(cols[0])    # column 0: user ID
            uniqueEvents.add(cols[1])   # column 1: event ID


n_uniqueUsers = len(uniqueUsers)
n_uniqueEvents = len(uniqueEvents)

print("number of uniqueUsers :%d" % n_uniqueUsers)
print("number of uniqueEvents :%d" % n_uniqueEvents)
# User-event relation matrix, usable later as input to LFM / SVD++.
# Sparse DOK matrix: entry [i, j] holds the value of column 4 (`interested`)
# of train.csv for user index i and event index j.
userEventScores = ss.dok_matrix((n_uniqueUsers, n_uniqueEvents))

# Re-index users and events: raw ID (bytes) -> dense 0-based index.
# The numbering follows set iteration order, so it is only meaningful together
# with these two dictionaries.
userIndex = {u: i for i, u in enumerate(uniqueUsers)}
eventIndex = {e: i for i, e in enumerate(uniqueEvents)}

n_records = 0  # NOTE(review): never incremented in this cell
with open("train.csv", 'rb') as ftrain:
    ftrain.readline()  # skip the header row
    for line in ftrain:
        cols = line.strip().split(b",")
        if len(cols) < 5:
            # Blank or truncated line: no `interested` column to read.
            continue
        i = userIndex[cols[0]]   # user index
        j = eventIndex[cols[1]]  # event index

        # Inverted indexes, keyed by the dense indexes (not the raw IDs).
        # NOTE(review): only train.csv rows are recorded here; confirm that
        # leaving out test.csv is intended.
        eventsForUser[i].add(j)
        usersForEvent[j].add(i)

        # Only `interested` is stored. A 0 is indistinguishable from "no
        # record" in a sparse matrix.
        userEventScores[i, j] = int(cols[4])

  
# Persist the intermediate results for later steps. Every pickle is written
# inside a `with` block so the file is flushed and closed deterministically
# instead of whenever the anonymous handle happens to be garbage-collected.

# Events recorded per user; later used to propagate friends' events to a user.
with open("PE_eventsForUser.pkl", 'wb') as fout:
    pickle.dump(eventsForUser, fout)
# Users recorded per event.
with open("PE_usersForEvent.pkl", 'wb') as fout:
    pickle.dump(usersForEvent, fout)

# User-event relation matrix, saved in Matrix Market format.
sio.mmwrite("PE_userEventScores", userEventScores)

# Raw user ID -> index table.
with open("PE_userIndex.pkl", 'wb') as fout:
    pickle.dump(userIndex, fout)
# Raw event ID -> index table.
with open("PE_eventIndex.pkl", 'wb') as fout:
    pickle.dump(eventIndex, fout)

    
# To avoid unnecessary similarity computations later, collect only the
# "related" pairs:
#   - related users:  two users recorded on at least one common event
#   - related events: two events recorded for at least one common user
# Pairs are tuples of the dense indexes stored in usersForEvent / eventsForUser.
uniqueUserPairs = set()
uniqueEventPairs = set()

# Iterating over the stored values (instead of indexing the defaultdict with
# every known event/user) avoids inserting empty sets as a side effect.
for event_users in usersForEvent.values():
    # Two users are already enough to form a pair (the old `> 2` test
    # silently dropped events shared by exactly two users).
    if len(event_users) >= 2:
        uniqueUserPairs.update(itertools.combinations(event_users, 2))

for user_events in eventsForUser.values():
    if len(user_events) >= 2:
        uniqueEventPairs.update(itertools.combinations(user_events, 2))

# Save both pair sets.
# NOTE(review): the user-pair file uses the "FE_" prefix while every other
# artifact uses "PE_"; the name is kept as-is because readers of this file
# are not visible here.
with open("FE_uniqueUserPairs.pkl", 'wb') as fout:
    pickle.dump(uniqueUserPairs, fout)
with open("PE_uniqueEventPairs.pkl", 'wb') as fout:
    pickle.dump(uniqueEventPairs, fout)

NameError: name 'defaultdict' is not defined

In [13]:
#训练集和测试集中出现的用户数目和事件数目远小于users.csv出现的用户数和events.csv出现的事件数

In [41]:
# Freeze the event-ID set into a list, then build a parallel list holding the
# same IDs converted from raw bytes to int.
uniqueEvents = list(uniqueEvents)
uniqueEvents1 = [int(raw_id.decode()) for raw_id in uniqueEvents]
uniqueEvents1

[1379185675,
 3189486214,
 4090120152,
 899668139,
 1077498346,
 1089038689,
 2969611776,
 2843648870,
 760152605,
 692075109,
 1579687184,
 1033318499,
 1419057761,
 3712819742,
 622120837,
 1399415999,
 3401681420,
 2732810444,
 2760876250,
 3254851447,
 1660678384,
 1971855649,
 3637523569,
 477813398,
 21534034,
 1338116650,
 3100093322,
 3509870432,
 573886273,
 3950286482,
 1294336672,
 2043455253,
 4223429256,
 1295729468,
 625259568,
 3077019139,
 1454534917,
 3687555234,
 2702494308,
 4220107080,
 3392552803,
 3599006720,
 2879361619,
 1933542442,
 2114371180,
 414015310,
 4228109405,
 4040372452,
 3064874467,
 3643976476,
 1486542415,
 2193505055,
 3039652825,
 1838041662,
 141146193,
 414842182,
 2953583082,
 1521445358,
 3701547443,
 960296146,
 1095829668,
 1997149065,
 201564442,
 3199154012,
 3382385081,
 2190369261,
 1330332438,
 2925109008,
 47443771,
 2534465304,
 334681152,
 1470185288,
 4055585686,
 2429693803,
 3733069609,
 2107263424,
 721858430,
 20381932,
 14665

In [43]:
# Load the complete event table from events.csv and display it.
events_data = pd.read_csv(filepath_or_buffer='events.csv')
events_data

Unnamed: 0,event_id,user_id,start_time,city,state,zip,country,lat,lng,c_1,...,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other
0,684921758,3647864012,2012-10-31T00:00:00.001Z,,,,,,,2,...,0,1,0,0,0,0,0,0,0,9
1,244999119,3476440521,2012-11-03T00:00:00.001Z,,,,,,,2,...,0,0,0,0,0,0,0,0,0,7
2,3928440935,517514445,2012-11-05T00:00:00.001Z,,,,,,,0,...,0,0,0,0,0,0,0,0,0,12
3,2582345152,781585781,2012-10-30T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,8
4,1051165850,1016098580,2012-09-27T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,9
5,1212611096,1426522332,2012-11-16T00:00:00.001Z,,,,,,,0,...,0,0,0,0,0,0,0,0,0,22
6,3689283674,725266702,2012-11-02T20:00:00.003Z,,,,,,,0,...,0,0,0,0,0,0,0,0,0,28
7,2584113432,613687941,2012-10-31T00:00:00.001Z,,,,,,,0,...,2,0,0,0,0,0,0,0,0,354
8,3365728297,1098509207,2012-10-31T00:00:00.001Z,,,,,47.058,21.926,0,...,0,0,0,0,0,0,0,1,0,25
9,2912638473,3598071768,2012-10-18T00:00:00.001Z,,,,,,,1,...,0,0,0,0,0,0,0,0,0,3


In [130]:
# Keep only the rows of events_data whose event_id occurs in uniqueEvents1.
#
# This replaces a per-ID loop built on DataFrame.append, which was removed in
# pandas 2.0 and rescanned the whole table once per ID. A single isin() mask
# does the selection in one pass and keeps the column dtypes of events_data.
columns = events_data.columns

# Position of every wanted event ID in uniqueEvents1. Sorting the selected
# rows by it (stable sort) reproduces the row order the per-ID loop produced.
position = {event_id: pos for pos, event_id in enumerate(uniqueEvents1)}

selected = events_data.loc[events_data.event_id.isin(uniqueEvents1)]
row_order = np.argsort(selected.event_id.map(position).to_numpy(), kind="stable")
events = selected.iloc[row_order].reset_index(drop=True)



In [131]:
# Print the filtered event table to check the selection.
print(events)

         event_id     user_id                start_time            city  \
0      1379185675   947978913  2012-09-13T02:00:00.003Z         Toronto   
1      3189486214  1176436376  2012-10-28T20:00:00.003Z             NaN   
2      4090120152  3416671649  2012-10-13T05:00:00.003Z      Long Beach   
3       899668139  1558385303  2012-11-18T04:00:00.003Z             NaN   
4      1077498346  3631515235  2012-11-12T00:00:00.001Z             NaN   
5      1089038689  2471762109  2012-09-26T00:00:00.001Z           Medan   
6      2969611776  1626472083  2012-10-04T12:00:00.003Z           Nilai   
7      2843648870  2577978364  2012-11-05T03:00:00.003Z      Long Beach   
8       760152605   415464198  2012-08-01T02:00:00.000Z         Toronto   
9       692075109    78587940  2012-10-28T01:00:00.003Z             NaN   
10     1579687184  3352366411  2012-09-28T02:00:00.003Z        Bellevue   
11     1033318499   137154249  2012-10-05T18:00:00.003Z         Ferizaj   
12     1419057761  366149

In [132]:
# Write the filtered event table to events2.csv using GBK encoding.
events.to_csv(path_or_buf='events2.csv', encoding="gbk")