# 课题实践一 计算加购次数最多的商品排行榜
## 加载原始数据并整理

In [1]:
import pandas as pd

events = pd.read_csv("./events.csv")
print(events.dtypes)

# 填充空值
events = events.fillna(0)

# 转换transactionid列为整型
events["transactionid"] = events["transactionid"].astype("int")
events["event"] = events["event"].astype("str")

# 增加一列，显示可读时间
events["action_time"] = pd.to_datetime(events["timestamp"], unit='ms')

# 展示开头的20个数据行
events.head(20)

timestamp          int64
visitorid          int64
event             object
itemid             int64
transactionid    float64
dtype: object


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,action_time
0,1433221332117,257597,view,355908,0,2015-06-02 05:02:12.117
1,1433224214164,992329,view,248676,0,2015-06-02 05:50:14.164
2,1433221999827,111016,view,318965,0,2015-06-02 05:13:19.827
3,1433221955914,483717,view,253185,0,2015-06-02 05:12:35.914
4,1433221337106,951259,view,367447,0,2015-06-02 05:02:17.106
5,1433224086234,972639,view,22556,0,2015-06-02 05:48:06.234
6,1433221923240,810725,view,443030,0,2015-06-02 05:12:03.240
7,1433223291897,794181,view,439202,0,2015-06-02 05:34:51.897
8,1433220899221,824915,view,428805,0,2015-06-02 04:54:59.221
9,1433221204592,339335,view,82389,0,2015-06-02 05:00:04.592


## 获取点击数据

In [7]:
import time

# 从日期字符串获取时间戳
def get_timestamp(dstr):
    timeArray = time.strptime(dstr, "%Y-%m-%d %H:%M:%S")
    timeStamp = int(time.mktime(timeArray)) * 1000
    return timeStamp
    
start_time_str = "2015-08-01 00:00:00"
start_timestamp = get_timestamp(start_time_str)
print(start_timestamp)

print(events.shape[0])

# 只保留有点击行为的记录，且取时间为2015-08-01 00:00:00以后的数据
events = events.loc[(events["event"] == "view") & (events["timestamp"] > start_timestamp)]
print(events.shape[0])

events.head(10)

1438358400000
2756101
837025


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,action_time
610621,1438494458916,1190029,view,338463,0,2015-08-02 05:47:38.916
610622,1438494443158,1140512,view,184998,0,2015-08-02 05:47:23.158
610623,1438493656705,1111168,view,456909,0,2015-08-02 05:34:16.705
610624,1438495148775,938839,view,73918,0,2015-08-02 05:59:08.775
610625,1438494854924,505655,view,409804,0,2015-08-02 05:54:14.924
610626,1438494833385,581285,view,133215,0,2015-08-02 05:53:53.385
610627,1438494457535,830739,view,82281,0,2015-08-02 05:47:37.535
610628,1438494944150,1084627,view,457045,0,2015-08-02 05:55:44.150
610629,1438495171613,206361,view,347025,0,2015-08-02 05:59:31.613
610630,1438494466246,581285,view,144607,0,2015-08-02 05:47:46.246


## 计算每个商品被浏览（点击）的次数

In [12]:
items = events.groupby("itemid").agg(count=("visitorid", "count")).reset_index()
items.head(20)

Unnamed: 0,itemid,count
0,3,2
1,4,2
2,6,6
3,16,4
4,19,12
5,22,1
6,25,13
7,26,2
8,29,5
9,32,20


## 按照被浏览（点击）次数排序

In [13]:
items = items.sort_values("count", ascending=False)
items.head(20)

Unnamed: 0,itemid,count
57617,187946,3365
141761,461686,1283
67309,219512,1046
58259,190000,866
98237,320130,844
117976,384302,752
133421,434782,697
20497,66752,595
139736,455183,566
29712,96924,559
