# 基于Taptap平台：中国地区手机游戏数据分析报告

## 功能描述：


### 输入：taptap平台url链接 爬取http://www.taptap.com（全站上架游戏）

### 提取：网页关键信息
#### 游戏名称(name), 游戏种类(category),游戏id(id)
#### 游戏评分（IOS平台评分ios_score,Android平台评分android_score,总评分total_score),
#### 安装人数(download_times),厂商(publisher),关联关键词(related_keyword),
#### 评论次数(review),论坛话题次数(topic),支持语言(languages),
#### 用户评论(comment),用户评论时间(comment_dt),
#### 话题标题(topic_title),话题发布时间(topic_dt), 
#### 支持安卓下载(support_android),支持ios下载(support_ios), 
#### 游戏大小(size),游戏最新更新时间(latest_update)

### 输出：基础数据表，用户评价表， 用户话题表 并存储进数据库中
#### 各表包含数据主要字段不同
#### 基础数据表：游戏名称(name), 游戏种类(category),游戏id(id),游戏评分（IOS平台评分ios_score,Android平台评分android_score,总评分total_score),安装人数(download_times),厂商(publisher),关联关键词(related_keyword),评论次数(review),论坛话题次数(topic),支持语言(languages),支持安卓下载(support_android),支持ios下载(support_ios), 
#### 游戏评价表: 游戏id(id),游戏名称(name),游戏类别(category),用户id(user_id),评价时间(review_dt), 评价内容(review_content),使用手机(user_phone)
#### 游戏话题表: 游戏id(id), 游戏名称(name),游戏类别(category),用户id(user_id),话题标题(topic_title),话题发布时间(topic_dt)

### 数据分析：数据清洗 数据可视化

### 基本技术路线：requests-re-BeautifulSoup-numpy-pandas-matplotlib

## 程序结构设计

### 步骤一 从url上获取网页内容  def gethtml()
### 步骤二 提取网页内容中信息到合适的数据结构  def gamedata()
### 步骤三 数据清洗 数据分析 数据可视化

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

'A mobile game data analysis report'

__author__ = 'Troy'

import time
import requests
import re
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime
import sys
import importlib
importlib.reload(sys)

%matplotlib inline

In [85]:
# Browser-like request headers sent with every page fetch.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    # 'br' is deliberately not advertised: Requests can only decode Brotli
    # responses when an optional Brotli package is installed, and an
    # undecoded body would make r.text unusable for parsing.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Host': 'www.taptap.com',
    'User-Agent': 'Mozilla/5.0'
}
# NOTE(review): only the 'http' scheme is mapped here, so the https:// URLs
# used in this notebook presumably bypass this proxy -- confirm that is intended.
proxies = {'http' : 'http://122.193.14.102:80'}
    
def gethtml(url, timeout=10):
    """Fetch *url* after a short random pause and return its body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded as UTF-8, or ``None`` when the request
        fails or the server answers with an error status.
    """
    # Pause 1-9 seconds between requests. The original call passed no bounds
    # to randint(), which raises TypeError before any request is made.
    time.sleep(np.random.randint(1, 10))
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Best effort: callers treat None as "page not available".
        return None
    r.encoding = u'utf-8'
    return r.text

# Errors that mean "this element is missing or not in the expected format".
_FIELD_ERRORS = (AttributeError, IndexError, KeyError, TypeError, ValueError)


def _extract(getter, default=None):
    """Return ``getter()``, or *default* when the page lacks that field."""
    try:
        return getter()
    except _FIELD_ERRORS:
        return default


def getbasedata(html):
    """Parse one game detail page into a flat dict of base fields.

    Args:
        html: Page source as returned by ``gethtml``; may be ``None`` or empty.

    Returns:
        ``None`` when *html* is falsy. Otherwise a dict with the keys ``id``,
        ``name``, ``category``, ``total_score``, ``android_score``,
        ``ios_score``, ``download_times``, ``concerned_number``,
        ``publisher``, ``review``, ``topic``, ``related_keyword``,
        ``language``, ``support_android``, ``support_ios`` and
        ``introduction``. Optional fields that cannot be read are ``None``,
        except ``download_times`` which falls back to ``'未上架'``.

    Raises:
        IndexError: If the page has no ``<link>``, name heading or breadcrumb;
            these three fields are not guarded.
    """
    if not html:
        return None

    soup = BeautifulSoup(html, 'html.parser')
    han_run = r'[\u4e00-\u9fa5]+'

    # Selections shared by several fields are evaluated once.
    spans = soup.select('div > span')
    counters = soup.select('span.text-download-times')
    tab_counts = soup.select('li > a > small')

    result = {}
    # Key fields: a failure here propagates to the caller.
    result['id'] = soup.select('link')[0]['href'].split('/')[-1]
    result['name'] = soup.select('h1[itemprop="name"]')[0].string
    result['category'] = soup.select('ol.breadcrumb')[0].contents[5].text.strip()

    # Scores.
    result['total_score'] = _extract(
        lambda: float(soup.select('span[itemprop="ratingValue"]')[0].string))
    result['android_score'] = _extract(
        lambda: float(re.sub('Android：', '', spans[5].string)))
    result['ios_score'] = _extract(
        lambda: float(re.sub('iOS：', '', spans[6].string)))

    # The same element is parsed twice, once per suffix; int() rejects the
    # text when the other suffix is still present, so at most one succeeds.
    result['download_times'] = _extract(
        lambda: int(re.sub('人安装', '', counters[0].string)), default='未上架')
    result['concerned_number'] = _extract(
        lambda: int(re.sub('人关注', '', counters[0].string)))

    result['publisher'] = _extract(
        lambda: soup.select('a.info-item-content.link')[0].string)
    result['review'] = _extract(lambda: int(tab_counts[0].string))
    result['topic'] = _extract(lambda: int(tab_counts[1].string))

    # Only runs of Chinese characters are kept, joined with commas.
    result['related_keyword'] = _extract(
        lambda: ','.join(re.findall(han_run, soup.select('ul#appTag')[0].text)))
    result['language'] = _extract(
        lambda: ','.join(re.findall(
            han_run,
            soup.select('ul[class="list-unstyled main-body-additional"]')[0].text)))

    # Download button captions; None when the button is absent.
    result['support_android'] = _extract(
        lambda: soup.select('button.btn.btn-primary.btn-lg.android')[0].text.strip())
    result['support_ios'] = _extract(
        lambda: soup.select('a.btn.btn-primary.ios')[0].text.strip())
    result['introduction'] = _extract(
        lambda: soup.select('div#description')[0].text.strip())
    return result
    
def getreview():
    """Placeholder for the review scraper; not implemented in this cell.

    The original stub had no colon and no body, which made the whole cell a
    syntax error. A working ``getreview`` is defined in a later cell.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError('getreview is not implemented in this cell')

def storingdata(start=0, stop=80000):
    """Crawl game detail pages by numeric id and collect them in a DataFrame.

    Args:
        start: First app id to request (inclusive).
        stop: App id at which to stop (exclusive).

    Returns:
        A ``pandas.DataFrame`` with one row per page that could be downloaded;
        ids whose page is unavailable are skipped.
    """
    data = []
    # URLs are built one at a time instead of materialising the whole list.
    for app_id in range(start, stop):
        html = gethtml('https://www.taptap.com/app/{}'.format(app_id))
        if html:
            data.append(getbasedata(html))
    return pd.DataFrame(data)

if __name__ == '__main__':
    # Imported here because the notebook's import cell does not provide them.
    import sqlite3
    from contextlib import closing

    df = storingdata()
    # closing() releases the connection; the inner `with db` commits on
    # success and rolls back on error (a sqlite3 connection used as a context
    # manager manages the transaction but does not close itself).
    with closing(sqlite3.connect(r'd:\SQL\sqlite\taptap\data.sqlite')) as db:
        with db:
            df.to_sql('basedata', con=db, if_exists='append')

Unnamed: 0,android_score,category,concerned_number,download_times,id,introduction,ios_score,language,name,publisher,related_keyword,review,support_android,support_ios,topic,total_score
0,,模拟,37,未上架,1899,대한민국 원조 해전게임! 전함제국15차 대규모 업데이트로 돌아왔다!! 제국의 귀환!...,,,战舰帝国 : 钢铁舰队,Gamepub,"模拟,动作",1,,下载,,
1,,休闲,43,未上架,1900,★★★ 두근두근 섬마을 라이프 ★★★▶ 내 취향대로 꾸미는 작은 섬- 나만의 섬을 ...,,,海盗岛Kakao,"FLERO GAMES Co.,Ltd.",休闲,2,,,,


In [None]:
# Scratch cell: download the page of app 1 and try each selector by hand.
r = requests.get('https://www.taptap.com/app/1')
r.status_code
r.encoding = r'utf-8'
r.headers
r.request.headers

# Game id: last path segment of the first <link> element's href
soup = BeautifulSoup(r.text, 'html.parser')
link = soup.select('link')[0]['href']
game_id = link.split('/')[-1]
# Game name
soup.select('h1[itemprop="name"]')[0].string
# Game category
soup.select('ol.breadcrumb')[0].contents[5].text.strip()
# Total score
float(soup.select('span[itemprop="ratingValue"]')[0].string)
# Android score
android_score = float(re.sub('Android：', '', soup.select('div > span')[5].string))
android_score
# iOS score
soup.select('div > span')[6].string
ios_score = float(re.sub('iOS：', '', soup.select('div > span')[6].string))
#ios_score
# Download count (either the install or the follower suffix is stripped)
print(int(re.sub('人安装|人关注', '',soup.select('span.text-download-times')[0].string)))
# Publisher
soup.select('span[itemprop="name"]')[0].string
# Review count
int(soup.select('li > a > small')[0].string)
# Topic count
#int(soup.select('li > a > small')[1].string)
# Related keywords
re.findall(r'[\u4e00-\u9fa5]+', soup.select('ul#appTag')[0].text)
# Supported languages
re.findall(r'[\u4e00-\u9fa5]+', soup.select('ul[class="list-unstyled main-body-additional"]')[0].text)
# Android download support
soup.select('div[data-type="android"]')[0].text.strip()
# iOS download support
soup.select('div[data-type="ios"]')[0].text.strip()

# Same selector that getbasedata uses for the publisher field
soup.select('a.info-item-content.link')[0].string

In [None]:
# Supported languages: Chinese-character runs joined into one comma-separated string
','.join(re.findall(r'[\u4e00-\u9fa5]+', soup.select('ul[class="list-unstyled main-body-additional"]')[0].text))

In [None]:
# Related keywords: Chinese-character runs joined into one comma-separated string
','.join(re.findall(r'[\u4e00-\u9fa5]+', soup.select('ul#appTag')[0].text))

In [None]:
# Introduction text
soup.select('div#description')[0].text.strip()

In [None]:
# Random numbers: ten integers drawn from 1 up to (but not including) 10
np.random.randint(low=1,high=10, size=10)

In [2]:
# Reviews: download the first review page of app 1 and parse it
review_url = 'https://www.taptap.com/app/1/review?order=default&page=1#review-list'
r = requests.get(review_url)
r.status_code
r.encoding = r'utf-8'
soup = BeautifulSoup(r.text, 'html.parser')

In [3]:
# App id, read from the download button's data-app-id attribute
soup.select('div.taptap-button-download')[0]['data-app-id']

'1'

In [4]:
# Game name
soup.select('h1[itemprop="name"]')[0].string

'钢琴块2 （别踩白块儿2 ）'

In [5]:
# Category
soup.select('ol.breadcrumb')[0].contents[5].text.strip()

'休闲'

In [21]:
# User ids: the name is the second child of each review header element
header_nodes = soup.select('div.item-text-header')
header_nodes[0].contents[1].text.strip()
for node in header_nodes:
    print(node.contents[1].text.strip())

len(header_nodes)

我要喝芽衣的妹汁
果汁盒
总攻大人
AliceHe_AH
柠檬味的猫º
三倍速蜗牛
一身正气过冬
手机用户16402894
風
田中
北风呼啸
忆
失去梦想的二哈
手机用户18030672
手机用户17890554
埃罗阿芒老师
The Devil＇s tea
萌え萌えしん
昔往今来.
N.W 长生诀。
NICEPLAYER


21

In [29]:
# Review time
# Timestamps are located with regular expressions over the page text
re.findall(r'\d{4}-\d{2}-\d{2}',soup.text)[1]
datetime.strptime(re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',soup.text)[0], '%Y-%m-%d %H:%M:%S')

datetime.datetime(2018, 1, 14, 9, 11, 14)

09:11:14
1900-01-01 09:11:14


In [8]:
# Review content (first review on the page)
soup.select('div.item-text-body')[0].text.strip()

'我非常喜欢钢琴块，自从这个游戏刚刚出世到现在它都一直在我手机里。也有过别人跟我开过玩笑说 每次看你手机都有新老游戏的交替可为什么这个从没见你卸载过?我笑着说 不是不卸只是喜欢。没错，我很喜欢钢琴块这类游戏，也可以说是钢琴块才让我喜欢上音游。钢琴块画面简洁 易懂 音乐优美 旋律动听 真的是一款很好的游戏。'

In [9]:
# Devices the reviews were posted from
soup.select('span.text-footer-device')

[<span class="text-footer-device">OPPO R9s</span>,
 <span class="text-footer-device">三星 Galaxy Note 3</span>,
 <span class="text-footer-device">华为P9</span>,
 <span class="text-footer-device">华为 荣耀4A</span>,
 <span class="text-footer-device">vivo Y51A</span>,
 <span class="text-footer-device">魅蓝Note 5</span>,
 <span class="text-footer-device">Xiaomi Redmi 5 Plus</span>,
 <span class="text-footer-device">Xiaomi MI 5X</span>,
 <span class="text-footer-device">HUAWEI HUAWEI TAG-CL00</span>,
 <span class="text-footer-device">华为P8</span>,
 <span class="text-footer-device">vivo V3Max</span>,
 <span class="text-footer-device">vivo vivo Y66i</span>,
 <span class="text-footer-device">MBI i7 MBI i7</span>,
 <span class="text-footer-device">华为Mate 8</span>,
 <span class="text-footer-device">乐2</span>,
 <span class="text-footer-device">华为 荣耀畅玩4C</span>,
 <span class="text-footer-device">华为 荣耀畅玩5A</span>]

In [10]:
# Page count; some games have only one page, so there is no pagination to read
#re.findall(r'\d*', soup.select('section.taptap-button-more')[0])
int(soup.select('ul.pagination')[-1].contents[-2].text)

129

In [11]:
# Review count
int(soup.select('li > a > small')[0].string)

2564

## 抓取评论数构架
### https://www.taptap.com/app/{}/review?order=default&page=1#review-list  查看页面是否存在 存在进行下一步

In [42]:
# Step 1: gethtml, as before but without the pause between requests.
def gethtml(url, timeout=10):
    """Fetch *url* and return its body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded as UTF-8, or ``None`` when the request
        fails or the server answers with an error status.
    """
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Best effort: callers treat None as "page not available".
        return None
    r.encoding = u'utf-8'
    return r.text
    
# Step 2: check whether each review page exists and scrape it when it does.
def getreview(start=0, stop=800000):
    """Scrape the first review page of the first reachable app id.

    App ids in ``range(start, stop)`` are tried in order; ids whose page
    cannot be downloaded are skipped.

    Args:
        start: First app id to try (inclusive).
        stop: App id at which to stop (exclusive).

    Returns:
        A dict with the keys ``id``, ``name``, ``category`` and ``review``,
        plus ``user_id``, ``datetime`` and ``content`` when the page lists at
        least one review. ``None`` when no page in the range could be
        downloaded.

    Raises:
        IndexError: If a downloaded page lacks the download button, name
            heading or breadcrumb, or lists reviews without any timestamp or
            with fewer bodies than headers.
    """
    # NOTE(review): the default upper bound is ten times the one used by
    # storingdata (80000) -- confirm which is intended.
    url_template = 'https://www.taptap.com/app/{}/review?order=default&page=1#review-list'
    for app_id in range(start, stop):
        text = gethtml(url_template.format(app_id))
        if not text:
            continue

        soup = BeautifulSoup(text, 'html.parser')
        result_1 = {}
        result_1['id'] = soup.select('div.taptap-button-download')[0]['data-app-id']
        result_1['name'] = soup.select('h1[itemprop="name"]')[0].string
        result_1['category'] = soup.select('ol.breadcrumb')[0].contents[5].text.strip()
        try:
            result_1['review'] = int(soup.select('li > a > small')[0].string)  # review count
        except (IndexError, TypeError, ValueError):
            result_1['review'] = 0

        # Selections and the timestamp scan are evaluated once per page.
        header_nodes = soup.select('div.item-text-header')
        body_nodes = soup.select('div.item-text-body')
        timestamps = re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', soup.text)

        # NOTE(review): every pass overwrites the same three keys, so only the
        # last review's user and content survive, and 'datetime' is always the
        # first timestamp found on the page. Kept as-is so the returned shape
        # does not change; collecting one row per review is presumably the goal.
        for i, header in enumerate(header_nodes):
            result_1['user_id'] = header.contents[1].text.strip()
            result_1['datetime'] = datetime.strptime(timestamps[0], '%Y-%m-%d %H:%M:%S')
            result_1['content'] = body_nodes[i].text.strip()

        # NOTE(review): returning here ends the crawl at the first reachable
        # app -- confirm whether later ids should be visited as well.
        return result_1
    return None

# Run the review scraper and print whatever it returned.
data = getreview()
print(data)




# Open question: how should the recursion be written?

None


In [62]:
# Try to log in by e-mail. The credentials are read from the environment so
# that no account details are stored in the notebook; a missing variable
# raises KeyError before any request is sent.
user_url = 'https://www.taptap.com/auth/email/login'
data = {
    'email' : os.environ['TAPTAP_EMAIL'],
    'password' : os.environ['TAPTAP_PASSWORD']
}
r = requests.post(user_url, data=data)
r.status_code
r.text

'<!DOCTYPE html>\n<html>\n    <head>\n    <meta charset="utf-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge">\n    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" />\n\n    <title>TapTap | 发现好游戏</title>\n    <meta name="keywords" content="TapTap,TapTap官方网站,手机游戏社区,正版手游下载,好玩的手游,Google Play游戏排行榜,iOS游戏排行榜,安卓免费游戏,安卓游戏排行榜">\n    <meta name="description" content="TapTap是一个推荐高品质手游的手游分享社区，实时同步全球各大应用市场游戏排行榜，与全球玩家共同交流并发掘高品质手游。每一款推荐游戏，都是由专业的测评团队从全球海量的游戏中精选而出，只为你提供好玩的手机游戏。">\n                <meta name="csrf-token" content="ZZ69AfrT91ilxe8EHRJw9QHtW75oJSP343TfWV9C">\n    <meta name="cookie-domain" content="www.taptap.com">\n                <meta property="og:site_name" content="TapTap"/><meta property="og:type" content="website"/><meta property="fb:app_id" content="170292620032143"/><meta property="title" content="TapTap | 发现好游戏"/><meta property="description" content="TapTap是一个推荐高品质手游的手游分享社区，实时同步全球各大应用市场游戏排行榜，与全球玩家共同交流并发掘高品质手游

In [67]:
# Print the review-list URLs for users 1-2, pages 1-2 of each.
for uid in range(1, 3):
    url = 'https://www.taptap.com/user/{}/reviews'.format(uid)
    for n in range(1, 3):
        m = url+'?page={}'.format(n)
        print(m)

https://www.taptap.com/user/1/reviews?page=1
https://www.taptap.com/user/1/reviews?page=2
https://www.taptap.com/user/2/reviews?page=1
https://www.taptap.com/user/2/reviews?page=2
