In [70]:
import re
import time
import datetime
import pymongo
from tqdm import tqdm
import copy
from loguru import logger

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

from selenium.common.exceptions import NoSuchElementException,StaleElementReferenceException


class Tweet:

    def __init__(self, query, uid, ptime, pcontent, padditional, nb_reply, nb_retweet, nb_favorite):
        self.query = query
        self.uid = uid
        self.ptime = ptime
        self.pcontent = pcontent
        self.padditional = padditional  # 转发推文，文章链接，图片，视频
        self.nb_retweet = nb_retweet  # nbr of retweet
        self.nb_favorite = nb_favorite  # nbr of favorite
        self.nb_reply = nb_reply    # nbr of reply

    def __repr__(self):
        return "Tweet={}\nQuery={}".format(self.pcontent, self.query)


class User:

    def __init__(self, profile_url):
        self.profile_url = profile_url
        self.ID = profile_url.split('/')[-1]
        self.name = ''
        self.avatar = ''

    def __repr__(self):
        return "User {}".format(self.ID)


def convert_time(x):
    '''
    for x in ['20分钟','1小时','1天', '10月10日','2018年10月1日']:
        print(convert_time(x))
    '''
    now = datetime.datetime.now()
    pattern = r'\d{4}年\d+月\d+日'
    if re.match(pattern, x):
        return x
    pattern = r'\d+月\d+日'
    if re.match(pattern, x):
        return "{}年".format(now.year)+x
    return "{}年{}月{}日".format(now.year, now.month, now.day)


def is_non_result(browser):
    '''
    判断结果是否为空
    '''
    result_div_xpath = "//div[@id='react-root']"
    wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
    try:
        result_div = browser.find_element_by_xpath(result_div_xpath)
        return '没有符合搜索条件的结果' in result_div.text
    except NoSuchElementException as e:
        return False


def get_search_input_v1(browser):
    # 定位搜索框
    search_input_xpath = "//input[@placeholder='搜索 Twitter']"
    wait.until(EC.presence_of_element_located((By.XPATH, search_input_xpath)))
    search_input = browser.find_element_by_xpath(search_input_xpath)
    return search_input


def get_search_input_v2(browser):
    # 请求主站
    browser.get('https://twitter.com/search-home')
    # 定位搜索框
    search_input_id = 'search-home-input'
    wait.until(EC.presence_of_element_located((By.ID, search_input_id)))
    search_input = browser.find_element_by_id(search_input_id)
    return search_input


def extract_reply_retweet_favorite(element):
    t = []
    for x in element.find_elements_by_xpath('./div')[:3]:
        if x.text.strip() == '':
            t.append(0)
        else:
            t.append(int(x.text.strip()))
    return tuple(t)


def parse_result_div(result_div):
    count = 0
    for div in result_div:
        user, tweet = div.find_elements_by_xpath('./div')
        profile_url = user.find_element_by_tag_name(
            'a').get_attribute('href').strip()
        uid = profile_url.split('/')[-1]
#         print('pasing uid={}'.format(uid))
        a, *b_c, d = tweet.find_elements_by_xpath('./div')  # 按照div分为>=3层
        ptime = a.find_elements_by_tag_name('a')[-1].text
        ptime = convert_time(ptime)
        nb_reply, nb_retweet, nb_favorite = 0,0,0
        try:
            nb_reply, nb_retweet, nb_favorite = extract_reply_retweet_favorite(
                d)
        except:
            nb_reply, nb_retweet, nb_favorite = 0, 0, 0
        pcontent = b_c[0].text
        padditional = []
        if len(b_c) > 1:
            for x in b_c[1:]:
                try:
                    a = x.find_element_by_tag_name('a').get_attribute('href')
                    padditional.append(a)
                except NoSuchElementException as e:
                    padditional.append(x.text.strip())
        user = User(profile_url)
        tweet = Tweet(query, uid, ptime, pcontent, padditional,
                      nb_reply, nb_retweet, nb_favorite)
        # save to databse
        if user_table.insert_one(user.__dict__) and tweet_table.insert_one(tweet.__dict__):
            count += 1
    return count


def crawl(browser, query):
    count = 0
    result_div_xpath = '//div[@data-testid="tweet"]'
    wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
    result_div = browser.find_elements_by_xpath(result_div_xpath)
    last_div = result_div[-1]
    # 解析结果
    count += parse_result_div(result_div)
    while count < MAX_SIZE:
        logger.info("{}/{}".format(count,MAX_SIZE))
        result_div_xpath = '//div[@data-testid="tweet"]'
        wait.until(EC.presence_of_element_located((By.XPATH, result_div_xpath)))
        result_div = browser.find_elements_by_xpath(result_div_xpath)
        last_div = result_div[-1]
        try:
            count += parse_result_div(result_div)
        except StaleElementReferenceException as e:
            time.sleep(2)
            continue
        
        # 翻页
        while True:
            browser.execute_script(
                'window.scrollTo(0,document.body.scrollHeight)')
            wait.until(EC.presence_of_element_located(
                (By.XPATH, result_div_xpath)))
            result_div = browser.find_elements_by_xpath(result_div_xpath)
            if result_div[-1] != last_div:
                last_div = result_div[-1]
                break

In [12]:
client = pymongo.MongoClient("mongodb://10.108.17.25:27017/")
twitter_db = client["twitter"]
user_table = twitter_db['user']
tweet_table = twitter_db['tweet']
MAX_SIZE = 50

# 打开浏览器
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 100)

# 人工登录
browser.get('https://twitter.com/')

In [None]:
# time.sleep(60)

In [71]:
bad_query_list = []
query_list = ['the belt and road']
browser.refresh()
time.sleep(2)

for query in tqdm(query_list):
    logger.info('query = {}'.format(query))
    browser.get('https://twitter.com/explore')

    # 定位搜索框
    if browser.current_url == 'https://twitter.com/explore':
        search_input = get_search_input_v1(browser)
    else:
        search_input = get_search_input_v2(browser)
    # 搜索query
    search_input.clear()
    search_input.send_keys(query)
    search_input.send_keys(Keys.ENTER)

    # 获取结果
    if is_non_result(browser):
        bad_query_list.append(query)
        continue
    time.sleep(1)
    crawl(browser, query)






  0%|          | 0/1 [00:00<?, ?it/s][A[A[A[A[A2019-12-15 21:56:07.117 | INFO     | __main__:<module>:7 - query = the belt and road
2019-12-15 21:56:11.655 | INFO     | __main__:crawl:146 - 10/50
2019-12-15 21:56:13.571 | INFO     | __main__:crawl:146 - 20/50
2019-12-15 21:56:15.190 | INFO     | __main__:crawl:146 - 27/50
2019-12-15 21:56:17.691 | INFO     | __main__:crawl:146 - 27/50
2019-12-15 21:56:21.419 | INFO     | __main__:crawl:146 - 48/50
2019-12-15 21:56:23.922 | INFO     | __main__:crawl:146 - 48/50





100%|██████████| 1/1 [00:19<00:00, 19.94s/it][A[A[A[A[A
