diff --git a/db/models.py b/db/models.py index d1e225cf..61b5e70e 100644 --- a/db/models.py +++ b/db/models.py @@ -58,7 +58,7 @@ def __init__(self, uid, other_id, type): self.type = type -class WeiboDialoggue(Base): +class WeiboDialogue(Base): __table__ = weibo_dialogue def __repr__(self): diff --git a/db/tables.py b/db/tables.py index b3b46295..fe327955 100644 --- a/db/tables.py +++ b/db/tables.py @@ -116,6 +116,7 @@ Column("dialogue_id", String(50), unique=True), Column("weibo_id", String(200)), Column("dialogue_cont", Text), + Column("dialogue_rounds", INTEGER), ) __all__ = ['login_info', 'wbuser', 'seed_ids', 'keywords', 'weibo_data', 'keywords_wbdata', 'weibo_comment', diff --git a/page_get/__init__.py b/page_get/__init__.py index 638909b6..108a9b20 100755 --- a/page_get/__init__.py +++ b/page_get/__init__.py @@ -1,4 +1,4 @@ from .basic import get_page from .status import get_cont_of_weibo from .user import ( - get_profile, get_fans_or_followers_ids) \ No newline at end of file + get_profile, get_fans_or_followers_ids, get_user_profile) diff --git a/page_get/user.py b/page_get/user.py index e8093799..0ceec634 100755 --- a/page_get/user.py +++ b/page_get/user.py @@ -7,7 +7,6 @@ from page_parse.user import ( enterprise, person, public) - BASE_URL = 'http://weibo.com/p/{}{}/info?mod=pedit_more' @@ -102,6 +101,20 @@ def get_profile(user_id): return user, is_crawled +def get_user_profile(user_id): + """ + :param user_id: uid + :return: user info + """ + user = UserOper.get_user_by_uid(user_id) + + if user: + storage.info('user {id} has already crawled'.format(id=user_id)) + else: + user = get_url_from_web(user_id) + return user + + def get_fans_or_followers_ids(user_id, crawl_type): """ Get followers or fans @@ -133,4 +146,3 @@ def get_fans_or_followers_ids(user_id, crawl_type): cur_page += 1 return user_ids - diff --git a/page_parse/dialogue.py b/page_parse/dialogue.py index fc8ce93f..79d9068a 100644 --- a/page_parse/dialogue.py +++
b/page_parse/dialogue.py @@ -3,7 +3,7 @@ from bs4 import BeautifulSoup from logger import parser -from db.models import WeiboDialoggue +from db.models import WeiboDialogue from decorators import parse_decorator from .comment import get_html_cont @@ -44,20 +44,21 @@ def get_dialogue(html, wb_id, cid): """ cont = get_html_cont(html) soup = BeautifulSoup(cont, 'lxml') - - # print(soup.prettify()) dialogue_list = [] dialogues = soup.find_all(attrs={'class': 'WB_text'}) if len(dialogues) < 2: - return None - weibo_dialogue = WeiboDialoggue() + return None, None + weibo_dialogue = WeiboDialogue() + uids = [] try: for dialogue in dialogues: - # print(dialogue.text.strip()) - dialogue_list.append(dialogue.text.strip()) + user_id = dialogue.find('a').get('usercard')[3:] + uids.append(user_id) + dialogue_list.append({'uid': user_id, 'text': dialogue.text.strip()}) weibo_dialogue.weibo_id = wb_id weibo_dialogue.dialogue_id = cid weibo_dialogue.dialogue_cont = json.dumps(dialogue_list) + weibo_dialogue.dialogue_rounds = len(dialogues) except Exception as e: parser.error('解析对话失败,具体信息是{}'.format(e)) - return weibo_dialogue + return weibo_dialogue, uids diff --git a/tasks/comment.py b/tasks/comment.py index 77204376..d4567ebe 100644 --- a/tasks/comment.py +++ b/tasks/comment.py @@ -6,7 +6,7 @@ WbDataOper, CommentOper) -BASE_URL = 'http://weibo.com/aj/v6/comment/big?ajwvr=6&id={}&&page={}' +BASE_URL = 'http://weibo.com/aj/v6/comment/big?ajwvr=6&id={}&page={}' @app.task(ignore_result=True) diff --git a/tasks/dialogue.py b/tasks/dialogue.py index 0437e17f..2406039b 100644 --- a/tasks/dialogue.py +++ b/tasks/dialogue.py @@ -5,11 +5,8 @@ from db.dao import (WbDataOper, CommonOper) import time -# from .comment import crawl_comment_by_page - -# unk:type is_more AJAX_URL = 'https://weibo.com/aj/v6/comment/conversation?ajwvr=6&cid={}&type=small&ouid=&cuid=&is_more=1&__rnd={}' -COMMENT_URL = 'http://weibo.com/aj/v6/comment/big?ajwvr=6&id={}&&page={}' +COMMENT_URL = 
'http://weibo.com/aj/v6/comment/big?ajwvr=6&id={}&page={}' @app.task(ignore_result=True) @@ -19,9 +16,17 @@ def crawl_dialogue_by_comment_id(cid, mid): dialogue_url = AJAX_URL.format(cid, cur_time) html = get_page(dialogue_url, auth_level=2, is_ajax=True) - dialogue_data = dialogue.get_dialogue(html, mid, cid) + dialogue_data, uids = dialogue.get_dialogue(html, mid, cid) + if dialogue_data: + CommonOper.add_one(dialogue_data) - CommonOper.add_one(dialogue_data) + if uids: + for uid in uids: + # crawl_person_infos_not_in_seed_ids(uid) + app.send_task('tasks.user.crawl_person_infos_not_in_seed_ids', + args=(uid,), + queue='user_crawler', + routing_key='for_user_info') @app.task(ignore_result=True) @@ -49,7 +54,9 @@ def crawl_dialogue(mid): for page_num in range(2, limit): # crawl_dialogue_by_comment_page(mid, page_num) - app.send_task('tasks.comment.crawl_dialogue_by_comment_page', args=(mid, page_num), queue='comment_page_crawler', + app.send_task('tasks.comment.crawl_dialogue_by_comment_page', + args=(mid, page_num), + queue='comment_page_crawler', routing_key='comment_page_info') @@ -58,5 +65,7 @@ def execute_dialogue_task(): weibo_datas = WbDataOper.get_weibo_dialogue_not_crawled() for weibo_data in weibo_datas: # crawl_dialogue(weibo_data.weibo_id) - app.send_task('tasks.dialogue.crawl_dialogue', args=(weibo_data.weibo_id,), queue='dialogue_crawler', + app.send_task('tasks.dialogue.crawl_dialogue', + args=(weibo_data.weibo_id,), + queue='dialogue_crawler', routing_key='dialogue_info') diff --git a/tasks/user.py b/tasks/user.py index 0f55e9cc..2f6680ab 100644 --- a/tasks/user.py +++ b/tasks/user.py @@ -2,7 +2,7 @@ from db.dao import SeedidsOper from page_get import ( get_fans_or_followers_ids, - get_profile + get_profile, get_user_profile ) @@ -43,6 +43,17 @@ def crawl_person_infos(uid): routing_key='for_fans_followers') +@app.task(ignore_result=True) +def crawl_person_infos_not_in_seed_ids(uid): + """ + Crawl user info not in seed_ids + """ + if not uid: + 
return + + get_user_profile(uid) + + @app.task(ignore_result=True) def execute_user_task(): seeds = SeedidsOper.get_seed_ids() @@ -50,4 +61,3 @@ def execute_user_task(): for seed in seeds: app.send_task('tasks.user.crawl_person_infos', args=(seed.uid,), queue='user_crawler', routing_key='for_user_info') -