Skip to content

Commit

Permalink
Merge pull request #67 from yezuoxian/dialogue
Browse files Browse the repository at this point in the history
将dialogue中出现的用户id加入爬取队列user_crawler中;weibo_dialogue增加dialogue_rounds字段,保存对话轮次。
  • Loading branch information
ResolveWang committed Jan 19, 2018
2 parents bbd6496 + 63c3ba4 commit 5fc365b
Show file tree
Hide file tree
Showing 8 changed files with 56 additions and 23 deletions.
2 changes: 1 addition & 1 deletion db/models.py
Expand Up @@ -58,7 +58,7 @@ def __init__(self, uid, other_id, type):
self.type = type


class WeiboDialoggue(Base):
class WeiboDialogue(Base):
__table__ = weibo_dialogue

def __repr__(self):
Expand Down
1 change: 1 addition & 0 deletions db/tables.py
Expand Up @@ -116,6 +116,7 @@
Column("dialogue_id", String(50), unique=True),
Column("weibo_id", String(200)),
Column("dialogue_cont", Text),
Column("dialogue_rounds", INTEGER),
)

__all__ = ['login_info', 'wbuser', 'seed_ids', 'keywords', 'weibo_data', 'keywords_wbdata', 'weibo_comment',
Expand Down
2 changes: 1 addition & 1 deletion page_get/__init__.py
@@ -1,4 +1,4 @@
from .basic import get_page
from .status import get_cont_of_weibo
from .user import (
get_profile, get_fans_or_followers_ids)
get_profile, get_fans_or_followers_ids, get_user_profile)
16 changes: 14 additions & 2 deletions page_get/user.py
Expand Up @@ -7,7 +7,6 @@
from page_parse.user import (
enterprise, person, public)


BASE_URL = 'http://weibo.com/p/{}{}/info?mod=pedit_more'


Expand Down Expand Up @@ -102,6 +101,20 @@ def get_profile(user_id):
return user, is_crawled


def get_user_profile(user_id):
    """
    Fetch a user's profile, preferring the local database over the web.

    :param user_id: uid of the target user
    :return: the user object — taken from the db when already crawled,
             otherwise freshly fetched via get_url_from_web
    """
    cached = UserOper.get_user_by_uid(user_id)
    if cached:
        # Already stored locally; just note it and reuse the db record.
        storage.info('user {id} has already crawled'.format(id=user_id))
        return cached
    return get_url_from_web(user_id)


def get_fans_or_followers_ids(user_id, crawl_type):
"""
Get followers or fans
Expand Down Expand Up @@ -133,4 +146,3 @@ def get_fans_or_followers_ids(user_id, crawl_type):
cur_page += 1

return user_ids

17 changes: 9 additions & 8 deletions page_parse/dialogue.py
Expand Up @@ -3,7 +3,7 @@
from bs4 import BeautifulSoup

from logger import parser
from db.models import WeiboDialoggue
from db.models import WeiboDialogue
from decorators import parse_decorator
from .comment import get_html_cont

Expand Down Expand Up @@ -44,20 +44,21 @@ def get_dialogue(html, wb_id, cid):
"""
cont = get_html_cont(html)
soup = BeautifulSoup(cont, 'lxml')

# print(soup.prettify())
dialogue_list = []
dialogues = soup.find_all(attrs={'class': 'WB_text'})
if len(dialogues) < 2:
return None
weibo_dialogue = WeiboDialoggue()
return None, None
weibo_dialogue = WeiboDialogue()
uids = []
try:
for dialogue in dialogues:
# print(dialogue.text.strip())
dialogue_list.append(dialogue.text.strip())
user_id = dialogue.find('a').get('usercard')[3:]
uids.append(user_id)
dialogue_list.append({'uid': user_id, 'text': dialogue.text.strip()})
weibo_dialogue.weibo_id = wb_id
weibo_dialogue.dialogue_id = cid
weibo_dialogue.dialogue_cont = json.dumps(dialogue_list)
weibo_dialogue.dialogue_rounds = len(dialogues)
except Exception as e:
parser.error('解析对话失败,具体信息是{}'.format(e))
return weibo_dialogue
return weibo_dialogue, uids
2 changes: 1 addition & 1 deletion tasks/comment.py
Expand Up @@ -6,7 +6,7 @@
WbDataOper, CommentOper)


BASE_URL = 'http://weibo.com/aj/v6/comment/big?ajwvr=6&id={}&&page={}'
BASE_URL = 'http://weibo.com/aj/v6/comment/big?ajwvr=6&id={}&page={}'


@app.task(ignore_result=True)
Expand Down
25 changes: 17 additions & 8 deletions tasks/dialogue.py
Expand Up @@ -5,11 +5,8 @@
from db.dao import (WbDataOper, CommonOper)
import time

# from .comment import crawl_comment_by_page

# unk:type is_more
AJAX_URL = 'https://weibo.com/aj/v6/comment/conversation?ajwvr=6&cid={}&type=small&ouid=&cuid=&is_more=1&__rnd={}'
COMMENT_URL = 'http://weibo.com/aj/v6/comment/big?ajwvr=6&id={}&&page={}'
COMMENT_URL = 'http://weibo.com/aj/v6/comment/big?ajwvr=6&id={}&page={}'


@app.task(ignore_result=True)
Expand All @@ -19,9 +16,17 @@ def crawl_dialogue_by_comment_id(cid, mid):
dialogue_url = AJAX_URL.format(cid, cur_time)

html = get_page(dialogue_url, auth_level=2, is_ajax=True)
dialogue_data = dialogue.get_dialogue(html, mid, cid)
dialogue_data, uids = dialogue.get_dialogue(html, mid, cid)
if dialogue_data:
CommonOper.add_one(dialogue_data)

CommonOper.add_one(dialogue_data)
if uids:
for uid in uids:
# crawl_person_infos_not_in_seed_ids(uid)
app.send_task('tasks.user.crawl_person_infos_not_in_seed_ids',
args=(uid,),
queue='user_crawler',
routing_key='for_user_info')


@app.task(ignore_result=True)
Expand Down Expand Up @@ -49,7 +54,9 @@ def crawl_dialogue(mid):

for page_num in range(2, limit):
# crawl_dialogue_by_comment_page(mid, page_num)
app.send_task('tasks.comment.crawl_dialogue_by_comment_page', args=(mid, page_num), queue='comment_page_crawler',
app.send_task('tasks.comment.crawl_dialogue_by_comment_page',
args=(mid, page_num),
queue='comment_page_crawler',
routing_key='comment_page_info')


Expand All @@ -58,5 +65,7 @@ def execute_dialogue_task():
weibo_datas = WbDataOper.get_weibo_dialogue_not_crawled()
for weibo_data in weibo_datas:
# crawl_dialogue(weibo_data.weibo_id)
app.send_task('tasks.dialogue.crawl_dialogue', args=(weibo_data.weibo_id,), queue='dialogue_crawler',
app.send_task('tasks.dialogue.crawl_dialogue',
args=(weibo_data.weibo_id,),
queue='dialogue_crawler',
routing_key='dialogue_info')
14 changes: 12 additions & 2 deletions tasks/user.py
Expand Up @@ -2,7 +2,7 @@
from db.dao import SeedidsOper
from page_get import (
get_fans_or_followers_ids,
get_profile
get_profile, get_user_profile
)


Expand Down Expand Up @@ -43,11 +43,21 @@ def crawl_person_infos(uid):
routing_key='for_fans_followers')


@app.task(ignore_result=True)
def crawl_person_infos_not_in_seed_ids(uid):
    """
    Celery task: crawl profile info for a user id that is not tracked
    in seed_ids. A falsy uid is silently ignored.
    """
    if uid:
        get_user_profile(uid)


@app.task(ignore_result=True)
def execute_user_task():
    """
    Celery task: dispatch one crawl_person_infos task per pending seed id.

    Seed ids come from the seed_ids table; when none are pending this is
    a no-op.
    """
    # get_seed_ids may yield a falsy result; treat that as "nothing to do".
    for seed in SeedidsOper.get_seed_ids() or []:
        app.send_task('tasks.user.crawl_person_infos',
                      args=(seed.uid,),
                      queue='user_crawler',
                      routing_key='for_user_info')

0 comments on commit 5fc365b

Please sign in to comment.