Permalink
Browse files

Merge pull request #87 from goodbest/master

抓评论时加入表情功能
  • Loading branch information...
ResolveWang committed Apr 20, 2018
2 parents 61de71e + 1a8c0cb commit 2ebee1a923177d0dd2527c778fea0a32d7b415fc
Showing with 65 additions and 4 deletions.
  1. +1 −1 db/basic.py
  2. +1 −0 db/tables.py
  3. +41 −3 page_parse/comment.py
  4. +1 −0 utils/emoji_ios6.json
  5. +21 −0 utils/parse_emoji.py
@@ -11,7 +11,7 @@
def get_engine():
    """Build the database engine from the project's connection settings.

    Connection parameters are read from ``get_db_args()``. The ``DB_PASS``
    environment variable, when set, takes precedence over the configured
    password.

    :return: the engine object returned by ``create_engine``.
    """
    args = get_db_args()
    password = os.getenv('DB_PASS', args['password'])
    # Connect with utf8mb4 rather than MySQL's 3-byte "utf8" so that 4-byte
    # characters such as emoji can be stored.
    connect_str = "{}+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4".format(
        args['db_type'], args['user'], password,
        args['host'], args['port'], args['db_name'])
    engine = create_engine(connect_str, encoding='utf-8')
    return engine
@@ -85,6 +85,7 @@
Column("id", INTEGER, primary_key=True, autoincrement=True),
Column("comment_id", String(50), unique=True),
Column("comment_cont", Text),
Column("comment_screen_name", Text),
Column("weibo_id", String(200)),
Column("user_id", String(20)),
Column("create_time", String(200)),
@@ -5,7 +5,7 @@
from logger import parser
from db.models import WeiboComment
from decorators import parse_decorator
from utils import parse_emoji
@parse_decorator('')
def get_html_cont(html):
@@ -63,14 +63,52 @@ def get_comment_list(html, wb_id):
if not cont:
return list()
soup = BeautifulSoup(cont, 'html.parser')
soup = BeautifulSoup(cont, 'html5lib')
comment_list = list()
comments = soup.find(attrs={'node-type': 'comment_list'}).find_all(attrs={'class': 'list_li S_line1 clearfix'})
for comment in comments:
wb_comment = WeiboComment()
try:
wb_comment.comment_cont = comment.find(attrs={'class': 'WB_text'}).text.strip()
cont = []
first_author=True
first_colon=True
for content in comment.find(attrs={'class': 'WB_text'}).contents:
if not content:
continue
if content.name =='a':
if first_author:
first_author=False
continue
else:
if content.text:
cont.append(content.text)
elif content.name=='img':
img_title = content.get('title', '')
if img_title=='':
img_title = content.get('alt', '')
if img_title=='':
img_src = content.get('src','')
img_src = img_src.split('/')[-1].split('.',1)[0]
try:
img_title = parse_emoji.softband_to_utf8(img_src)
except Exception as e:
parser.error('解析表情失败,具体信息是{},{}'.format(e, comment))
img_title = ''
cont.append(img_title)
else:
if first_colon:
if content.find('')==0:
cont.append(content.replace('','',1))
first_colon=False
else:
cont.append(content)
wb_comment.comment_cont = ''.join(cont)
wb_comment.comment_screen_name =comment.find(attrs={'class': 'WB_text'}).find('a').text
wb_comment.comment_id = comment['comment_id']
# TODO 将wb_comment.user_id加入待爬队列(seed_ids)
wb_comment.user_id = comment.find(attrs={'class': 'WB_text'}).find('a').get('usercard')[3:]

Large diffs are not rendered by default.

Oops, something went wrong.
@@ -0,0 +1,21 @@
import json
def load_emoji_map(fn='utils/emoji_ios6.json'):
    """Load the emoji lookup table from a JSON file.

    The file must hold a JSON array of objects that each carry an ``sb`` key
    (the emoji code; presumably a SoftBank code, to be confirmed against the
    data file) and a ``utf8`` key.

    :param fn: path of the JSON file to read; it is opened as UTF-8 text.
    :return: dict mapping each lower-cased ``sb`` value to its ``utf8`` value.
        When the same ``sb`` value occurs more than once, the last entry wins.
    """
    # A context manager guarantees the file handle is closed as soon as the
    # JSON has been parsed, instead of leaking it until garbage collection.
    with open(fn, encoding='utf-8') as emoji_file:
        json_data = json.load(emoji_file)
    return {m['sb'].lower(): m['utf8'] for m in json_data}
def softband_to_utf8(emoji):
    """Convert an emoji code to its character using the ``sb_dict`` table.

    The code is lower-cased before the lookup, so matching is
    case-insensitive.

    :param emoji: emoji code string to look up.
    :return: the decoded character(s), or ``''`` when the code is not in the
        table or maps to an empty value.
    """
    key = emoji.lower()
    utf8_hex = sb_dict.get(key, '')
    if not utf8_hex:
        return ''
    # Table values are hex strings of UTF-8 bytes; turn them back into text.
    return bytes.fromhex(utf8_hex).decode('utf-8')
# Lookup table built once when the module is imported, from the default JSON
# path; softband_to_utf8 reads it on every call.
sb_dict = load_emoji_map()

0 comments on commit 2ebee1a

Please sign in to comment.