-
Notifications
You must be signed in to change notification settings - Fork 232
/
pipelines.py
84 lines (77 loc) · 3.41 KB
/
pipelines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys

# Python 2 only: make UTF-8 the interpreter-wide default codec so implicit
# str <-> unicode conversions of non-ASCII text do not raise.
# ``sys.setdefaultencoding`` is removed from ``sys`` during interpreter
# start-up, so the module has to be reloaded to get it back.
# Python 3 has no ``reload`` builtin and already defaults to UTF-8, so the
# hack is skipped there instead of failing with NameError at import time.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf-8')
import pymysql
from pymysql import connections
from baidu_baike import settings
class BaiduBaikePipeline(object):
    """Scrapy item pipeline that stores scraped lemma items in MySQL.

    A single connection and cursor are opened when the pipeline is
    constructed and reused for every item.  Each item becomes one row of the
    ``lemmas`` table; its ``title_id`` comes from an in-process counter that
    is seeded from ``MAX(title_id)`` at start-up, not from the item.
    """

    # Item fields stored as text between ``title_id`` and ``all_text``,
    # listed in the column order of the INSERT statement below.
    _MIDDLE_FIELDS = (
        'abstract', 'infobox', 'subject', 'disambi', 'redirect', 'curLink',
        'interPic', 'interLink', 'exterLink', 'relateLemma',
    )

    _INSERT_SQL = (
        "INSERT INTO lemmas(title, title_id, abstract, infobox, subject, "
        "disambi, redirect, curLink, interPic, interLink, exterLink, "
        "relateLemma, all_text ) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )

    def __init__(self):
        """Connect to MySQL and seed the ``title_id`` counter.

        Connection parameters are read from ``settings`` (``HOST_IP``,
        ``USER``, ``PASSWD``, ``DB_NAME``).
        """
        self.conn = pymysql.connect(
            host=settings.HOST_IP,
            # port=settings.PORT,
            user=settings.USER,
            passwd=settings.PASSWD,
            db=settings.DB_NAME,
            charset='utf8mb4',
            use_unicode=True
        )
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT MAX(title_id) FROM lemmas")
        row = self.cursor.fetchone()
        max_id = row[0] if row else None
        # MAX() yields NULL on an empty table.
        # NOTE(review): the counter then starts at 1 and process_item
        # increments it before use, so the first stored title_id is 2.
        # Confirm whether ids are meant to start at 1.
        self.count = 1 if max_id is None else max_id

    @staticmethod
    def _to_text(value):
        """Return *value* as a text (unicode) string.

        Text is returned unchanged, byte strings are decoded as UTF-8 and
        any other object is converted with ``str()`` first.  This does not
        rely on the interpreter's default encoding for text input and works
        on both Python 2 and Python 3.
        """
        if isinstance(value, type(u'')):
            return value
        if not isinstance(value, bytes):
            value = str(value)
        # On Python 2 ``str(value)`` is a byte string and still needs decoding.
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def _insert(self, params):
        """Execute the INSERT statement with *params* and commit it."""
        self.cursor.execute(self._INSERT_SQL, params)
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert *item* into the ``lemmas`` table and return it.

        If the insert fails, the error is printed and the insert is retried
        once with ``all_text`` replaced by the literal ``None`` string.  A
        failure of the retry is printed as well; database errors are never
        raised to the caller.

        Args:
            item: mapping holding ``title``, ``all_text`` and every field in
                ``_MIDDLE_FIELDS``.  Any ``title_id`` on the item is ignored.
            spider: unused.

        Returns:
            The unmodified *item*.
        """
        to_text = self._to_text
        title = to_text(item['title'])
        middle = [to_text(item[name]) for name in self._MIDDLE_FIELDS]
        cur_link = middle[self._MIDDLE_FIELDS.index('curLink')]
        # all_text is handed to the driver as UTF-8 encoded bytes.
        all_text = to_text(item['all_text']).encode('utf-8')

        # The stored id is the running counter, not the item's own title_id.
        self.count += 1
        row = [title, self.count] + middle + [all_text]

        try:
            self._insert(tuple(row))
        except Exception as e:
            print("#"*20, "\nAn error when insert into mysql!!\n")
            print("curLink: ", cur_link, "\n")
            print(e, "\n", "#"*20)
            # Retry with a placeholder in place of the full page text.
            row[-1] = u'None'.encode('utf-8')
            try:
                self._insert(tuple(row))
            except Exception as f:
                print("Error without all_text!!!")
                print(f)
        return item

    def close_spider(self, spider):
        """Close the cursor and the database connection."""
        try:
            self.cursor.close()
        finally:
            # Release the connection even if closing the cursor fails.
            self.conn.close()