##*********************HEADER*********************##
##Developer : Justin Suelflow
##Date : 4/11/2016
##Program Name : pyScrape_CBCNews
##Description : Loop through the last 2 days' worth of CBC News articles on 10 pages of Google News to pull proper nouns
##Python Version: 2.7.11
##Prereqs Knowledge: Python, HTML, CSS, XPath
##Prereqs Hardware:
##Prereqs Software: Python, pip, Python-Dev
## Unix install command "sudo apt-get install"
##Python Libraries: LXML, requests, csv, re, datetime, numpy, os, nltk (numpy is prereq for nltk)
## Unix install python lib command: "sudo pip install"
##Needed Python file: pyTimer.py
## pyTimer.py file is found at https://github.com/Test-BMOHB/Media-Monitoring/blob/master/pyTimer.py
##Log file saved at: /Logs/pylog_CBCNews.txt
##CSV file saved at: /Scrapes/mmddyyyy_CBCNews_Scrape.csv
##Run command: sudo python pyScrape_CBCNews.py
##Static variables: '/Logs/pylog_CBCNews.txt'
## header row in CSV, mainURL, mainXPath, paraXPath
##-----------------------------------------------------------------------------
## Version | mm/dd/yyyy | User            | Changes
##  1        04/11/2016   Justin Suelflow   Initial version to grab names from the current day
##  1.1      04/11/2016   Justin Suelflow   Changed log file path
##  1.2      04/12/2016   Justin Suelflow   Changed from 7 days to current day minus 1 day because runtime is long
##                                          Changed to only look at the CBC News website
##  1.3      04/12/2016   Justin Suelflow   Added code to break from page incrementing
##  2        04/13/2016   Justin Suelflow   Bug fixes (pager not requesting new Google News pages, break when no links found)
##                                          Added function to gather and deduplicate links before getting HTML content
##  2.1      04/18/2016   Justin Suelflow   mainXPath change and log writing changes
##  2.2      04/19/2016   Justin Suelflow   Set search string to pull 100 articles at once instead of the default (10)
##                                          Call scrapeInfo function in between Google News page calls to point to other websites
##-----------------------------------------------------------------------------
##*********************END HEADER*********************##
##*********************IMPORT*********************##
## Import needed python libraries
## Libraries must be installed using 'pip install'
## pyTimer is not installed using pip; the standalone file needs to be placed in the same directory as this code file
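## Example install of the pip libraries (illustrative; assumes pip is available on the system):
##   sudo pip install lxml requests nltk numpy
## (csv, re, datetime, time and os ship with Python and need no install)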
from lxml import html
from lxml.etree import tostring
from datetime import datetime, timedelta
import requests, csv, re, time, nltk, numpy, pyTimer, os.path
##*********************END IMPORT*********************##
##*********************FUNCTIONS*********************##
## Function : removeDuplicates
## Description : Remove exact duplicate list entries
## Parameters : dedup = list type
## Returns : list
def removeDuplicates(dedup):
    finalList = []
    for x in dedup:
        if x not in finalList:
            finalList.append(x)
    return finalList
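## A minimal usage sketch (illustrative values): unlike list(set(dedup)),
## this preserves first-seen order, which keeps the CSV rows stable:
##   removeDuplicates(['a', 'b', 'a', 'c'])  ## -> ['a', 'b', 'c']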
## Function : writeToLog
## Description : Write text to log
## Parameters : text = string type
## Returns :
def writeToLog(text):
    ## Open the log file and append to the end of the log
    ## If no log file exists in the directory, this will automatically create it
    logFile = open('/Logs/pylog_CBCNews.txt','a')
    logFile.write(text)
    ## Close log file
    logFile.close()
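## Usage sketch: writeToLog('Scrape started\n') appends the text to
## /Logs/pylog_CBCNews.txt; callers supply their own trailing newline.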
## Function : createCSV
## Description : Writes list to a CSV file
## Parameters : liCSV = list type, f1 = file type
## Returns :
def createCSV(liCSV, f1):
    writeToLog("Writing to CSV\n")
    ## Use the comma as a delimiter
    writer = csv.writer(f1, delimiter=',', quoting=csv.QUOTE_NONE, escapechar=' ')
    ## Add a header row to the CSV
    writer.writerow(["Name","Link"])
    ## Loop through all elements in the list
    for i in liCSV:
        rowStr = ''
        ## Each element is itself a list, so loop through its items as well
        for e in i:
            rowStr = rowStr + str(e)
            rowStr = rowStr + ','
        ## Take the last comma off of rowStr to finish the row
        rowStr = rowStr[:-1]
        ## Write the row to the CSV file
        writer.writerow([rowStr])
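## Illustrative call (hypothetical values): each inner list becomes one
## comma-joined row under the "Name,Link" header:
##   createCSV([['John Smith', 'http://www.cbc.ca/news/example']], f1)
## Note that QUOTE_NONE with a space escapechar means embedded delimiters
## are escaped with a space rather than quoted.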
## Function : gatherLinks
## Description : Scrapes links from the mainContent
## Parameters : mainContent = string type, mainXPath = string type
## Returns : list
def gatherLinks(mainContent, mainXPath):
    liLinks = []
    currDate = datetime.now()
    ## Make currDate minus 1 day
    currDate = currDate - timedelta(days=1)
    currDate = currDate.strftime('%b %d, %Y')
    currDate = time.strptime(currDate, "%b %d, %Y")
    mainLinksXPath = mainContent.xpath(mainXPath)
    ## Loop through elements in mainLinksXPath
    for mainLinksElements in mainLinksXPath:
        links = mainLinksElements.xpath('.//a[@class="l _HId"]')
        for link in links:
            ## Get the href parameter from the anchor tags
            link = link.get('href')
            d = ''
            dates = mainLinksElements.xpath('.//span[@class="f nsa _uQb"]')
            for date in dates:
                date = date.text
                try:
                    ## Make the date string into a time object
                    d = time.strptime(date, "%b %d, %Y")
                ## If the date string is not in an accepted format, look for 'ago' in the string. Example date string: '30 minutes ago'
                except:
                    if 'ago' in date:
                        ## Make the date string equal today's date
                        date = datetime.now()
                        date = date.strftime('%b %d, %Y')
                        d = time.strptime(date, "%b %d, %Y")
            ## If the date for the link is within the timeframe, add link to the liLinks list
            if currDate <= d:
                liLinks.append(link)
            else:
                break
    liLinks = list(set(liLinks))
    return liLinks
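## A sketch of the date filter above (illustrative values):
##   currDate = time.strptime('Apr 12, 2016', '%b %d, %Y')  ## yesterday
##   d = time.strptime('Apr 13, 2016', '%b %d, %Y')         ## article date
##   currDate <= d  ## True, so the link is kept; older dates break the loop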
## Function : scrapeInfo
## Description : Scrapes HTML content from all articles in list
## Parameters : liLinks = list type, paraXPath = string type
## Returns : list
def scrapeInfo(liLinks, paraXPath):
    li = []
    for link in liLinks:
        try:
            ## Do an HTTP request on the article link
            linkRequest = requests.get(link)
            ## Translate the content from the request to HTML
            linkContent = html.fromstring(linkRequest.content)
            ## Find the paraXPath in the article
            linkXPath = linkContent.xpath(paraXPath)
            pageContent = ''
            ## Loop through elements in linkXPath
            for linkXElement in linkXPath:
                text = tostring(linkXElement)
                ## Delete all numeric character entities (icons and small emojis) from the HTML text
                icons = re.findall(r'&#\d*;', text)
                icons = list(set(icons))
                for icon in icons:
                    text = re.sub(icon, '', text)
                ## Delete all HTML tags from the HTML text
                tags = re.findall('<[^>]+>', text)
                tags = list(set(tags))
                for tag in tags:
                    text = text.replace(tag, '')
                htmlSyntax = ['\t','\n']
                for hs in htmlSyntax:
                    text = text.replace(hs, '')
                pageContent = pageContent + text
            ## Add the HTML content and the article link to a list
            li.append([pageContent,link])
        except:
            writeToLog("Link: " + link + " was not scraped due to an error.\n")
    return li
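## Illustration of the cleanup regexes (hypothetical HTML snippet):
##   text = '<p>CBC&#8217;s report</p>'
##   re.findall(r'&#\d*;', text)   ## -> ['&#8217;']  numeric entities to drop
##   re.findall('<[^>]+>', text)   ## -> ['<p>', '</p>']  tags to strip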
## Function : extractNames
## Description : Extracts names from html in list
## Parameters : li = list type
## Returns : list
def extractNames(li):
    finList = []
    ## Loop through the list that has the HTML page content
    for a in li:
        ## Tokenize the HTML text into sentences
        for sent in nltk.sent_tokenize(str(a)):
            smLi = []
            ## Tokenize each sentence into individual words and then add a Part-of-Speech (POS) tag
            for index, chunk in enumerate(nltk.pos_tag(nltk.word_tokenize(sent))):
                ## If the POS tag is NNP (proper noun)
                if 'NNP' in chunk[1]:
                    ## If each character in the word is alphanumeric and there are more than 2 characters in the word
                    if(len(' '.join(e for e in chunk[0] if e.isalnum())) > 2):
                        ## Append the list with the index of the word, the chunk that has the POS tag and the link
                        smLi.append([index, chunk, a[1]])
            finList.append(smLi)
    nameLi = []
    for f in finList:
        if len(f) > 0:
            strName = ''
            for index, i in enumerate(f):
                ## If strName is blank, declare it with the current word in the list
                if strName == '':
                    strName = i[1][0]
                ## If index+1 is not at the end of the list, continue
                if (index + 1) < len(f):
                    ## If the next index is consecutive, add its word to strName
                    if i[0] + 1 == f[index + 1][0]:
                        strName = strName + ' ' + f[index + 1][1][0]
                    ## If the index is not consecutive, append strName (with the article link) to nameLi and reset strName
                    else:
                        if ' ' in strName:
                            nameLi.append([strName, i[2]])
                        strName = ''
    return nameLi
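## Example of the tagging this relies on (illustrative; exact tags depend on the NLTK model):
##   nltk.pos_tag(nltk.word_tokenize('John Smith was arrested in Toronto'))
##   ## -> [('John', 'NNP'), ('Smith', 'NNP'), ('was', 'VBD'), ...]
## Consecutive NNP indices (0 and 1 here) are joined into 'John Smith'.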
##*********************MAIN FUNCTION*********************##
## Function : main
## Description : Opens the output file, makes HTTP requests to mainURL and calls the other functions
## Parameters : mainURL = string type, mainXPath = string type, paraXPath = string type, fileName = string type, queryLi = list type
## Returns :
def main(mainURL, mainXPath, paraXPath, fileName, queryLi):
    ## Automatically creates the file if it does not exist
    with open(fileName,'w') as scrapeFile:
        nameLi = []
        liLinks = []
        htmlLi = []
        ## Set a header variable so the HTTP request looks like it comes from a web browser
        header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
        ## Loop through the search queries
        for q in queryLi:
            increment = 0
            ## Loop through up to 11 pages (increments 0 through 10) of Google News results
            while increment < 11:
                url = mainURL + q
                if increment == 0:
                    ## HTTP request the mainURL with a header variable
                    mainRequest = requests.get(url, headers=header)
                else:
                    url = url + "&start=" + str(increment * 100)
                    mainRequest = requests.get(url, headers=header)
                if mainRequest.status_code != requests.codes.ok:
                    break
                else:
                    ## Translate mainRequest content into HTML
                    mainContent = html.fromstring(mainRequest.content)
                    writeToLog("Gathering links from URL: " + url + "\n")
                    liIncrementLinks = gatherLinks(mainContent, mainXPath)
                    ## Break the pager loop if no links were found. This suggests either that the articles' dates are no longer current or that the search string produced no articles
                    if len(liIncrementLinks) == 0:
                        break
                    liLinks.extend(liIncrementLinks)
                    ## Sleep for 7 seconds in between Google News pages
                    time.sleep(7)
                    ## Make a throwaway request to a different site in between Google News pages
                    fakeRequest = requests.get("http://www.bing.com")
                    increment = increment + 1
            if mainRequest.status_code != requests.codes.ok:
                writeToLog(url + " has status code of: " + str(mainRequest.status_code) + "\n")
                break
            else:
                htmlLi.extend(scrapeInfo(liLinks, paraXPath))
                ## Sleep for 28 seconds in between search queries
                time.sleep(28)
        ## If the list is empty, don't go through the rest of the process
        if len(htmlLi) > 0:
            writeToLog("Extracting Names\n")
            nameLi.extend(extractNames(htmlLi))
            writeToLog("Removing Duplicates\n")
            nameLi = removeDuplicates(nameLi)
            writeToLog("Creating CSV\n")
            createCSV(nameLi, scrapeFile)
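## A sketch of the request URLs built above (illustrative query 'fraud'):
##   page 1:  mainURL + 'fraud'
##   page 2+: mainURL + 'fraud' + '&start=100', then '&start=200', ...
## With num=100 in mainURL, each page asks Google News for 100 results.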
##*********************END MAIN FUNCTION*********************##
##*********************END FUNCTIONS*********************##
##*********************PROGRAM*********************##
## This if statement lets the program run standalone
## It is not needed if another program imports and calls the functions above
if __name__ == "__main__":
    ## Create start time
    startTime = pyTimer.startTimer()
    ## Try to download the NLTK packages; default to False so a failed
    ## download is caught by the check below instead of raising a NameError
    punktDL = False
    aptDL = False
    try:
        punktDL = nltk.download('punkt')
        aptDL = nltk.download('averaged_perceptron_tagger')
    except:
        writeToLog('NLTK punkt and averaged_perceptron_tagger need to be installed\n')
    currDate = datetime.now()
    fileDate = currDate.strftime('%m%d%Y')
    writeToLog('*****************************' + fileDate + '*****************************\n')
    fileName = '/Scrapes/' + fileDate + '_CBCNews_Scrape.csv'
    ## Declare the list of search queries
    queryLi = ['money+laundering', 'lawsuit', 'scandal', 'fraud', 'illegal', 'criminal', 'crime', 'conviction', 'guilt', 'arrest', 'testify', 'corrupt', 'accused', 'kickback', 'investigate', 'investigation', 'bribe', 'grow-op', 'unethical', 'ponzi', 'terrorist', 'terrorism']
    mainURL = 'https://www.google.com/search?source=lnms&tbm=nws&tbs=qdr:w,sbd:1&num=100&q=site:www.cbc.ca/news+'
    mainXPath = '//*[@class="_cnc"]'
    paraXPath = '//p'
    ## If the NLTK packages downloaded successfully, run the main program
    if punktDL and aptDL:
        main(mainURL, mainXPath, paraXPath, fileName, queryLi)
    else:
        writeToLog('NLTK punkt and averaged_perceptron_tagger need to be downloaded first.\n')
        writeToLog('Please run python as root and call nltk.download("punkt") and nltk.download("averaged_perceptron_tagger")\n')
    ## Find the total time in seconds of the program run
    pName = os.path.basename(__file__)
    endTime = pyTimer.endTimer(startTime, pName)
    writeToLog("Program took " + endTime + " to complete.\n")
##*********************END PROGRAM*********************##