Commit
commit d0968bf (1 parent: 5d2e97e)
Showing 52 changed files with 5,810 additions and 2 deletions.
@@ -0,0 +1,105 @@
import json
import sys

import requests
from difflib import SequenceMatcher
from newsplease import NewsPlease

# Python 2: force UTF-8 as the default string encoding.
reload(sys)
sys.setdefaultencoding("utf-8")


def similar(a, b):
    # Similarity helper (defined for later use; never called below).
    return SequenceMatcher(None, a, b).ratio()


# Fetch all events coded between 2018-02-28 and 2018-04-01 from the
# UTD event data API (the query is a MongoDB-style date-range filter).
url = "http://eventdata.utdallas.edu/api/data?api_key=EmNc8Pbp5XEUIuzlIdxqVlP5g6S1KlNe&query={\"date8\":{\"$gt\":\"20180228\", \"$lt\": \"20180401\"}}"
response = requests.get(url)
print response

data = json.loads(response.content)
print "Data Loading Complete. Entry count ", len(data["data"])

# Group events by source document: event ids look like "<doc_id>_<sentence_id>".
document_to_event_map = {}
for event in data['data']:
    doc_id = event["id"].split("_")[0]
    if doc_id not in document_to_event_map:
        document_to_event_map[doc_id] = []
    document_to_event_map[doc_id].append(event)

print len(document_to_event_map)

# Histogram: number of documents keyed by how many events they contain.
count_map = {}
for doc in document_to_event_map:
    if len(document_to_event_map[doc]) not in count_map:
        count_map[len(document_to_event_map[doc])] = 0
    count_map[len(document_to_event_map[doc])] += 1

print count_map

root_code_match = 0
root_code_not_found = 0  # never updated below
event_match = 0
doc_count = 0
output_file = open("events.txt", "w+")  # opened but never written to

similar_count = {}  # reserved for similarity statistics; never populated
with open("output.txt", "w+") as out:
    for doc_id in document_to_event_map:
        # Only consider documents with exactly three extracted events,
        # all of which carry both a source and a target actor.
        if len(document_to_event_map[doc_id]) == 3:
            events = document_to_event_map[doc_id]
            if 'source' not in events[0] or 'target' not in events[0]:
                continue
            if 'source' not in events[1] or 'target' not in events[1]:
                continue
            if 'source' not in events[2] or 'target' not in events[2]:
                continue

            # Fetch the sentence-level text of this document from the API.
            url = "http://eventdata.utdallas.edu/api/article?api_key=EmNc8Pbp5XEUIuzlIdxqVlP5g6S1KlNe&doc_id=" + doc_id
            response = requests.get(url)
            data = json.loads(response.content)
            sentences = data['data']

            # Write each event followed by the sentence it was extracted from.
            for i in range(0, len(events)):
                print >> out, events[i]['code'], events[i]['source'], events[i]['target']
                sent_id = events[i]['id'].split("_")[1]
                j = 0
                while sent_id != str(sentences[j]['sentence_id']):
                    j += 1
                print >> out, sent_id, ": ", sentences[j]["sentence"]

            # Crawl and append the full article via news-please.
            print >> out, "================= FULL ARTICLE ==============="
            article = NewsPlease.from_url(events[0]['url'])
            print >> out, events[0]['url']
            print >> out, article.text

            doc_count += 1

print doc_count
print root_code_match
print event_match
print similar_count
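For reference, the same query can be issued by letting requests URL-encode the parameters instead of escaping the JSON filter by hand. This is a minimal sketch, assuming the endpoint accepts a URL-encoded query parameter; it uses only the endpoint, key, and date filter that appear in the script above:

import json
import requests

# Same endpoint, key, and MongoDB-style date filter as the script above;
# requests handles the URL encoding of the query parameter.
base_url = "http://eventdata.utdallas.edu/api/data"
params = {
    "api_key": "EmNc8Pbp5XEUIuzlIdxqVlP5g6S1KlNe",
    "query": json.dumps({"date8": {"$gt": "20180228", "$lt": "20180401"}}),
}
response = requests.get(base_url, params=params)
events = response.json()["data"]
print(len(events))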
@@ -0,0 +1,42 @@
class NewsArticle(object):
    """
    Class representing a single news article containing all the information that news-please can extract.
    """
    # Class-level defaults for every extractable field.
    authors = []
    date_download = None
    date_modify = None
    date_publish = None
    description = None
    filename = None
    image_url = None
    language = None
    localpath = None
    source_domain = None
    text = None
    title = None
    title_page = None
    title_rss = None
    url = None

    def get_dict(self):
        """
        Get the dict of the instance of this class.
        :return: a dict mapping each field name to its current value
        """
        return {
            'authors': self.authors,
            'date_download': self.date_download,
            'date_modify': self.date_modify,
            'date_publish': self.date_publish,
            'description': self.description,
            'filename': self.filename,
            'image_url': self.image_url,
            'language': self.language,
            'localpath': self.localpath,
            'source_domain': self.source_domain,
            'text': self.text,
            'title': self.title,
            'title_page': self.title_page,
            'title_rss': self.title_rss,
            'url': self.url
        }
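Since the date_* fields may hold datetime objects, which json.dumps cannot serialize natively, a minimal serialization sketch might look as follows (the default=str fallback and the field values are illustrative assumptions, not part of this commit):

import json

article = NewsArticle()
article.title = "Example headline"        # illustrative values
article.url = "http://example.com/story"

# default=str is a simple fallback for any datetime values in the date_* fields.
print(json.dumps(article.get_dict(), default=str, indent=2))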
@@ -0,0 +1,121 @@
import datetime
import os
import sys

from six.moves import urllib  # Python 2/3 compatible urllib

sys.path.append(os.path.dirname(os.path.realpath(__file__)))

from newsplease.pipeline.extractor import article_extractor
from newsplease.crawler.items import NewscrawlerItem
from dotmap import DotMap
from newsplease.pipeline.pipelines import ExtractedInformationStorage
from newsplease.crawler.simple_crawler import SimpleCrawler


class NewsPlease:
    """
    Access news-please functionality via this interface
    """

    @staticmethod
    def from_warc(warc_record):
        """
        Extracts relevant information from a WARC record. This function does not invoke scrapy but only uses the
        article extractor.
        :return:
        """
        html = str(warc_record.raw_stream.read())
        url = warc_record.rec_headers.get_header('WARC-Target-URI')
        download_date = warc_record.rec_headers.get_header('WARC-Date')
        article = NewsPlease.from_html(html, url=url, download_date=download_date)
        return article

    @staticmethod
    def from_html(html, url=None, download_date=None):
        """
        Extracts relevant information from an HTML page given as a string. This function does not invoke scrapy but
        only uses the article extractor. If you have the original URL, make sure to provide it, as this helps
        NewsPlease to extract the publishing date and title.
        :param html:
        :param url:
        :return:
        """
        extractor = article_extractor.Extractor(
            ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor'])

        title_encoded = ''.encode()
        if not url:
            url = ''

        # If a URL was given, we can use that as the filename.
        filename = urllib.parse.quote_plus(url) + '.json'

        # Build a crawler item by hand so the extraction pipeline can run without scrapy.
        item = NewscrawlerItem()
        item['spider_response'] = DotMap()
        item['spider_response'].body = html
        item['url'] = url
        item['source_domain'] = urllib.parse.urlparse(url).hostname.encode() if url != '' else ''.encode()
        item['html_title'] = title_encoded
        item['rss_title'] = title_encoded
        item['local_path'] = None
        item['filename'] = filename
        item['download_date'] = download_date
        item['modified_date'] = None
        item = extractor.extract(item)

        tmp_article = ExtractedInformationStorage.extract_relevant_info(item)
        final_article = ExtractedInformationStorage.convert_to_class(tmp_article)
        return final_article

    @staticmethod
    def from_url(url):
        """
        Crawls the article from the url and extracts relevant information.
        :param url:
        :return: A NewsArticle object containing all the information of the article. Else, None.
        """
        articles = NewsPlease.from_urls([url])
        if url in articles.keys():
            return articles[url]
        else:
            return None

    @staticmethod
    def from_urls(urls):
        """
        Crawls articles from the urls and extracts relevant information.
        :param urls:
        :return: A dict containing given URLs as keys, and extracted information as corresponding values.
        """
        results = {}
        download_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        if len(urls) == 0:
            pass
        elif len(urls) == 1:
            url = urls[0]
            html = SimpleCrawler.fetch_url(url)
            results[url] = NewsPlease.from_html(html, url, download_date)
        else:
            # Fetch all URLs first, then run extraction on each response body.
            results = SimpleCrawler.fetch_urls(urls)
            for url in results:
                results[url] = NewsPlease.from_html(results[url], url, download_date)

        return results

    @staticmethod
    def from_file(path):
        """
        Crawls articles from the URLs listed in a file and extracts relevant information.
        :param path: path to file containing urls (each line contains one URL)
        :return: A dict containing given URLs as keys, and extracted information as corresponding values.
        """
        with open(path) as f:
            content = f.readlines()
        content = [x.strip() for x in content]
        urls = list(filter(None, content))

        return NewsPlease.from_urls(urls)
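Putting the interface together, a minimal end-to-end sketch (the URL is a placeholder; per the docstrings above, from_url returns None when the article could not be fetched or extracted):

from newsplease import NewsPlease

# Placeholder URL; any reachable news article works.
article = NewsPlease.from_url("http://example.com/some-news-story")
if article is not None:
    print(article.title)
    print(article.text)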