Changes mongo server IP
Sayeedsalam committed May 14, 2018
1 parent 5d2e97e commit d0968bf
Showing 52 changed files with 5,810 additions and 2 deletions.
19 changes: 18 additions & 1 deletion Analysis.py
@@ -1,6 +1,10 @@
import json
import requests

from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

url = "http://eventdata.utdallas.edu/api/data?api_key=EmNc8Pbp5XEUIuzlIdxqVlP5g6S1KlNe&query={\"date8\":{\"$gt\":\"20180228\", \"$lt\": \"20180401\"}}"

@@ -41,6 +45,9 @@
root_code_not_found = 0
event_match = 0
doc_count = 0
output_file = open("events.txt", "w+")

similar_count = {}

for doc_id in document_to_event_map:
    if len(document_to_event_map[doc_id]) == 2:
@@ -64,7 +71,7 @@

        response = requests.get(url)

-        print response.content
+        #print response.content

        data = json.loads(response.content)
        sentences = data['data']
@@ -80,15 +87,25 @@
        print sent1_id, ":", sent1
        print sent2_id, ":", sent2
        print events[0]['source'], events[0]['target'], events[0]['code']
        val = int(round(10*similar(sent1, sent2)))
        if val not in similar_count:
            similar_count[val] = 0
        similar_count[val] = similar_count[val] + 1

        from newsplease import NewsPlease

        article = NewsPlease.from_url(events[0]['url'])
        print events[0]['url']
        print(article.text)

        doc_count += 1

print doc_count
print root_code_match
print event_match

print similar_count




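The new Analysis.py logic above buckets sentence-pair similarity: difflib.SequenceMatcher ratios are scaled to 0-10 and tallied into the similar_count histogram. A minimal standalone sketch of that idea, using made-up sentences rather than API data:

from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

def bucket_similarities(sentence_pairs):
    # Tally rounded similarity scores (0-10 buckets) into a histogram,
    # mirroring the similar_count logic in Analysis.py.
    similar_count = {}
    for sent1, sent2 in sentence_pairs:
        val = int(round(10 * similar(sent1, sent2)))
        similar_count[val] = similar_count.get(val, 0) + 1
    return similar_count

print(bucket_similarities([("Troops crossed the border on Monday.",
                            "Soldiers crossed the border on Monday.")]))
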
105 changes: 105 additions & 0 deletions Analysis2.py
@@ -0,0 +1,105 @@
import json
import requests
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

url = "http://eventdata.utdallas.edu/api/data?api_key=EmNc8Pbp5XEUIuzlIdxqVlP5g6S1KlNe&query={\"date8\":{\"$gt\":\"20180228\", \"$lt\": \"20180401\"}}"

response = requests.get(url)

print response

data = json.loads(response.content)

#print response.content

print "Data Loading Complete. Entry count ", len(data["data"])

document_to_event_map = {}

for event in data['data']:

    doc_id = event["id"].split("_")[0]
    if doc_id not in document_to_event_map:
        document_to_event_map[doc_id] = []

    document_to_event_map[doc_id].append(event)


print len(document_to_event_map)

count_map = {}

for doc in document_to_event_map:
    if len(document_to_event_map[doc]) not in count_map:
        count_map[len(document_to_event_map[doc])] = 0
    count_map[len(document_to_event_map[doc])] += 1


print count_map

root_code_match = 0
root_code_not_found = 0
event_match = 0
doc_count = 0
output_file = open("events.txt", "w+")

similar_count = {}
with open("output.txt", "w+") as out:
    for doc_id in document_to_event_map:
        if len(document_to_event_map[doc_id]) == 3:
            events = document_to_event_map[doc_id]
            #print events[0]
            if 'source' not in events[0] or 'target' not in events[0]:
                continue

            if 'source' not in events[1] or 'target' not in events[1]:
                continue

            if 'source' not in events[2] or 'target' not in events[2]:
                continue

            url = "http://eventdata.utdallas.edu/api/article?api_key=EmNc8Pbp5XEUIuzlIdxqVlP5g6S1KlNe&doc_id=" + doc_id

            response = requests.get(url)

            data = json.loads(response.content)
            sentences = data['data']

            for i in range(0, len(events)):
                print >> out, events[i]['code'], events[i]['source'], events[i]['target']
                sent_id = events[i]['id'].split("_")[1]
                j = 0
                while sent_id != str(sentences[j]['sentence_id']):
                    j += 1
                print >> out, sent_id, ": ", sentences[j]["sentence"]

            from newsplease import NewsPlease

            print >> out, "================= FULL ARTICLE ==============="
            article = NewsPlease.from_url(events[0]['url'])
            print >> out, events[0]['url']
            print >> out, article.text

            doc_count += 1
out.close()
print doc_count
print root_code_match
print event_match

print similar_count









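The core of Analysis2.py is the grouping step: event ids from the /api/data endpoint have the form "<doc_id>_<sentence_id>", so splitting on "_" regroups events by their source document, and count_map records how many documents produced each event count. A minimal sketch of that step with made-up events:

# Made-up events in the same shape as the API's "data" entries.
events = [
    {"id": "doc1_1", "code": "042"},
    {"id": "doc1_3", "code": "190"},
    {"id": "doc2_2", "code": "010"},
]

document_to_event_map = {}
for event in events:
    doc_id = event["id"].split("_")[0]
    document_to_event_map.setdefault(doc_id, []).append(event)

# Histogram of events per document (the script's count_map).
count_map = {}
for doc_id in document_to_event_map:
    n = len(document_to_event_map[doc_id])
    count_map[n] = count_map.get(n, 0) + 1

print(count_map)  # one document with two events, one with a single event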
2 changes: 1 addition & 1 deletion app_v2.py
@@ -129,7 +129,7 @@ def query_formatter(query):

def __get_mongo_connection():
    # For local debugging
-    MONGO_SERVER_IP = "172.29.100.22"
+    MONGO_SERVER_IP = "172.29.100.16"
    MONGO_PORT = "3154"
    MONGO_USER = "event_reader"
    MONGO_PSWD = "dml2016"
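Only the connection settings are visible in this hunk; the rest of __get_mongo_connection is collapsed. As a sketch only (not the code in app_v2.py), settings like these are typically assembled into a pymongo client as follows; the URI format and the pymongo dependency are assumptions here:

from pymongo import MongoClient

MONGO_SERVER_IP = "172.29.100.16"
MONGO_PORT = "3154"
MONGO_USER = "event_reader"
MONGO_PSWD = "dml2016"

# Build a standard mongodb:// URI from the settings above and connect.
uri = "mongodb://%s:%s@%s:%s/" % (MONGO_USER, MONGO_PSWD, MONGO_SERVER_IP, MONGO_PORT)
client = MongoClient(uri)
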
42 changes: 42 additions & 0 deletions newsplease/NewsArticle.py
@@ -0,0 +1,42 @@
class NewsArticle(object):
    """
    Class representing a single news article containing all the information that news-please can extract.
    """
    authors = []
    date_download = None
    date_modify = None
    date_publish = None
    description = None
    filename = None
    image_url = None
    language = None
    localpath = None
    source_domain = None
    text = None
    title = None
    title_page = None
    title_rss = None
    url = None

    def get_dict(self):
        """
        Get the dict of the instance of this class.
        :return:
        """
        return {
            'authors': self.authors,
            'date_download': self.date_download,
            'date_modify': self.date_modify,
            'date_publish': self.date_publish,
            'description': self.description,
            'filename': self.filename,
            'image_url': self.image_url,
            'language': self.language,
            'localpath': self.localpath,
            'source_domain': self.source_domain,
            'text': self.text,
            'title': self.title,
            'title_page': self.title_page,
            'title_rss': self.title_rss,
            'url': self.url
        }
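
NewsArticle is a plain container: each field news-please can extract is an attribute, and get_dict() flattens the instance for serialization. A small usage sketch with placeholder values:

import json

from newsplease.NewsArticle import NewsArticle

article = NewsArticle()
article.title = "Example headline"
article.url = "http://example.com/story"
article.text = "Body of the story..."

# get_dict() returns the attributes as a plain dict, convenient for JSON output;
# default=str covers the datetime fields once they are populated.
print(json.dumps(article.get_dict(), default=str))
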
121 changes: 121 additions & 0 deletions newsplease/__init__.py
@@ -0,0 +1,121 @@
import datetime
import os
import sys
import urllib

from six.moves import urllib

sys.path.append(os.path.dirname(os.path.realpath(__file__)))

from newsplease.pipeline.extractor import article_extractor
from newsplease.crawler.items import NewscrawlerItem
from dotmap import DotMap
from newsplease.pipeline.pipelines import ExtractedInformationStorage
from newsplease.crawler.simple_crawler import SimpleCrawler


class NewsPlease:
    """
    Access news-please functionality via this interface
    """

    @staticmethod
    def from_warc(warc_record):
        """
        Extracts relevant information from a WARC record. This function does not invoke scrapy but only uses the article
        extractor.
        :return:
        """
        html = str(warc_record.raw_stream.read())
        url = warc_record.rec_headers.get_header('WARC-Target-URI')
        download_date = warc_record.rec_headers.get_header('WARC-Date')
        article = NewsPlease.from_html(html, url=url, download_date=download_date)
        return article

    @staticmethod
    def from_html(html, url=None, download_date=None):
        """
        Extracts relevant information from an HTML page given as a string. This function does not invoke scrapy but only
        uses the article extractor. If you have the original URL make sure to provide it as this helps NewsPlease
        to extract the publishing date and title.
        :param html:
        :param url:
        :return:
        """
        extractor = article_extractor.Extractor(
            ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor'])

        title_encoded = ''.encode()
        if not url:
            url = ''

        # if an url was given, we can use that as the filename
        filename = urllib.parse.quote_plus(url) + '.json'

        item = NewscrawlerItem()
        item['spider_response'] = DotMap()
        item['spider_response'].body = html
        item['url'] = url
        item['source_domain'] = urllib.parse.urlparse(url).hostname.encode() if url != '' else ''.encode()
        item['html_title'] = title_encoded
        item['rss_title'] = title_encoded
        item['local_path'] = None
        item['filename'] = filename
        item['download_date'] = download_date
        item['modified_date'] = None
        item = extractor.extract(item)

        tmp_article = ExtractedInformationStorage.extract_relevant_info(item)
        final_article = ExtractedInformationStorage.convert_to_class(tmp_article)
        # final_article = DotMap(tmp_article)
        return final_article

    @staticmethod
    def from_url(url):
        """
        Crawls the article from the url and extracts relevant information.
        :param url:
        :return: A dict containing all the information of the article. Else, None.
        """
        articles = NewsPlease.from_urls([url])
        if url in articles.keys():
            return articles[url]
        else:
            return None

    @staticmethod
    def from_urls(urls):
        """
        Crawls articles from the urls and extracts relevant information.
        :param urls:
        :return: A dict containing given URLs as keys, and extracted information as corresponding values.
        """
        results = {}
        download_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        if len(urls) == 0:
            pass
        elif len(urls) == 1:
            url = urls[0]
            html = SimpleCrawler.fetch_url(url)
            results[url] = NewsPlease.from_html(html, url, download_date)
        else:
            results = SimpleCrawler.fetch_urls(urls)
            for url in results:
                results[url] = NewsPlease.from_html(results[url], url, download_date)

        return results

    @staticmethod
    def from_file(path):
        """
        Crawls articles from the urls and extracts relevant information.
        :param path: path to file containing urls (each line contains one URL)
        :return: A dict containing given URLs as keys, and extracted information as corresponding values.
        """
        with open(path) as f:
            content = f.readlines()
        content = [x.strip() for x in content]
        urls = list(filter(None, content))

        return NewsPlease.from_urls(urls)
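
This is the interface Analysis.py and Analysis2.py call: NewsPlease.from_url() fetches one page with SimpleCrawler, runs the configured extractors over the HTML, and returns an extracted article object (or None, per the docstring). A minimal usage sketch with a placeholder URL:

from newsplease import NewsPlease

# Placeholder URL; any reachable article URL is handled the same way.
article = NewsPlease.from_url("https://www.example.com/some-news-story")

if article is not None:
    print(article.title)
    print(article.text)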
