Commit
commit d0968bf (1 parent: 5d2e97e)
Showing 52 changed files with 5,810 additions and 2 deletions.
@@ -0,0 +1,105 @@
import json
import sys

import requests
from difflib import SequenceMatcher
from newsplease import NewsPlease

# Python 2: force UTF-8 as the default string encoding.
reload(sys)
sys.setdefaultencoding("utf-8")


def similar(a, b):
    # Similarity helper (defined for later use; never called below).
    return SequenceMatcher(None, a, b).ratio()


# Fetch all events coded between 2018-02-28 and 2018-04-01 from the
# UTD event data API (the query is a MongoDB-style date-range filter).
url = "http://eventdata.utdallas.edu/api/data?api_key=EmNc8Pbp5XEUIuzlIdxqVlP5g6S1KlNe&query={\"date8\":{\"$gt\":\"20180228\", \"$lt\": \"20180401\"}}"
response = requests.get(url)
print response

data = json.loads(response.content)
print "Data Loading Complete. Entry count ", len(data["data"])

# Group events by source document: event ids look like "<doc_id>_<sentence_id>".
document_to_event_map = {}
for event in data['data']:
    doc_id = event["id"].split("_")[0]
    if doc_id not in document_to_event_map:
        document_to_event_map[doc_id] = []
    document_to_event_map[doc_id].append(event)

print len(document_to_event_map)

# Histogram: number of documents keyed by how many events they contain.
count_map = {}
for doc in document_to_event_map:
    if len(document_to_event_map[doc]) not in count_map:
        count_map[len(document_to_event_map[doc])] = 0
    count_map[len(document_to_event_map[doc])] += 1

print count_map

root_code_match = 0
root_code_not_found = 0  # never updated below
event_match = 0
doc_count = 0
output_file = open("events.txt", "w+")  # opened but never written to

similar_count = {}  # reserved for similarity statistics; never populated
with open("output.txt", "w+") as out:
    for doc_id in document_to_event_map:
        # Only consider documents with exactly three extracted events,
        # all of which carry both a source and a target actor.
        if len(document_to_event_map[doc_id]) == 3:
            events = document_to_event_map[doc_id]
            if 'source' not in events[0] or 'target' not in events[0]:
                continue
            if 'source' not in events[1] or 'target' not in events[1]:
                continue
            if 'source' not in events[2] or 'target' not in events[2]:
                continue

            # Fetch the sentence-level text of this document from the API.
            url = "http://eventdata.utdallas.edu/api/article?api_key=EmNc8Pbp5XEUIuzlIdxqVlP5g6S1KlNe&doc_id=" + doc_id
            response = requests.get(url)
            data = json.loads(response.content)
            sentences = data['data']

            # Write each event followed by the sentence it was extracted from.
            for i in range(0, len(events)):
                print >> out, events[i]['code'], events[i]['source'], events[i]['target']
                sent_id = events[i]['id'].split("_")[1]
                j = 0
                while sent_id != str(sentences[j]['sentence_id']):
                    j += 1
                print >> out, sent_id, ": ", sentences[j]["sentence"]

            # Crawl and append the full article via news-please.
            print >> out, "================= FULL ARTICLE ==============="
            article = NewsPlease.from_url(events[0]['url'])
            print >> out, events[0]['url']
            print >> out, article.text

            doc_count += 1

print doc_count
print root_code_match
print event_match
print similar_count
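For reference, the same query can be issued by letting requests URL-encode the parameters instead of escaping the JSON filter by hand. This is a minimal sketch, assuming the endpoint accepts a URL-encoded query parameter; it uses only the endpoint, key, and date filter that appear in the script above:

import json
import requests

# Same endpoint, key, and MongoDB-style date filter as the script above;
# requests handles the URL encoding of the query parameter.
base_url = "http://eventdata.utdallas.edu/api/data"
params = {
    "api_key": "EmNc8Pbp5XEUIuzlIdxqVlP5g6S1KlNe",
    "query": json.dumps({"date8": {"$gt": "20180228", "$lt": "20180401"}}),
}
response = requests.get(base_url, params=params)
events = response.json()["data"]
print(len(events))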
@@ -0,0 +1,42 @@
class NewsArticle(object):
    """
    Class representing a single news article containing all the information that news-please can extract.
    """
    # Class-level defaults for every extractable field.
    authors = []
    date_download = None
    date_modify = None
    date_publish = None
    description = None
    filename = None
    image_url = None
    language = None
    localpath = None
    source_domain = None
    text = None
    title = None
    title_page = None
    title_rss = None
    url = None

    def get_dict(self):
        """
        Get the dict of the instance of this class.
        :return: a dict mapping each field name to its current value
        """
        return {
            'authors': self.authors,
            'date_download': self.date_download,
            'date_modify': self.date_modify,
            'date_publish': self.date_publish,
            'description': self.description,
            'filename': self.filename,
            'image_url': self.image_url,
            'language': self.language,
            'localpath': self.localpath,
            'source_domain': self.source_domain,
            'text': self.text,
            'title': self.title,
            'title_page': self.title_page,
            'title_rss': self.title_rss,
            'url': self.url
        }
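Since the date_* fields may hold datetime objects, which json.dumps cannot serialize natively, a minimal serialization sketch might look as follows (the default=str fallback and the field values are illustrative assumptions, not part of this commit):

import json

article = NewsArticle()
article.title = "Example headline"        # illustrative values
article.url = "http://example.com/story"

# default=str is a simple fallback for any datetime values in the date_* fields.
print(json.dumps(article.get_dict(), default=str, indent=2))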
@@ -0,0 +1,121 @@
import datetime
import os
import sys

from six.moves import urllib  # Python 2/3 compatible urllib

sys.path.append(os.path.dirname(os.path.realpath(__file__)))

from newsplease.pipeline.extractor import article_extractor
from newsplease.crawler.items import NewscrawlerItem
from dotmap import DotMap
from newsplease.pipeline.pipelines import ExtractedInformationStorage
from newsplease.crawler.simple_crawler import SimpleCrawler


class NewsPlease:
    """
    Access news-please functionality via this interface
    """

    @staticmethod
    def from_warc(warc_record):
        """
        Extracts relevant information from a WARC record. This function does not invoke scrapy but only uses the
        article extractor.
        :return:
        """
        html = str(warc_record.raw_stream.read())
        url = warc_record.rec_headers.get_header('WARC-Target-URI')
        download_date = warc_record.rec_headers.get_header('WARC-Date')
        article = NewsPlease.from_html(html, url=url, download_date=download_date)
        return article

    @staticmethod
    def from_html(html, url=None, download_date=None):
        """
        Extracts relevant information from an HTML page given as a string. This function does not invoke scrapy but
        only uses the article extractor. If you have the original URL, make sure to provide it, as this helps
        NewsPlease to extract the publishing date and title.
        :param html:
        :param url:
        :return:
        """
        extractor = article_extractor.Extractor(
            ['newspaper_extractor', 'readability_extractor', 'date_extractor', 'lang_detect_extractor'])

        title_encoded = ''.encode()
        if not url:
            url = ''

        # If a URL was given, we can use that as the filename.
        filename = urllib.parse.quote_plus(url) + '.json'

        # Build a crawler item by hand so the extraction pipeline can run without scrapy.
        item = NewscrawlerItem()
        item['spider_response'] = DotMap()
        item['spider_response'].body = html
        item['url'] = url
        item['source_domain'] = urllib.parse.urlparse(url).hostname.encode() if url != '' else ''.encode()
        item['html_title'] = title_encoded
        item['rss_title'] = title_encoded
        item['local_path'] = None
        item['filename'] = filename
        item['download_date'] = download_date
        item['modified_date'] = None
        item = extractor.extract(item)

        tmp_article = ExtractedInformationStorage.extract_relevant_info(item)
        final_article = ExtractedInformationStorage.convert_to_class(tmp_article)
        return final_article

    @staticmethod
    def from_url(url):
        """
        Crawls the article from the url and extracts relevant information.
        :param url:
        :return: A NewsArticle object containing all the information of the article. Else, None.
        """
        articles = NewsPlease.from_urls([url])
        if url in articles.keys():
            return articles[url]
        else:
            return None

    @staticmethod
    def from_urls(urls):
        """
        Crawls articles from the urls and extracts relevant information.
        :param urls:
        :return: A dict containing given URLs as keys, and extracted information as corresponding values.
        """
        results = {}
        download_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        if len(urls) == 0:
            pass
        elif len(urls) == 1:
            url = urls[0]
            html = SimpleCrawler.fetch_url(url)
            results[url] = NewsPlease.from_html(html, url, download_date)
        else:
            # Fetch all URLs first, then run extraction on each response body.
            results = SimpleCrawler.fetch_urls(urls)
            for url in results:
                results[url] = NewsPlease.from_html(results[url], url, download_date)

        return results

    @staticmethod
    def from_file(path):
        """
        Crawls articles from the URLs listed in a file and extracts relevant information.
        :param path: path to file containing urls (each line contains one URL)
        :return: A dict containing given URLs as keys, and extracted information as corresponding values.
        """
        with open(path) as f:
            content = f.readlines()
        content = [x.strip() for x in content]
        urls = list(filter(None, content))

        return NewsPlease.from_urls(urls)
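Putting the interface together, a minimal end-to-end sketch (the URL is a placeholder; per the docstrings above, from_url returns None when the article could not be fetched or extracted):

from newsplease import NewsPlease

# Placeholder URL; any reachable news article works.
article = NewsPlease.from_url("http://example.com/some-news-story")
if article is not None:
    print(article.title)
    print(article.text)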