Added popular info finder
Further attempts to optimise source aggregator.
Added new functions to entity processor for popular info finder.
Created a unit test for the popular info finder.

Took 14 hours 16 minutes
UP2040499 committed Apr 28, 2023
1 parent 66b616b commit c087bb8
Showing 9 changed files with 524 additions and 76 deletions.
14 changes: 10 additions & 4 deletions auto_osint_v/main.py
@@ -2,13 +2,14 @@
Run this file to run the tool.
"""

import asyncio
import os
import sys
from auto_osint_v.specific_entity_processor import EntityProcessor
from auto_osint_v.file_handler import FileHandler
from auto_osint_v.sentiment_analyser import SemanticAnalyser
from auto_osint_v.source_aggregator import SourceAggregator
from auto_osint_v.popular_information_finder import PopularInformationFinder

data_file_path = os.getcwd() + "/data_files/"
sys.path.append(
@@ -49,8 +50,8 @@ def input_bias_sources():
intel_file = file_handler.read_file("intelligence_file.txt")
# Entity Processor - identifies specific entities mentioned in intel statement
print("Processing entities...")
process_entities = EntityProcessor(intel_file, file_handler)
process_entities.store_words_from_label()
process_entities = EntityProcessor(file_handler)
process_entities.store_words_from_label(intel_file)

# Clean evidence_file.csv
file_handler.clean_data_file(data_file_path + "evidence_file.csv")
@@ -66,4 +67,9 @@ def input_bias_sources():
source_aggregator.search_query_generator()
# Searches google and social media sites using the queries stored in source_aggregator object
# search results will be stored in a dictionary in the source_aggregator object.
source_aggregator.find_sources()
potential_sources = source_aggregator.find_sources()
# Popular information finder - using potential_corroboration.csv
popular_information_finder = PopularInformationFinder(file_handler, process_entities)

# get the popular information - this is a costly search (on 170 sources it takes ~15 minutes).
print(popular_information_finder.find_entities(potential_sources))
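For context on the hand-off above: find_sources returns the aggregator's results_list_dict, a list of source dictionaries from which PopularInformationFinder only reads the "url" field. A minimal sketch of that interface, assuming a hypothetical FileHandler constructor argument and made-up URLs:

from auto_osint_v.file_handler import FileHandler
from auto_osint_v.specific_entity_processor import EntityProcessor
from auto_osint_v.popular_information_finder import PopularInformationFinder

# the FileHandler constructor argument is assumed here for illustration
file_handler = FileHandler("data_files/")
process_entities = EntityProcessor(file_handler)

# find_sources() returns a list of dicts; only the "url" key is read downstream
potential_sources = [
    {"url": "https://example.com/report-1"},
    {"url": "https://example.com/report-2"},
]

finder = PopularInformationFinder(file_handler, process_entities)
popular_words = finder.find_entities(potential_sources)  # list of popular entity strings
print(popular_words)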
114 changes: 114 additions & 0 deletions auto_osint_v/popular_information_finder.py
@@ -0,0 +1,114 @@
"""
Finds entities (information) that are popular amongst the potentially corroborating sources.
"""
import itertools

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


class PopularInformationFinder:
"""
Class that provides methods that get text from sources and compare the number of times a
particular entity is mentioned.
"""

def __init__(self, file_handler_object, entity_processor_object):
"""
Args:
file_handler_object: gives the class access to the file_handler object.
entity_processor_object: gives the class access to the entity_processor object.
Returns:
"""
self.file_handler = file_handler_object
self.entity_processor = entity_processor_object

def get_text_process_entities(self, url, entities):
"""Gets the body text from each source using its URL.
Uses requests and BeautifulSoup to retrieve and parse the webpage's HTML into a readable
format for entity recognition.
Args:
entities: the dictionary of entities.
url: url fetched from sources dictionary.
Returns:
The entities dictionary, updated with the entities found on this webpage and their counts.
"""
# set headers to try to avoid 403 errors
headers = {
'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/112.0.0.0 Safari/537.36'}
# request the webpage
response = requests.get(url, headers=headers)
# check if we are wasting our time with a broken or inaccessible website
try:
response.raise_for_status()
except requests.HTTPError:
return entities
# get the html from the response
html = response.text
# parse HTML using BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# kill all script and style elements
for script in soup(["script", "style"]):
script.extract() # rip it out

# get text
text = soup.get_text()

# NOTE: imperfect - stray fragments of webpage boilerplate can still end up in the entities
# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)

if len(text) <= 100000:
# run the text through the entity processor. stores entities in namesake variable
entities = self.entity_processor.get_entities_and_count(text, entities)
# else print/save source that has been skipped

return entities

def find_entities(self, sources):
"""Finds entities in the given text.
Uses the same model for entity recognition in specific_entity_processor.
Looks like we need to scrap wikipedia articles because they are too long.
Articles over 100k characters are probably too long also.
Most slowdowns here have been due to Russia's wikipedia page.
Args:
sources: list of dictionaries of sources with corresponding URL.
Returns:
A list of the most popular words amongst all the sources.
"""
entities = {}

for source in tqdm(sources, desc="Getting text and finding entities"):
# get the text from each source and find the entities
entities = self.get_text_process_entities(source["url"], entities)

# entities = dict(map(self.get_text_process_entities, sources, entities))

# sort dictionary by highest no. of mentions.
# lambda function specifies sorted to use the values of the dictionary in desc. order
sorted_entities = sorted(entities.items(), key=lambda x: x[1], reverse=True)
# keep the top 2.5% of entities - an arbitrary cut-off; the best value has not been determined.
# using itertools to slice the dictionary
cut_off_value = int(len(sorted_entities) * 0.025)
sorted_entities = itertools.islice(sorted_entities, cut_off_value)
# return the list of words
sorted_entities_words = list(word for (word, count) in sorted_entities)

return sorted_entities_words
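The ranking step above can be illustrated in isolation. A self-contained sketch of the same sort-then-slice pattern on a toy entity dictionary (the entities and counts are made up):

import itertools

# toy entity -> count dictionary; values are illustrative only
entities = {"kyiv": 40, "russia": 38, "convoy": 12, "tank": 9,
            "bridge": 7, "artillery": 5, "minsk": 3, "reuters": 2}

# sort by count in descending order (same lambda as in find_entities)
sorted_entities = sorted(entities.items(), key=lambda x: x[1], reverse=True)

# keep the top 2.5% of entries; with only 8 entries this truncates to 0,
# which is why the cut-off is only meaningful for large entity sets
cut_off_value = int(len(sorted_entities) * 0.025)
top_entities = list(itertools.islice(sorted_entities, cut_off_value))

print(cut_off_value)  # 0 for this toy input; 4000 entities would keep 100
print([word for word, count in top_entities])  # [] here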
91 changes: 34 additions & 57 deletions auto_osint_v/source_aggregator.py
@@ -83,7 +83,7 @@ def searcher(search_term, **kwargs):
"""Using the Google Custom Search Engine to search for results to the search_term.
Args:
search_term: The keyword/query to search for
search_term: The keyword/query to search for. This can be a string or a list of strings.
kwargs: Extra arguments to pass to service.cse().list
Returns:
@@ -93,7 +93,7 @@ def searcher(search_term, **kwargs):
api_key = "AIzaSyCgsni4yZyp4Bla9J7a2TE-lxmzVagcjEo"
cse_id = "d76b2d8504d104aa8"
service = build("customsearch", "v1", developerKey=api_key)
res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
res = service.cse().list(q=search_term, cx=cse_id, hl='en', **kwargs).execute()
try:
return res['items']
except KeyError:
@@ -110,16 +110,17 @@ def google_search(self):
dictionary of Google search results
"""
# searches google using the generated queries
for query in tqdm(self.queries, desc="Search Google using generated queries"):
# one search per query
query_results = self.searcher(query, num=5)
for result in query_results:
# write link to dict
self.process_result(result)
# Join the list of keywords/phrases into one string separated by '|' and surrounded by ""
join_keywords = '|'.join(f'"{word}"' for word in self.keywords)
# Get the results from one query using the list of keywords
keyword_results = self.searcher(f"(intext:{join_keywords})", num=10)
query_results = self.searcher(self.queries, num=10)
for result in tqdm(query_results, desc="Search Google using generated queries"):
# write link to dict
self.process_result(result)
# search for the keywords, only 7 at a time
keyword_results = []
length_of_split = 7
split_keywords = [self.keywords[i:i + length_of_split]
for i in range(0, len(self.keywords), length_of_split)]
for keywords in split_keywords:
keyword_results += self.searcher(keywords, num=10//len(split_keywords))
# loop through results
for result in tqdm(keyword_results, desc="Search Google using extracted keywords"):
# write link to dict
@@ -132,14 +133,8 @@ def social_media_search(self):
WARNING: To search using generated queries and extracted keywords, the code has nested for
loops.
The 'num' argument for self.searcher method must be kept as is, otherwise performance will
be impacted.
Default performance:
19 social media sites
5 generated queries
2 search results per query
10 search results for all extracted keywords
total iterations until completion = 19*((5*2)+10) = 380
A significant performance boost was achieved by discovering that the 'q' parameter of cse.list
accepts a list of queries as well as a single string.
Returns:
dictionary storing the social media results
@@ -148,23 +143,31 @@ def social_media_search(self):
social_media_sites = ["www.instagram.com", "www.tiktok.com", "www.facebook.com",
"www.youtube.com", "www.reddit.com", "www.twitter.com",
"www.pinterest.com", "www.github.com", "www.tumblr.com",
"www.flickr.com", "www.steamcommunity.com", "vimeo.com",
"www.flickr.com", "vimeo.com", "www.telegram.com"
"medium.com", "vk.com", "imgur.com", "www.patreon.com",
"bitbucket.org", "www.dailymotion.com", "news.ycombinator.com"]
# Join the list of keywords/phrases into one string separated by '|' and surrounded by ""
# it appears that the max number of comparisons is between 7 and 10.
# google documentation says it should be 10
join_keywords = '|'.join(f'"{word}"' for word in self.keywords)
# Loop through list of social media sites
for site in tqdm(social_media_sites, desc="Searching Social Media Sites"):
# this for loop is clearly inefficient, I don't know how to improve it
for query in self.queries:
# one search per query
query_results = self.searcher(query, num=2)
for result in query_results:
# write link to dict
self.process_result(result)
# search for the keywords using one google query
keyword_results = self.searcher(f"(site:{site}) (intext:{join_keywords})", num=10)
# NOTE: behaviour unverified - the siteSearch parameter does not appear to restrict results as expected
query_results = self.searcher(self.queries, siteSearch=site, siteSearchFilter='i',
num=5)
# loop through results
for result in query_results:
# write link to dict
self.process_result(result)
# search for the keywords, only 7 at a time
keyword_results = []
length_of_split = 7
split_keywords = [self.keywords[i:i + length_of_split]
for i in range(0, len(self.keywords), length_of_split)]
for keywords in split_keywords:
keyword_results += self.searcher(keywords, siteSearch=site,
siteSearchFilter='i', num=5)
for result in keyword_results:
# process the result
self.process_result(result)
@@ -234,6 +237,7 @@ def find_sources(self):
self.social_media_search()
# store potentially corroborating sources in .csv file
self.file_handler.create_potential_corroboration_file(self.results_list_dict)
return self.results_list_dict

# Media Processor
# interrogate each link and return a description of the media
@@ -298,7 +302,7 @@ def media_finder(self, url):
The info we want: website title, description, images & videos
"""
# retrieve html from URL
response = requests.get(url, timeout=10) # timeout 10 seconds
response = requests.get(url, timeout=10) # timeout 10 seconds

soup = BeautifulSoup(response.text, "html.parser")
# image and video tags may not be in the website.
@@ -315,30 +319,3 @@ def media_finder(self, url):
except KeyError:
iframes = []
return images, videos, iframes

# Key information generator (likely using a BERT QA model)
# need to keep in mind the resource cost of processing, given time and resource costs are
# already high.

# discarded for now as processing cost is too high, causes each URL lookup to take over a minute
# *per url*, therefore these methods cannot be included in their current state.
"""
@staticmethod
def url_get_text(url):
page = requests.get(url, timeout=10)
soup = BeautifulSoup(page.content, "html.parser")
return soup.get_text(strip=True)
def web_summary(self, url):
text = self.url_get_text(url)
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-large")
inputs = tokenizer(text, truncation=True, return_tensors="pt")
# Generate summary
summary_ids = model.generate(inputs["input_ids"], max_new_tokens=1024)
summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True,
clean_up_tokenization_spaces=False)[0]
return summary
"""
57 changes: 50 additions & 7 deletions auto_osint_v/specific_entity_processor.py
@@ -4,12 +4,14 @@
stored in appropriate stores.
Subprocesses to this module attempt to interrogate some of this information.
"""
import os

import spacy


# Load the best model trained using Google Colab
NER = spacy.load("auto_osint_v/NER_training_testing/train/model/model-best-from-colab")
NER = spacy.load(os.getcwd() + "/NER_training_testing/train/model/model-best-from-colab")
NER.add_pipe('sentencizer')


class EntityProcessor:
@@ -18,27 +20,31 @@ class EntityProcessor:
It provides methods for recognising the individual entities in a statement and storing
them appropriately.
"""
def __init__(self, read_statement, file_handler_object):
def __init__(self, file_handler_object):
"""Initialises variables to be used in this object.
Args:
read_statement: the statement read from a text file
file_handler_object: the file handler to be used for file IO operations
"""
self.statement = read_statement
self.file_handler = file_handler_object
self.irrelevant_words = ["it", "them", "they", "the", "he", "she", "his", "her" "we", "i",
"us", "me", "my", "here", "our"]

def store_words_from_label(self):
def store_words_from_label(self, read_statement):
"""This function stores recognised words in csv files
These files are associated with the label given to
the word.
Args:
read_statement: the intelligence statement read into current python instance
Returns
Nothing - stores info in files
Nothing - stores info in files
"""
# Clean any leftover files from previous runs
self.file_handler.clean_directory("data_files/target_info_files")
text1 = NER(self.statement)
text1 = NER(read_statement)

# changes added to eliminate duplicates and count number of mentions
# define list of words present
@@ -59,3 +65,40 @@ def store_words_from_label():
# Opens the relevant (based on word label) csv file and store the word text
# and number of mentions.
self.file_handler.open_label_file(label, text, mentions=mentions)

def get_entities_and_count(self, text, entity_dict):
"""Finds the entities from the given text. If they appear multiple times, increment value.
This only increments words one time per source. Only count independent mentions of entities.
Args:
text: The text to find and count entities from.
entity_dict: The dictionary to store these entities and their respective counts in.
Returns:
entity_dict modified with new entries.
"""
# split the text into chunks of at most 100,000 characters
split_text = [text[i:i + 100000] for i in range(0, len(text), 100000)]
entity_dict = self.add_entities_to_dict(entity_dict, split_text)

return entity_dict

def add_entities_to_dict(self, entity_dict, texts):
"""Runs the NER pipeline over the text chunks and counts each entity once per source.
Args:
entity_dict: running dictionary mapping each entity to the number of sources mentioning it.
texts: list of text chunks taken from a single source.
Returns:
entity_dict updated with the entities found in these chunks.
"""
words_present = []
# just add entities to dictionary as each key needs to be unique.
for doc in NER.pipe(texts):
for ent in doc.ents:
# set to lowercase for easy comparison
key = ent.text.lower()
# if the entity has not already been counted and is not an irrelevant word
if (key not in words_present) and (key not in self.irrelevant_words):
try:
entity_dict[key] += 1
except KeyError:
entity_dict[key] = 1
words_present.append(key)

return entity_dict
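The effect of the once-per-source rule in get_entities_and_count and add_entities_to_dict is that an entity's count measures how many sources mention it, not how many times it is mentioned overall. A simplified sketch of that counting rule, with a naive lowercase word split standing in for the spaCy NER pipeline:

# naive word split stands in for the spaCy NER model used in the real code
def count_once_per_source(entity_dict, source_text, irrelevant_words=("the", "it", "they")):
    words_present = []
    for token in source_text.lower().split():
        if token not in words_present and token not in irrelevant_words:
            try:
                entity_dict[token] += 1   # entity already seen in an earlier source
            except KeyError:
                entity_dict[token] = 1    # first source to mention this entity
            words_present.append(token)   # do not count it again for this source
    return entity_dict

counts = {}
counts = count_once_per_source(counts, "convoy convoy convoy near kharkiv")
counts = count_once_per_source(counts, "kharkiv shelled overnight")
print(counts)
# {'convoy': 1, 'near': 1, 'kharkiv': 2, 'shelled': 1, 'overnight': 1}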
