Added popular info finder
Further attempts to optimise source aggregator.
Added new functions to entity processor for popular info finder.
Created a unit test for the popular info finder.

Took 14 hours 16 minutes
UP2040499 committed Apr 28, 2023
1 parent 66b616b commit c087bb8
Showing 9 changed files with 524 additions and 76 deletions.
14 changes: 10 additions & 4 deletions auto_osint_v/main.py
@@ -2,13 +2,14 @@
Run this file to run the tool.
"""

import asyncio
import os
import sys
from auto_osint_v.specific_entity_processor import EntityProcessor
from auto_osint_v.file_handler import FileHandler
from auto_osint_v.sentiment_analyser import SemanticAnalyser
from auto_osint_v.source_aggregator import SourceAggregator
from auto_osint_v.popular_information_finder import PopularInformationFinder

data_file_path = os.getcwd() + "/data_files/"
sys.path.append(
@@ -49,8 +50,8 @@ def input_bias_sources():
intel_file = file_handler.read_file("intelligence_file.txt")
# Entity Processor - identifies specific entities mentioned in intel statement
print("Processing entities...")
process_entities = EntityProcessor(intel_file, file_handler)
process_entities.store_words_from_label()
process_entities = EntityProcessor(file_handler)
process_entities.store_words_from_label(intel_file)

# Clean evidence_file.csv
file_handler.clean_data_file(data_file_path + "evidence_file.csv")
@@ -66,4 +67,9 @@ def input_bias_sources():
source_aggregator.search_query_generator()
# Searches google and social media sites using the queries stored in source_aggregator object
# search results will be stored in a dictionary in the source_aggregator object.
source_aggregator.find_sources()
potential_sources = source_aggregator.find_sources()
# Popular information finder - using potential_corroboration.csv
popular_information_finder = PopularInformationFinder(file_handler, process_entities)

# get the popular information - this is a costly search (on 170 sources it takes ~15 minutes).
print(popular_information_finder.find_entities(potential_sources))
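For context on the hand-off above: find_sources returns the aggregator's results_list_dict, a list of source dictionaries from which PopularInformationFinder only reads the "url" field. A minimal sketch of that interface, assuming a hypothetical FileHandler constructor argument and made-up URLs:

from auto_osint_v.file_handler import FileHandler
from auto_osint_v.specific_entity_processor import EntityProcessor
from auto_osint_v.popular_information_finder import PopularInformationFinder

# the FileHandler constructor argument is assumed here for illustration
file_handler = FileHandler("data_files/")
process_entities = EntityProcessor(file_handler)

# find_sources() returns a list of dicts; only the "url" key is read downstream
potential_sources = [
    {"url": "https://example.com/report-1"},
    {"url": "https://example.com/report-2"},
]

finder = PopularInformationFinder(file_handler, process_entities)
popular_words = finder.find_entities(potential_sources)  # list of popular entity strings
print(popular_words)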
114 changes: 114 additions & 0 deletions auto_osint_v/popular_information_finder.py
@@ -0,0 +1,114 @@
"""
Finds entities (information) that are popular amongst the potentially corroborating sources.
"""
import itertools

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


class PopularInformationFinder:
"""
Class that provides methods that get text from sources and compare the number of times a
particular entity is mentioned.
"""

def __init__(self, file_handler_object, entity_processor_object):
"""
Args:
file_handler_object: gives the class access to the file_handler object.
entity_processor_object: gives the class access to the entity_processor object.
Returns:
"""
self.file_handler = file_handler_object
self.entity_processor = entity_processor_object

def get_text_process_entities(self, url, entities):
"""Gets the body text from each source using its URL.
Uses requests and BeautifulSoup to retrieve and parse the webpage's HTML into a readable
format for entity recognition.
Args:
entities: the dictionary of entities.
url: url fetched from sources dictionary.
Returns:
The entities dictionary, updated with the entities found on this webpage and their counts.
"""
# set headers to try to avoid 403 errors
headers = {
'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/112.0.0.0 Safari/537.36'}
# request the webpage
response = requests.get(url, headers=headers)
# check if we are wasting our time with a broken or inaccessible website
try:
response.raise_for_status()
except requests.HTTPError:
return entities
# get the html from the response
html = response.text
# parse HTML using BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# kill all script and style elements
for script in soup(["script", "style"]):
script.extract() # rip it out

# get text
text = soup.get_text()

# NOTE: imperfect - stray fragments of webpage boilerplate can still end up in the entities
# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)

if len(text) <= 100000:
# run the text through the entity processor. stores entities in namesake variable
entities = self.entity_processor.get_entities_and_count(text, entities)
# else print/save source that has been skipped

return entities

def find_entities(self, sources):
"""Finds entities in the given text.
Uses the same model for entity recognition in specific_entity_processor.
Looks like we need to scrap wikipedia articles because they are too long.
Articles over 100k characters are probably too long also.
Most slowdowns here have been due to Russia's wikipedia page.
Args:
sources: list of dictionaries of sources with corresponding URL.
Returns:
A list of the most popular words amongst all the sources.
"""
entities = {}

for source in tqdm(sources, desc="Getting text and finding entities"):
# get the text from each source and find the entities
entities = self.get_text_process_entities(source["url"], entities)

# entities = dict(map(self.get_text_process_entities, sources, entities))

# sort dictionary by highest no. of mentions.
# lambda function specifies sorted to use the values of the dictionary in desc. order
sorted_entities = sorted(entities.items(), key=lambda x: x[1], reverse=True)
# keep the top 2.5% of entities - an arbitrary cut-off; the best value has not been determined.
# using itertools to slice the dictionary
cut_off_value = int(len(sorted_entities) * 0.025)
sorted_entities = itertools.islice(sorted_entities, cut_off_value)
# return the list of words
sorted_entities_words = list(word for (word, count) in sorted_entities)

return sorted_entities_words
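The ranking step above can be illustrated in isolation. A self-contained sketch of the same sort-then-slice pattern on a toy entity dictionary (the entities and counts are made up):

import itertools

# toy entity -> count dictionary; values are illustrative only
entities = {"kyiv": 40, "russia": 38, "convoy": 12, "tank": 9,
            "bridge": 7, "artillery": 5, "minsk": 3, "reuters": 2}

# sort by count in descending order (same lambda as in find_entities)
sorted_entities = sorted(entities.items(), key=lambda x: x[1], reverse=True)

# keep the top 2.5% of entries; with only 8 entries this truncates to 0,
# which is why the cut-off is only meaningful for large entity sets
cut_off_value = int(len(sorted_entities) * 0.025)
top_entities = list(itertools.islice(sorted_entities, cut_off_value))

print(cut_off_value)  # 0 for this toy input; 4000 entities would keep 100
print([word for word, count in top_entities])  # [] here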
91 changes: 34 additions & 57 deletions auto_osint_v/source_aggregator.py
@@ -83,7 +83,7 @@ def searcher(search_term, **kwargs):
"""Using the Google Custom Search Engine to search for results to the search_term.
Args:
search_term: The keyword/query to search for
search_term: The keyword/query to search for. This can be a string or a list of strings.
kwargs: Extra arguments to pass to service.cse().list
Returns:
@@ -93,7 +93,7 @@ def searcher(search_term, **kwargs):
api_key = "AIzaSyCgsni4yZyp4Bla9J7a2TE-lxmzVagcjEo"
cse_id = "d76b2d8504d104aa8"
service = build("customsearch", "v1", developerKey=api_key)
res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
res = service.cse().list(q=search_term, cx=cse_id, hl='en', **kwargs).execute()
try:
return res['items']
except KeyError:
@@ -110,16 +110,17 @@ def google_search(self):
dictionary of Google search results
"""
# searches google using the generated queries
for query in tqdm(self.queries, desc="Search Google using generated queries"):
# one search per query
query_results = self.searcher(query, num=5)
for result in query_results:
# write link to dict
self.process_result(result)
# Join the list of keywords/phrases into one string separated by '|' and surrounded by ""
join_keywords = '|'.join(f'"{word}"' for word in self.keywords)
# Get the results from one query using the list of keywords
keyword_results = self.searcher(f"(intext:{join_keywords})", num=10)
query_results = self.searcher(self.queries, num=10)
for result in tqdm(query_results, desc="Search Google using generated queries"):
# write link to dict
self.process_result(result)
# search for the keywords, only 7 at a time
keyword_results = []
length_of_split = 7
split_keywords = [self.keywords[i:i + length_of_split]
for i in range(0, len(self.keywords), length_of_split)]
for keywords in split_keywords:
keyword_results += self.searcher(keywords, num=10//len(split_keywords))
# loop through results
for result in tqdm(keyword_results, desc="Search Google using extracted keywords"):
# write link to dict
@@ -132,14 +133,8 @@ def social_media_search(self):
WARNING: To search using generated queries and extracted keywords, the code has nested for
loops.
The 'num' argument for self.searcher method must be kept as is, otherwise performance will
be impacted.
Default performance:
19 social media sites
5 generated queries
2 search results per query
10 search results for all extracted keywords
total iterations until completion = 19*((5*2)+10) = 380
A significant performance boost was achieved by discovering that the 'q' parameter of cse.list
accepts a list of queries as well as a single string.
Returns:
dictionary storing the social media results
@@ -148,23 +143,31 @@ def social_media_search(self):
social_media_sites = ["www.instagram.com", "www.tiktok.com", "www.facebook.com",
"www.youtube.com", "www.reddit.com", "www.twitter.com",
"www.pinterest.com", "www.github.com", "www.tumblr.com",
"www.flickr.com", "www.steamcommunity.com", "vimeo.com",
"www.flickr.com", "vimeo.com", "www.telegram.com"
"medium.com", "vk.com", "imgur.com", "www.patreon.com",
"bitbucket.org", "www.dailymotion.com", "news.ycombinator.com"]
# Join the list of keywords/phrases into one string separated by '|' and surrounded by ""
# it appears that the max number of comparisons is between 7 and 10.
# google documentation says it should be 10
join_keywords = '|'.join(f'"{word}"' for word in self.keywords)
# Loop through list of social media sites
for site in tqdm(social_media_sites, desc="Searching Social Media Sites"):
# this for loop is clearly inefficient, I don't know how to improve it
for query in self.queries:
# one search per query
query_results = self.searcher(query, num=2)
for result in query_results:
# write link to dict
self.process_result(result)
# search for the keywords using one google query
keyword_results = self.searcher(f"(site:{site}) (intext:{join_keywords})", num=10)
# NOTE: behaviour unverified - the siteSearch parameter does not appear to restrict results as expected
query_results = self.searcher(self.queries, siteSearch=site, siteSearchFilter='i',
num=5)
# loop through results
for result in query_results:
# write link to dict
self.process_result(result)
# search for the keywords, only 7 at a time
keyword_results = []
length_of_split = 7
split_keywords = [self.keywords[i:i + length_of_split]
for i in range(0, len(self.keywords), length_of_split)]
for keywords in split_keywords:
keyword_results += self.searcher(keywords, siteSearch=site,
siteSearchFilter='i', num=5)
for result in keyword_results:
# process the result
self.process_result(result)
@@ -234,6 +237,7 @@ def find_sources(self):
self.social_media_search()
# store potentially corroborating sources in .csv file
self.file_handler.create_potential_corroboration_file(self.results_list_dict)
return self.results_list_dict

# Media Processor
# interrogate each link and return a description of the media
@@ -298,7 +302,7 @@ def media_finder(self, url):
The info we want: website title, description, images & videos
"""
# retrieve html from URL
response = requests.get(url, timeout=10) # timeout 10 seconds
response = requests.get(url, timeout=10) # timeout 10 seconds

soup = BeautifulSoup(response.text, "html.parser")
# image and video tags may not be in the website.
@@ -315,30 +319,3 @@ def media_finder(self, url):
except KeyError:
iframes = []
return images, videos, iframes

# Key information generator (likely using a BERT QA model)
# need to keep in mind the resource cost of processing, given time and resource costs are
# already high.

# discarded for now as processing cost is too high, causes each URL lookup to take over a minute
# *per url*, therefore these methods cannot be included in their current state.
"""
@staticmethod
def url_get_text(url):
page = requests.get(url, timeout=10)
soup = BeautifulSoup(page.content, "html.parser")
return soup.get_text(strip=True)
def web_summary(self, url):
text = self.url_get_text(url)
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-large")
inputs = tokenizer(text, truncation=True, return_tensors="pt")
# Generate summary
summary_ids = model.generate(inputs["input_ids"], max_new_tokens=1024)
summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True,
clean_up_tokenization_spaces=False)[0]
return summary
"""
57 changes: 50 additions & 7 deletions auto_osint_v/specific_entity_processor.py
@@ -4,12 +4,14 @@
stored in appropriate stores.
Subprocesses to this module attempt to interrogate some of this information.
"""
import os

import spacy


# Load the best model trained using Google Colab
NER = spacy.load("auto_osint_v/NER_training_testing/train/model/model-best-from-colab")
NER = spacy.load(os.getcwd() + "/NER_training_testing/train/model/model-best-from-colab")
NER.add_pipe('sentencizer')


class EntityProcessor:
@@ -18,27 +20,31 @@ class EntityProcessor:
It provides methods for recognising the individual entities in a statement and storing
them appropriately.
"""
def __init__(self, read_statement, file_handler_object):
def __init__(self, file_handler_object):
"""Initialises variables to be used in this object.
Args:
read_statement: the statement read from a text file
file_handler_object: the file handler to be used for file IO operations
"""
self.statement = read_statement
self.file_handler = file_handler_object
self.irrelevant_words = ["it", "them", "they", "the", "he", "she", "his", "her" "we", "i",
"us", "me", "my", "here", "our"]

def store_words_from_label(self):
def store_words_from_label(self, read_statement):
"""This function stores recognised words in csv files
These files are associated with the label given to
the word.
Args:
read_statement: the intelligence statement read into current python instance
Returns
Nothing - stores info in files
Nothing - stores info in files
"""
# Clean any leftover files from previous runs
self.file_handler.clean_directory("data_files/target_info_files")
text1 = NER(self.statement)
text1 = NER(read_statement)

# changes added to eliminate duplicates and count number of mentions
# define list of words present
@@ -59,3 +65,40 @@ def store_words_from_label():
# Opens the relevant (based on word label) csv file and store the word text
# and number of mentions.
self.file_handler.open_label_file(label, text, mentions=mentions)

def get_entities_and_count(self, text, entity_dict):
"""Finds the entities from the given text. If they appear multiple times, increment value.
This only increments words one time per source. Only count independent mentions of entities.
Args:
text: The text to find and count entities from.
entity_dict: The dictionary to store these entities and their respective counts in.
Returns:
entity_dict modified with new entries.
"""
# split the text into chunks of at most 100,000 characters
split_text = [text[i:i + 100000] for i in range(0, len(text), 100000)]
entity_dict = self.add_entities_to_dict(entity_dict, split_text)

return entity_dict

def add_entities_to_dict(self, entity_dict, texts):
"""Runs the NER pipeline over the text chunks and counts each entity once per source.
Args:
entity_dict: running dictionary mapping each entity to the number of sources mentioning it.
texts: list of text chunks taken from a single source.
Returns:
entity_dict updated with the entities found in these chunks.
"""
words_present = []
# just add entities to dictionary as each key needs to be unique.
for doc in NER.pipe(texts):
for ent in doc.ents:
# set to lowercase for easy comparison
key = ent.text.lower()
# if the entity has not already been counted and is not an irrelevant word
if (key not in words_present) and (key not in self.irrelevant_words):
try:
entity_dict[key] += 1
except KeyError:
entity_dict[key] = 1
words_present.append(key)

return entity_dict
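The effect of the once-per-source rule in get_entities_and_count and add_entities_to_dict is that an entity's count measures how many sources mention it, not how many times it is mentioned overall. A simplified sketch of that counting rule, with a naive lowercase word split standing in for the spaCy NER pipeline:

# naive word split stands in for the spaCy NER model used in the real code
def count_once_per_source(entity_dict, source_text, irrelevant_words=("the", "it", "they")):
    words_present = []
    for token in source_text.lower().split():
        if token not in words_present and token not in irrelevant_words:
            try:
                entity_dict[token] += 1   # entity already seen in an earlier source
            except KeyError:
                entity_dict[token] = 1    # first source to mention this entity
            words_present.append(token)   # do not count it again for this source
    return entity_dict

counts = {}
counts = count_once_per_source(counts, "convoy convoy convoy near kharkiv")
counts = count_once_per_source(counts, "kharkiv shelled overnight")
print(counts)
# {'convoy': 1, 'near': 1, 'kharkiv': 2, 'shelled': 1, 'overnight': 1}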
