# Automated Threat Identification and Assessment Module
This is the Threat Identification and Assessment Module. It pulls data from selected resources on the internet, uses several techniques to clean and extract that data, and outputs a threat level using ML tools such as MonkeyLearn and spaCy.

**NOTE**: Some sources publish threats that have already been assessed, so their threat level is output directly, skipping the threat-level classification stage.

In [1]:
# Library imports
import csv
import json
import os
import re
from datetime import datetime, date
from urllib.request import urlopen as uReq

import bs4
import en_core_web_lg
import requests
import spacy
from bs4 import BeautifulSoup as soup
from IPython.display import clear_output
from monkeylearn import MonkeyLearn

In [2]:
# MonkeyLearn API key and Model ID
# SECURITY NOTE(review): an API key is committed in this file. Supply it through the
# MONKEYLEARN_API_KEY environment variable instead and rotate the embedded key; the
# literal is kept only as a fallback so existing runs keep working unchanged.
ml = MonkeyLearn(os.environ.get('MONKEYLEARN_API_KEY') or '2b14f564c495a96ece74dc80df73514efb517d6c')
# Id of the classifier passed to ml.classifiers.classify(); overridable the same way.
model_id = os.environ.get('MONKEYLEARN_MODEL_ID') or 'cl_M7RxATri'

In [3]:
# Custom Function to get the tag_name classification
def getLvl(text):
    """Classify one piece of text with the MonkeyLearn model and return its tag.

    Args:
        text: The string to classify. It is sent as a single-item batch.

    Returns:
        The 'tag_name' of the first classification reported for the text, or
        None when the response lists no classifications.
    """
    response = ml.classifiers.classify(model_id, [text])
    # One input was sent, so only the first entry of the response body is read
    classifications = response.body[0]['classifications']
    if len(classifications) > 0:
        return classifications[0]['tag_name']
    return None

In [4]:
# Loading spacy english model for entity identification.
# The loaded pipeline is kept in the module-level name `nlp`; getLoc() calls it on
# each text and reads the recognised entities from the result.
nlp = en_core_web_lg.load() 

In [14]:
# Custom function to merge locations identified using SpaCy's "en" model into one string
def getLoc(text):
    """Merge the location entities found in *text* into a single string.

    The text is run through the module-level spaCy pipeline `nlp`. Entities whose
    label contains "GPE", "LOC" or "FAC" are gathered into one string per label;
    an entity is skipped when its text already occurs inside the string gathered
    so far for that label.

    Args:
        text: The text to search for locations.

    Returns:
        The FAC, LOC and GPE strings, in that order, joined by single spaces.
        Groups with no match stay empty, so the result can begin with spaces or
        contain consecutive spaces.
    """
    facilities, localities, geopolitical = "", "", ""

    for ent in nlp(text).ents:
        label = ent.label_
        name = ent.text

        if "GPE" in label:
            if geopolitical == "":
                geopolitical = name
            elif name not in geopolitical:
                geopolitical = geopolitical + " " + name

        if "LOC" in label:
            if localities == "":
                localities = name
            elif name not in localities:
                localities = localities + " " + name

        if "FAC" in label:
            if facilities == "":
                facilities = name
            elif name not in facilities:
                # Unlike the other two groups, a new facility goes in front
                facilities = name + " " + facilities

    return " ".join((facilities, localities, geopolitical))

In [16]:
# SpaCy Entity recognition
# getLoc() pads its result with spaces for empty groups, hence the lstrip() calls
location1 = getLoc("4 tested positive in Sydney").lstrip()
# Adjacent literals are concatenated into one clean sentence. The previous
# triple-quoted form also sent stray quote characters, a line break and the
# source indentation to the model.
location2 = getLoc(
    "Macquarie University is a public research university based in Sydney"
    ", Australia, in the suburb of Macquarie Park."
).lstrip()
print("Location(s) in first string: ", location1)
print("Location(s) in second string: ",location2)

Location(s) in first string:  Sydney
Location(s) in second string:  Sydney Australia Macquarie Park


In [None]:
# Demonstration of Monkey learn custom trained model
# Classify two sample headlines, then show each resulting tag on its own line
lvla = getLvl(
    "Scammers scam the population"
)
lvlb = getLvl(
    "Coronavirus scammers steal $1.1 million from fearful victims"
)
print(lvla, lvlb, sep="\n")

In [None]:
# Sources to scrape
def getUrls():
    """Return the pages to scrape, in the order they are processed.

    A new list is built on every call, so callers may modify the result.
    """
    return [
        # The cyber.gov.au alerts page is currently switched off:
        #'https://www.cyber.gov.au/acsc/view-all-content/alerts',
        # Sydney Morning Herald topic pages
        'https://www.smh.com.au/topic/nsw-police-jdi',
        'https://www.smh.com.au/topic/sydney-crime-62n',
        'https://www.smh.com.au/topic/crime-5w4',
        'https://www.smh.com.au/topic/australian-federal-police-jnt',
        # 9News sections
        'https://www.9news.com.au/cyber-security',
        'https://www.9news.com.au/security',
        # Rapid7 vulnerability database, first three result pages
        'https://www.rapid7.com/db/?q=&type=nexpose',
        'https://www.rapid7.com/db/?q=&type=nexpose&page=2',
        'https://www.rapid7.com/db/?q=&type=nexpose&page=3',
    ]

In [None]:
# Reading the html data from the url to p_html
def urlReq(my_url):
    """Download a page and return it parsed as a BeautifulSoup tree.

    Args:
        my_url: Address of the page to download.

    Returns:
        The downloaded markup parsed with the "html.parser" parser.

    Raises:
        Whatever urlopen() or read() raise for an unreachable or failing URL;
        nothing is caught here.
    """
    # The with-block closes the connection even when read() raises
    with uReq(my_url) as uClient:
        p_html = uClient.read()
    return soup(p_html, "html.parser")

In [None]:
# Cyber.gov.au
def cga(parsed_html):
    """Write the cyber.gov.au alert listing to data/cyber_gov_au.csv.

    Each alert's date, title, summary and link are read from the listing page.
    The alert's own page is then downloaded to read its status, which is stored
    as the threat level. The Location column is left empty.

    Args:
        parsed_html: The parsed alerts listing page.

    Raises:
        AttributeError: If an expected element is missing from a row or from an
            alert's detail page.
        ValueError: If an alert date is not in 'day month-abbreviation year' form.
    """
    # Splitting the html into a list of threats
    articles = parsed_html.findAll("div", {"class":"views-row"})
    # Dropping the unrequired first row; slicing, unlike del, tolerates an empty page
    articles = articles[1:]
    # Creating .csv file in the data folder and writing the required information from the list to .csv file.
    # newline='' is what the csv module expects, otherwise rows gain blank lines on Windows.
    with open('data/cyber_gov_au.csv', 'w', newline='', encoding='utf-8') as file:
        w = csv.writer(file)
        # write header row
        w.writerow(['Threat_date', 'Title', 'Summary', 'Source', 'Location', 'Threat_level'])
        for article in articles:
            dateS = article.find("p", {"class":"acsc-date"}).get_text()
            title = article.find("p", {"class":"acsc-title"}).get_text()
            summary = article.find("p", {"class":"acsc-summary"}).get_text()
            src_link = "https://www.cyber.gov.au" + article.a['href']

            # Stripping first means surrounding whitespace is accepted but not required
            d = datetime.strptime(dateS.strip(), '%d %b %Y')

            # The with-block closes the connection even when read() raises
            with uReq(src_link) as uClient:
                detailed_page = uClient.read()
            dp_parsed = soup(detailed_page, "html.parser")
            # The selector must be a {"class": ...} dict. It used to be a set holding
            # one garbled string, which cannot match the status field.
            # NOTE(review): assumes the page's class attribute is exactly this string - confirm.
            Threat = dp_parsed.find("div", {"class": "field field--name-field-alert-status "
                                                     "field--type-entity-reference field--label-inline"})
            lvl = Threat.find("div", {"class":"field__item"}).get_text()

            summary = summary.replace(",", ";")
            w.writerow([d.date(), title, summary, src_link, "", lvl])


In [None]:
# Sydney Morning Herald
def smh(parsed_html, url):
    """Write the articles of a Sydney Morning Herald topic page to a CSV file.

    The file is data/smh_<name>.csv, where <name> is built from the last path
    segment of *url*. For every article the location comes from getLoc() and the
    threat level from getLvl(), both applied to the title plus summary.

    Args:
        parsed_html: The parsed topic page.
        url: Address the page was downloaded from; only used to name the file.

    Raises:
        AttributeError: If an article lacks one of the expected elements.
        ValueError: If an absolute date is not in 'Month day, year' form.
    """
    articles = parsed_html.findAll("div", {"class":"_2g9tm"})
    # Creating csv file
    fname = "smh_" + ('_'.join(re.findall(r"(\w+)", url.rsplit('/', 1)[-1])))
    # newline='' is what the csv module expects, otherwise rows gain blank lines on Windows
    with open('data/%s.csv' % (fname), 'w', newline='', encoding='utf-8') as file:
        w = csv.writer(file)
        # write header row
        w.writerow(['Threat_date', 'Title', 'Summary', 'Source', 'Location', 'Threat_level'])
        # grabbing news article information
        for article in articles:
            link = article.find("a",{"data-test": "article-link"})
            title = link.get_text()
            summary = article.find("p",{"class": "_3b7W- _3XEsE"}).get_text()
            src_link = "https://www.smh.com.au" + link.get('href')
            dateS = article.find("time", {"class": "_2_zR-"}).get_text()
            # Both kinds of quote are removed from the stored text
            title = title.replace('"', "").replace("'", "")
            summary = summary.replace('"', "").replace("'", "")

            text = title + ". " + summary
            loc = getLoc(text).lstrip()
            lvl = getLvl(text)

            # Relative stamps ("Today", "... ago") are recorded as today's date
            if "Today" in dateS or "ago" in dateS:
                d = date.today()
            else:
                d = datetime.strptime(dateS, '%B %d, %Y').date()
            # One row layout for every article, matching the header. Recent articles
            # used to be written with shifted columns and no location or level.
            w.writerow([d, title, summary, src_link, loc, lvl])


In [None]:
# 9News
def nine(parsed_html, url):
    """Write the stories of a 9News section page to a CSV file.

    The file is data/9news_<name>.csv, where <name> is built from the last path
    segment of *url*. Stories are read from the page's default feed. For every
    story the location comes from getLoc() and the threat level from getLvl(),
    both applied to the title plus summary.

    Args:
        parsed_html: The parsed section page.
        url: Address the page was downloaded from; only used to name the file.

    Raises:
        AttributeError: If the feed or an expected story element is missing.
        ValueError: If an absolute date is not in 'HH:MMam Mon day, year' form.
    """
    feed = parsed_html.find("div", {"data-feed":"default"})
    articles = feed.findAll("div", {"class":"story__details"})
    # Creating csv file
    fname = "9news_" + ('_'.join(re.findall(r"(\w+)", url.rsplit('/', 1)[-1])))
    # newline='' is what the csv module expects, otherwise rows gain blank lines on Windows
    with open('data/%s.csv' % (fname), 'w', newline='', encoding='utf-8') as file:
        w = csv.writer(file)
        # write header row
        w.writerow(['Threat_date', 'Title', 'Summary', 'Source', 'Location', 'Threat_level'])
        # grabbing news article information
        for article in articles:
            # Entries that hold an ad widget are not stories
            if article.find("div",{"class": "widget widget-ad feed__ad"}):
                continue

            title = article.find("span",{"class": "story__headline__text"}).get_text()
            summary = article.find("div", {"class": "story__abstract"}).get_text()
            src_link = article.a['href']
            dateS = article.find("time", {"class": "story__time"}).get_text()
            # Both kinds of quote are removed from the stored text
            title = title.replace('"', "").replace("'", "")
            summary = summary.replace('"', "").replace("'", "")

            text = title + ". " + summary
            loc = getLoc(text).lstrip()
            lvl = getLvl(text)

            # Relative stamps ("Today", "... ago") are recorded as today's date
            if "Today" in dateS or "ago" in dateS:
                d = date.today()
            else:
                d = datetime.strptime(dateS, '%I:%M%p %b %d, %Y').date()
            # One row layout for every story, matching the header. Recent stories
            # used to be written with shifted columns and no location or level.
            w.writerow([d, title, summary, src_link, loc, lvl])

In [None]:
# Rapid7
def rapid(parsed_html, url):
    """Write the entries of a Rapid7 vulnerability listing page to a CSV file.

    The file is data/rapid7_<name>.csv, where <name> is built from whatever
    follows "nexpose" in *url* (empty for the first results page). The severity
    number shown with each entry is stored as the threat level; the Summary and
    Location columns are left empty.

    Args:
        parsed_html: The parsed listing page.
        url: Address the page was downloaded from; only used to name the file.

    Raises:
        AttributeError: If an entry lacks one of the expected elements.
        ValueError: If the date or the severity cannot be parsed.
    """
    articles = parsed_html.findAll("a", {"class":"vulndb__result resultblock"})
    # Creating csv file
    fname = "rapid7_" + ('_'.join(re.findall(r"(\w+)", url.rsplit("nexpose", 1)[-1])))
    # newline='' is what the csv module expects, otherwise rows gain blank lines on Windows
    with open('data/%s.csv' % (fname), 'w', newline='', encoding='utf-8') as file:
        w = csv.writer(file)
        # Creating Headers for csv file
        w.writerow(['Threat_date', 'Title', 'Summary', 'Source', 'Location', 'Threat_level'])
        # grabbing news threat information
        for article in articles:
            title = article.find("div",{"class": "resultblock__info-title"}).get_text().strip()
            src_link = "https://www.rapid7.com" + article.get('href')
            meta = article.find("div", {"class": "resultblock__info-meta"}).get_text().strip()
            # NOTE(review): the fixed slice assumes the date sits at characters 11-29 of
            # the meta text (after a label such as "Published: ") - confirm against the page.
            dateS = meta[11:30].rstrip()
            # Severity is the first whitespace-separated token after "Severity:"
            lvl = int(meta[meta.find("Severity:")+9:].split()[0])

            # Relative stamps ("Today", "... ago") are recorded as today's date
            if "Today" in dateS or "ago" in dateS:
                d = date.today()
            else:
                d = datetime.strptime(dateS, '%B %d, %Y').date()
            # One row layout for every entry, matching the header. Recent entries used
            # to be written with shifted columns and without the parsed severity.
            w.writerow([d, title, "", src_link, "", lvl])
                

# Main program

In [None]:
# Get list of URLs
urls = getUrls()

# Every scraper writes its CSV into data/, so make sure the folder exists first
os.makedirs('data', exist_ok=True)

# Loop through urls and call functions to scrape data and identify entities using ML
for c, url in enumerate(urls, start=1):  # c counts the pages handled so far
    pHtml = urlReq(url)
    # The scraper is picked by the site name that appears in the address
    if 'cyber.gov.au' in url:
        cga(pHtml)
    if 'smh' in url:
        smh(pHtml, url)
    if '9news' in url:
        nine(pHtml, url)
    if 'rapid7' in url:
        rapid(pHtml, url)
    # Replace the previous progress line with the updated percentage
    clear_output(wait=True)
    print((str(c/len(urls)*100) +  "%"))