# *Identification Module - Web Parsing*
This is the Identification Module. It pulls data from selected resources on the internet and writes it to CSV files in the `data` folder.

The data assessment module then uses this data for assessment and produces JSON output.

**NOTE**: Some sources list threats that have already been assessed, so these give an output directly, skipping the assessment stage.

In [None]:
# Library imports
import csv
import tweepy
import re
import json
import bs4
import pandas as pd
import requests
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
from datetime import datetime, date

In [2]:
def getUrls():
    """Return the pages to scrape, in the order they are processed.

    Returns:
        A new list of URL strings: the ACSC alerts page, four SMH topic
        pages, two 9News pages and the first three Rapid7 result pages.
    """
    rapid7_search = 'https://www.rapid7.com/db/?q=&type=nexpose'
    return [
        # Australian Cyber Security Centre alerts
        'https://www.cyber.gov.au/acsc/view-all-content/alerts',
        # Sydney Morning Herald topic pages
        'https://www.smh.com.au/topic/nsw-police-jdi',
        'https://www.smh.com.au/topic/sydney-crime-62n',
        'https://www.smh.com.au/topic/crime-5w4',
        'https://www.smh.com.au/topic/australian-federal-police-jnt',
        # 9News sections
        'https://www.9news.com.au/cyber-security',
        'https://www.9news.com.au/security',
        # Rapid7 vulnerability database, result pages 1-3
        rapid7_search,
        rapid7_search + '&page=2',
        rapid7_search + '&page=3',
    ]

In [3]:
def urlReq(my_url):
    """Download ``my_url`` and return the page parsed with ``html.parser``.

    Args:
        my_url: Address of the page to fetch.

    Returns:
        The parsed document (a BeautifulSoup object).

    Raises:
        Whatever ``urlopen`` raises when the page cannot be fetched
        (e.g. ``urllib.error.URLError``).
    """
    # The with-block closes the connection even if read() raises, which the
    # previous explicit close() call did not guarantee.
    with uReq(my_url) as uClient:
        p_html = uClient.read()
    return soup(p_html, "html.parser")

In [4]:
def cga(parsed_html):
    """Write the alerts on a cyber.gov.au listing page to ``data/cyber_gov_au.csv``.

    For every alert row the linked detail page is downloaded as well, because
    the threat level is read from that page rather than from the listing.

    Args:
        parsed_html: Parsed listing page (BeautifulSoup object).

    Raises:
        AttributeError: If an expected element is missing from a row or from
            a detail page (``find`` returns ``None``).
        ValueError: If an alert date does not match ``'%d %b %Y'``.
    """
    # Split the page into rows and drop the first one, which is not required.
    # Slicing (instead of ``del articles[0]``) also copes with a page that has
    # no rows at all: only the header is written in that case.
    articles = parsed_html.findAll("div", {"class": "views-row"})[1:]

    # newline='' is what the csv module expects; without it extra blank lines
    # can appear between rows on platforms that translate '\n'.
    with open('data/cyber_gov_au.csv', 'w', newline='') as file:
        w = csv.writer(file)
        # Header row
        w.writerow(['Threat_date', 'Title', 'Threat_level', 'Summary', 'Source', 'Location'])
        for article in articles:
            dateS = article.find("p", {"class": "acsc-date"}).get_text()
            title = article.find("p", {"class": "acsc-title"}).get_text()
            summary = article.find("p", {"class": "acsc-summary"}).get_text()
            src_link = "https://www.cyber.gov.au" + article.a['href']

            # Strip surrounding whitespace so the parse does not depend on the
            # date text ending in exactly the whitespace the format expected.
            d = datetime.strptime(dateS.strip(), '%d %b %Y')

            # Fetch the detail page; the with-block closes the connection even
            # when reading fails.
            with uReq(src_link) as uClient:
                detailed_page = uClient.read()
            dp_parsed = soup(detailed_page, "html.parser")
            Threat = dp_parsed.find("div", {"class": "field field--name-field-alert-status field--type-entity-reference field--label-inline"})
            lvl = Threat.find("div", {"class": "field__item"}).get_text()

            # Commas in the summary are replaced with semicolons before writing.
            summary = summary.replace(",", ";")
            w.writerow([d.date(), title, lvl, summary, src_link, "-"])


In [5]:
def smh(parsed_html, url):
    """Write the articles on an SMH topic page to ``data/smh_<topic>.csv``.

    ``<topic>`` is the last path segment of ``url`` with its runs of word
    characters joined by underscores (``nsw-police-jdi`` -> ``nsw_police_jdi``).
    Threat_level is always written as 0 and Location as "-".

    Args:
        parsed_html: Parsed topic page (BeautifulSoup object).
        url: Address the page was fetched from; used only to name the file.

    Raises:
        AttributeError: If an article lacks one of the expected elements.
        ValueError: If a date is neither "Today" nor in ``'%B %d, %Y'`` form.
    """
    articles = parsed_html.findAll("div", {"class": "_2g9tm"})
    fname = "smh_" + ('_'.join(re.findall(r"(\w+)", url.rsplit('/', 1)[-1])))

    # newline='' is what the csv module expects; without it extra blank lines
    # can appear between rows on platforms that translate '\n'.
    with open('data/%s.csv' % (fname), 'w', newline='') as file:
        w = csv.writer(file)
        # Header row
        w.writerow(['Threat_date', 'Title', 'Threat_level', 'Summary', 'Source', 'Location'])

        for article in articles:
            # The headline anchor supplies both the title and the article link.
            link = article.find("a", {"data-test": "article-link"})
            title = link.get_text()
            summary = article.find("p", {"class": "_3b7W- _3XEsE"}).get_text()
            src_link = "https://www.smh.com.au" + link.get('href')
            dateS = article.find("time", {"class": "_2_zR-"}).get_text()

            if "Today" in dateS:
                threat_date = date.today()
            else:
                threat_date = datetime.strptime(dateS.strip(), '%B %d, %Y').date()

            # Source is the article's own link for every row. Previously rows
            # dated "Today" recorded the topic page URL instead, which was
            # inconsistent with all other rows.
            w.writerow([threat_date, title, 0, summary, src_link, "-"])


In [6]:
def nine(parsed_html, url):
    """Write the stories on a 9News section page to ``data/9news_<section>.csv``.

    ``<section>`` is the last path segment of ``url`` with its runs of word
    characters joined by underscores (``cyber-security`` -> ``cyber_security``).
    Threat_level is always written as 0 and Location as "-".

    Args:
        parsed_html: Parsed section page (BeautifulSoup object).
        url: Address the page was fetched from; used only to name the file.

    Raises:
        AttributeError: If a story lacks one of the expected elements.
        ValueError: If a timestamp is not relative ("Today"/"ago") and does
            not match ``'%I:%M%p %b %d, %Y'``.
    """
    feed = parsed_html.find("div", {"data-feed": "default"})
    # find() returns None when the feed container is absent. Treat that like a
    # page without stories (header-only file) instead of failing with an
    # AttributeError before anything is written.
    articles = feed.findAll("div", {"class": "story__details"}) if feed is not None else []
    fname = "9news_" + ('_'.join(re.findall(r"(\w+)", url.rsplit('/', 1)[-1])))

    # newline='' is what the csv module expects; without it extra blank lines
    # can appear between rows on platforms that translate '\n'.
    with open('data/%s.csv' % (fname), 'w', newline='') as file:
        w = csv.writer(file)
        # Header row
        w.writerow(['Threat_date', 'Title', 'Threat_level', 'Summary', 'Source', 'Location'])

        for article in articles:
            # Skip entries that contain an advertisement widget.
            if article.find("div", {"class": "widget widget-ad feed__ad"}):
                continue

            title = article.find("span", {"class": "story__headline__text"}).get_text()
            summary = article.find("div", {"class": "story__abstract"}).get_text()
            src_link = article.a['href']
            dateS = article.find("time", {"class": "story__time"}).get_text()

            # Relative timestamps ("Today", "... ago") are recorded as today.
            if "Today" in dateS or "ago" in dateS:
                threat_date = date.today()
            else:
                threat_date = datetime.strptime(dateS.strip(), '%I:%M%p %b %d, %Y').date()

            # Source is the story's own link for every row. Previously rows
            # with a relative timestamp recorded the section page URL instead,
            # which was inconsistent with all other rows.
            w.writerow([threat_date, title, 0, summary, src_link, "-"])


In [7]:
def rapid(parsed_html, url):
    """Write the entries on a Rapid7 database result page to ``data/rapid7_<suffix>.csv``.

    ``<suffix>`` is built from whatever follows the last ``nexpose`` in
    ``url``: empty for the first page, ``page_2`` for ``...&page=2``.
    Summary and Location are always written as "-".

    Args:
        parsed_html: Parsed result page (BeautifulSoup object).
        url: Address the page was fetched from; used only to name the file.

    Raises:
        AttributeError: If an entry lacks one of the expected elements.
        IndexError: If the meta text has no value after "Severity:".
        ValueError: If the severity is not an integer, or the date is neither
            "Today" nor in ``'%B %d, %Y'`` form.
    """
    articles = parsed_html.findAll("a", {"class": "vulndb__result resultblock"})
    fname = "rapid7_" + ('_'.join(re.findall(r"(\w+)", url.rsplit("nexpose", 1)[-1])))

    # newline='' is what the csv module expects; without it extra blank lines
    # can appear between rows on platforms that translate '\n'.
    with open('data/%s.csv' % (fname), 'w', newline='') as file:
        w = csv.writer(file)
        # Header row
        w.writerow(['Threat_date', 'Title', 'Threat_level', 'Summary', 'Source', 'Location'])

        for article in articles:
            title = article.find("div", {"class": "resultblock__info-title"}).get_text()
            src_link = "https://www.rapid7.com" + article.get('href')
            meta = article.find("div", {"class": "resultblock__info-meta"}).get_text().strip()

            # Fixed-position slice for the date.
            # NOTE(review): presumably skips an 11-character label such as
            # "Published: " and relies on whitespace padding after the date;
            # confirm against the live markup.
            dateS = meta[11:30].rstrip()

            # First token after the "Severity:" label. partition() yields an
            # empty remainder when the label is missing, so the failure is
            # explicit rather than a parse of unrelated text.
            lvl = int(meta.partition("Severity:")[2].split()[0])

            if "Today" in dateS:
                threat_date = date.today()
            else:
                threat_date = datetime.strptime(dateS, '%B %d, %Y').date()

            # Every row records the parsed severity and the entry's own link.
            # Previously rows dated "Today" wrote 0 and the listing URL,
            # discarding both values computed above.
            w.writerow([threat_date, title, lvl, "-", src_link, "-"])
                

In [8]:
# Main program: fetch every configured page and pass it to each scraper whose
# site marker occurs in the URL, then print the URL once it has been handled.
site_scrapers = (
    # cga() takes only the parsed page, so adapt it to the common call shape.
    ('cyber.gov.au', lambda page, link: cga(page)),
    ('smh', smh),
    ('9news', nine),
    ('rapid7', rapid),
)

urls = getUrls()
for url in urls:
    pHtml = urlReq(url)
    # Markers are checked independently, in this order; they are not exclusive.
    for site_marker, scraper in site_scrapers:
        if site_marker in url:
            scraper(pHtml, url)
    print(url)

https://www.cyber.gov.au/acsc/view-all-content/alerts
https://www.smh.com.au/topic/nsw-police-jdi
https://www.smh.com.au/topic/sydney-crime-62n
https://www.smh.com.au/topic/crime-5w4
https://www.smh.com.au/topic/australian-federal-police-jnt
https://www.9news.com.au/cyber-security
https://www.9news.com.au/security
https://www.rapid7.com/db/?q=&type=nexpose
https://www.rapid7.com/db/?q=&type=nexpose&page=2
https://www.rapid7.com/db/?q=&type=nexpose&page=3
