# Scrape Web AIS Data to Test Model

This notebook is a scraper that collects AIS data from the web. AIS data is available from several websites, and some even offer API access, but the data is quite expensive. We only need a small portion of data for the project, so we use a small scraper to obtain enough current data on a single vessel to test our model. The scraper is run locally and outputs a CSV file.

The scraped data then needs to be joined with 4 other external datasets to add additional features, and preprocessed before it can be fed into our model for prediction.

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from datetime import datetime, timedelta
import time


In [None]:
# DEPRECATED: MarineTraffic changed its iframe layout, so this flow no longer works as-is.
# It is kept for reference only; the vesselfinder cell below is the working data source.
#
# Uses selenium to log in to marinetraffic and navigate to the page showing the vessel's
# AIS events. Replace the EMAIL_USER / PASSWORD_USER placeholders with your own credentials.


def _wait_until_visible(driver, locator, timeout=20, timeout_message="whatever...continue!"):
    """Wait until the element at `locator` is visible or `timeout` seconds elapse.

    A timeout is only reported, never raised, so the flow continues best-effort.
    """
    try:
        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located(locator))
        print("Page is ready!")
    except TimeoutException:
        print(timeout_message)


driver = webdriver.Chrome()
driver.get(
    "https://www.marinetraffic.com/en/data/?asset_type=vessels"
    "&columns=flag,shipname,photo,recognized_next_port,reported_eta,reported_destination,"
    "current_port,imo,ship_type,show_on_live_map,time_of_latest_position,"
    "lat_of_latest_position,lon_of_latest_position,notes"
)

# 'css-flk0bs' is a class name, so it must be located by class; as a bare CSS selector it
# would be read as a tag name and the wait would always run into the timeout.
_wait_until_visible(driver, (By.CLASS_NAME, 'css-flk0bs'))
driver.find_element(By.CLASS_NAME, 'css-flk0bs').click()
driver.find_element(By.CLASS_NAME, 'e2e_header_sign_in_button').click()

# Fill in and submit the login form.
_wait_until_visible(driver, (By.ID, 'email'))
username = driver.find_element(By.ID, "email")
password = driver.find_element(By.ID, "password")
username.send_keys("EMAIL_USER")
password.send_keys("PASSWORD_USER")
driver.find_element(By.ID, "login_form_submit").click()

# Open the user menu, then the "my fleets" entry.
_wait_until_visible(driver, (By.ID, 'user-logggin'))
driver.find_element(By.ID, "user-logggin").click()

_wait_until_visible(driver, (By.ID, 'nw_my_fleets'))
driver.find_element(By.ID, "nw_my_fleets").click()

# 'dummywait' is a placeholder id used purely as a delay while the iframe loads
# (presumably it never exists on the page, so each wait runs for the full timeout).
_wait_until_visible(driver, (By.ID, 'dummywait'))
driver.find_element(By.XPATH, '//a[contains(text(), "Fishing")]').click()

_wait_until_visible(driver, (By.ID, 'dummywait'))
driver.find_element(By.XPATH, '//a[contains(text(), "SUNDEROEY")]').click()

_wait_until_visible(driver, (By.ID, 'viewVesselEventsList'), timeout_message="whatever...continue last!")

# Snapshot of the fully rendered page for later parsing.
html_page = driver.find_element(By.XPATH, "//body").get_attribute('outerHTML')
# deprecated!!

In [21]:
# Fetch the vessel's detail page from vesselfinder and collect its parameter tables
# into `data` as {label: value}. (Much lower resolution than marinetraffic.)

vessel_name = 'SUNDEROEY'
vessel_IMO = '9294903'
vessel_mmsi = '316042032'

session = requests.Session()
web_page = session.get(
    "https://www.vesselfinder.com/vessels/" + vessel_name
    + "-IMO-" + vessel_IMO + "-MMSI-" + vessel_mmsi,
    # A browser-like User-Agent is sent with the request.
    headers={'User-Agent': 'Mozilla/5.0'},
    # Without a timeout a stalled connection would hang the notebook indefinitely.
    timeout=30,
)
# Fail here on an HTTP error instead of parsing an error page into an empty `data`.
web_page.raise_for_status()

data = {}

soup = BeautifulSoup(web_page.content, 'html.parser')
for section in soup.find_all("section", {"class": ["ship-section"]}):
    for table in section.find_all("table", {"class": ["tparams"]}):
        for row in table.find_all('tr'):
            cols = [cell.text.strip() for cell in row.find_all('td')]
            # Keep only label/value rows whose value is not blank or a '-' placeholder.
            if len(cols) > 1 and cols[1] not in ['-', '']:
                data[cols[0]] = cols[1]


In [22]:
# Split the scraped display strings in `data` into individual features:
#   course, speed, mmsi -> str;  lon, lat -> signed float degrees;
#   time -> age of the position report in whole minutes (int).


def _signed_degrees(text, negative_hemisphere):
    """Convert a coordinate such as '12.5 S' to a float, negated for `negative_hemisphere`."""
    value = float(text.rstrip(' NSEW'))
    return value * -1 if negative_hemisphere in text else value


# Age units recognised in 'Position received', expressed in minutes.
_MINUTES_PER_UNIT = {
    'min': 1, 'mins': 1, 'minute': 1, 'minutes': 1,
    'hour': 60, 'hours': 60,
    'day': 1440, 'days': 1440,
}

course_speed = data['Course / Speed'].split('/')
course = course_speed[0].rstrip('° ')
speed = course_speed[1].rstrip(' kn').lstrip()

# 'Coordinates' is split as '<lat>/<lon>'; west and south become negative.
coordinates = data['Coordinates'].split('/')
lon = _signed_degrees(coordinates[1], 'W')
lat = _signed_degrees(coordinates[0], 'S')

mmsi = data['IMO / MMSI'].split('/')[1].strip()

# NOTE: the name `time` shadows the imported `time` module; it is kept because the
# save cell reads the report age from this variable.
received = data['Position received']
tokens = received.split()
if len(tokens) >= 2 and tokens[0].isdigit() and tokens[1] in _MINUTES_PER_UNIT:
    time = int(tokens[0]) * _MINUTES_PER_UNIT[tokens[1]]
else:
    # Leaving a raw string here would only fail later inside timedelta(minutes=...).
    raise ValueError("unrecognised 'Position received' format: " + repr(received))

In [23]:
# Load the most recently saved row of the target vessel's csv so the new reading
# can be compared against it.
# Rows are written as: mmsi,speed,course,lon,lat,timestamp (see the save cell), so the
# previous values are read from those same positions.

# Defaults used when there is no usable previous row.
prev_speed = '0'
prev_course = '0'
prev_lat = 0
prev_lon = 0

try:
    with open('model_application_SUNDEROEY.csv', "r") as f1:
        last_line = f1.readlines()[-1]
except OSError:
    print('file not found')
except IndexError:
    # The file exists but has no lines.
    print('previous data file is empty')
else:
    fields = last_line.rstrip('\n').split(',')
    if len(fields) >= 5:
        prev_speed, prev_course, prev_lon, prev_lat = fields[1:5]
    else:
        print('last row of previous data file is malformed')

In [24]:
# Append the new reading to the csv only when it differs from the last saved row.
# Row layout: mmsi,speed,course,lon,lat,timestamp (timestamp = whole seconds since the epoch).

unchanged = (
    prev_speed == speed
    and prev_course == course
    and prev_lat == str(lat)
    and prev_lon == str(lon)
)

if unchanged:
    print('data did not change')
else:
    # `time` is the age of the position report in minutes, so subtract it from "now".
    received_at = datetime.now() - timedelta(minutes=time)
    # .timestamp() interprets the naive value as local time and returns true epoch
    # seconds; subtracting datetime(1970, 1, 1) would be off by the local UTC offset.
    timestamp = received_at.timestamp()
    line = ','.join([mmsi, speed, course, str(lon), str(lat), str(int(timestamp))])
    with open('model_application_SUNDEROEY.csv', 'a') as f:
        f.write(line + "\n")