In [3]:
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode

In [2]:
# Landing page of the database and the site root. `base` is prepended to the
# hrefs collected from the pages to build full URLs.
db = "https://asn.flightsafety.org/database/"
base = "https://asn.flightsafety.org"
# Browser-like headers sent with every request made in this notebook.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
}
# Fetch the landing page. Without a timeout `requests` can block forever on a
# stalled connection, and without the status check an HTTP error page would be
# parsed as if it were the real index.
pg = requests.get(db, headers=headers, timeout=30)
pg.raise_for_status()
soup = BeautifulSoup(pg.content, "lxml")

In [None]:
#Extract all URLs from the database
# Keep only anchors that carry an href mentioning "year" (case-insensitive),
# each prefixed with the site root.
struct = soup.find_all('a')
outer = [
    base + anchor['href']
    for anchor in struct
    if anchor.has_attr('href') and 'year' in anchor['href'].lower()
]
# Discard the first collected link; the popped URL is echoed as the cell output.
outer.pop(0)

In [5]:
#Get inner page URLs
inner = []
for url in outer:
    # Every URL in `outer` is itself a page to crawl; any pagination links
    # found on it are appended right after it.
    inner.append(url)
    # The timeout keeps one stalled connection from hanging the whole crawl.
    pg = requests.get(url, headers=headers, timeout=30)
    sp = BeautifulSoup(pg.content, "lxml")
    pgnum = sp.find('div', class_='pagenumbers')
    if pgnum:
        # href=True skips anchors without an href attribute, which would
        # otherwise raise KeyError and abort the loop.
        inner.extend(base + link['href'] for link in pgnum.find_all('a', href=True))


In [None]:
#Obtain the URL of every incident listed in the result tables
incident_urls = []
for page_url in inner:  # renamed from `dir`, which shadowed the builtin
    # The timeout keeps one stalled connection from hanging the whole crawl.
    pg = requests.get(page_url, headers=headers, timeout=30)
    sp = BeautifulSoup(pg.content, "lxml")
    tab = sp.find('table', class_='hp')
    if not tab:
        continue
    for row in tab.find_all('tr'):
        cells = row.find_all('td')
        if not cells:
            # Rows without <td> cells carry no incident link.
            continue
        # The incident link is taken from the first cell; rows whose first
        # cell has no usable anchor are skipped instead of raising TypeError.
        link = cells[0].find('a', href=True)
        if link is not None:
            incident_urls.append(base + link['href'])
# Report the count rather than dumping the full list into the cell output.
print(f"Collected {len(incident_urls)} incident URLs")

In [47]:
#Write URLs to txt file for re-use
# One URL per line, so the list can be reloaded later without re-crawling.
with open("incident_urls.txt", "w") as f:
    f.writelines(url + "\n" for url in incident_urls)

In [None]:
#Scrape the pages with the incident information.
def scrape_incident(url):
    """Fetch one incident page and return its fields as a flat dict.

    Each row of the first <table> on the page that has at least two <td>
    cells becomes one entry: the first cell's text with ':' removed is the
    key, the second cell's text is the value. The 'Fatalities:' row is split
    into separate 'Fatalities' and 'Occupants' entries. The text of the last
    <span> on the page is stored under 'Narrative'.

    Args:
        url: Address of the incident page to scrape.

    Returns:
        dict mapping field names to the scraped text. 'Occupants' and
        'Narrative' are None when they cannot be extracted.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    incident = {}
    print(f'Visiting: {url}')
    # The timeout stops a stalled connection from blocking a worker forever.
    pg = requests.get(url, headers=headers, timeout=30)
    sp = BeautifulSoup(pg.content, "lxml")
    table = sp.find('table')
    if table:
        for row in table.find_all('tr'):
            val = [cell.get_text(strip=True) for cell in row.find_all('td')]
            # Rows with fewer than two <td> cells (e.g. header or spacer rows)
            # hold no label/value pair; indexing them would raise IndexError
            # and discard the whole incident.
            if len(val) < 2:
                continue
            label, value = val[0], val[1]
            if label == 'Fatalities:':
                # The value is split on '/' and each half on ':' to separate
                # the fatality and occupant figures.
                try:
                    fat_part, occ_part = value.split('/')[:2]
                    fat = fat_part.split(':')[1].strip()
                    occ = occ_part.split(':')[1].strip()
                except (IndexError, ValueError):
                    # Unexpected layout: keep the raw text rather than losing
                    # the record; it can be cleaned up afterwards.
                    fat, occ = value, None
                incident['Fatalities'] = fat
                incident['Occupants'] = occ
            else:
                incident[label.replace(":", "")] = value
    # The narrative is read from the last <span>; pages without any <span>
    # get None instead of raising IndexError.
    spans = sp.find_all('span')
    incident['Narrative'] = spans[-1].get_text(strip=True) if spans else None
    print(incident)
    return incident

# Reload the saved incident URLs, ignoring blank lines so they are not
# submitted as (failing) requests.
with open("incident_urls.txt", "r") as f:
    urls = [line.strip() for line in f if line.strip()]

# Scrape the incident pages concurrently. Results are collected in completion
# order, not in the order of `urls`.
dataset = []
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(scrape_incident, url) for url in urls]
    try:
        for future in as_completed(futures):
            try:
                result = future.result()
                dataset.append(result)
            except Exception as e:
                # A failure on one page must not abort the whole run.
                print(f"Error: {e}")
    except KeyboardInterrupt:
        print("Keyboard interrupt received. Stopping threads...")
        # The interrupt is handled inside the `with` so the queued work can be
        # cancelled before the executor shuts down; otherwise leaving the
        # block would wait for every remaining URL to be scraped. Requests
        # already running still finish.
        for future in futures:
            future.cancel()

# Whatever was scraped (all of it, or the part finished before an interrupt).
# Relies on the top-of-notebook `import pandas as pd`.
df = pd.DataFrame(dataset)

In [7]:
# Persist the raw scraped table; the DataFrame index is written as the first column.
df.to_csv('aviation_accidents.csv', index=True)

In [62]:
#Data cleaning
import pandas as pd
import numpy as np
from unidecode import unidecode
import re

# Reload the raw scrape from disk, using the first CSV column as the index.
df = pd.read_csv('aviation_accidents.csv', index_col=0)

In [63]:
# Short snake_case column names for the scraped table. They are assigned
# positionally, so their order must match the column order of the loaded CSV.
keys = [
    'date', 'time', 'type', 'operator', 'registration', 'msn',
    'yr_manufacture', 'engine', 'fatal', 'occup', 'other_fatal', 'damage',
    'category', 'location', 'phase', 'nature', 'dep', 'dest',
    'confid', 'narrative', 'tot_airframe_hrs', 'cycles', 'inv_agency',
]


In [64]:
#New column headers
# Apply the short names positionally, then discard the 'time' column.
df = df.set_axis(keys, axis=1).drop(columns='time')

In [65]:
#Transliterate non-ASCII characters in string cells to ASCII equivalents;
#non-string cells are passed through unchanged.
df = df.map(lambda cell: cell if not isinstance(cell, str) else unidecode(cell))

In [66]:
def fix_date(val):
    """Convert a scraped date value to a pandas timestamp.

    The value is first parsed as "<weekday> <day> <month name> <year>"
    (format "%A %d %B %Y"). If that fails, the first standalone four-digit
    number in the text is taken as the year and parsed with "%Y".

    Args:
        val: Raw date value (usually a string; may be missing).

    Returns:
        The parsed timestamp, or pd.NaT when no four-digit year is found.
    """
    try:
        return pd.to_datetime(val, format="%A %d %B %Y")
    except Exception:
        # Not a full date in the expected layout: fall through to the
        # year-only recovery below.
        pass

    year = re.search(r"\b(\d{4})\b", str(val))
    if year is None:
        return pd.NaT
    return pd.to_datetime(year.group(1), format="%Y")
        
# Parse the date strings and store manufacture years with the nullable
# integer dtype 'Int64'.
df = df.assign(
    date=df['date'].apply(fix_date),
    yr_manufacture=df['yr_manufacture'].astype('Int64'),
)

In [67]:
# Treat cells whose value is exactly '-' as missing.
df = df.replace('-', np.nan)

In [None]:
# Show the cleaned DataFrame as this cell's output.
df

In [69]:
# Save the cleaned table; the DataFrame index is written as the first column.
df.to_csv('aviation.csv', index=True)