In [None]:
import requests
import random
from bs4 import BeautifulSoup
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import time
import pandas as pd

In [None]:
def get_suburl(url):
    """Collect the pagination links of a listing page.

    The page is requested with a randomly chosen User-Agent, a 10 second
    timeout and a ``Retry(total=3, backoff_factor=1)`` policy for HTTP
    500/502/503/504. Every ``<a href=...>`` inside the first
    ``<div class="pagenumbers">`` is returned, prefixed with
    ``https://asn.flightsafety.org``.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        list[str]: Prefixed link targets in document order. Empty when the
        request fails or the page has no ``pagenumbers`` div.
    """
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"]
    headers = {"User-Agent": random.choice(user_agents)}
    retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])

    # The context manager closes the session's pooled connections; the
    # original created a new Session per call and never released it.
    with requests.Session() as session:
        session.mount("http://", HTTPAdapter(max_retries=retries))
        session.mount("https://", HTTPAdapter(max_retries=retries))
        try:
            r = session.get(url, headers=headers, timeout=10)
            r.raise_for_status()
        except requests.RequestException as e:
            # Previously this printed `status_code`, which is never bound when
            # the request fails, so the handler itself raised UnboundLocalError.
            print('请求失败', e)
            return []

    soup = BeautifulSoup(r.text, 'html.parser')
    page = soup.find('div', attrs={'class': 'pagenumbers'})
    if page is None:
        # No pagination block on this page (presumably a single-page listing).
        return []

    mainurl_str = 'https://asn.flightsafety.org'
    # href=True skips anchors without a target instead of raising KeyError.
    # NOTE(review): hrefs are assumed to be root-relative ("/database/...").
    return [mainurl_str + a['href'] for a in page.find_all('a', href=True)]

In [None]:
def get_urllist(url):
    """Return detail-page URLs of listed accidents that have a final report.

    The page is requested with a randomly chosen User-Agent, a 10 second
    timeout and a ``Retry(total=3, backoff_factor=1)`` policy for HTTP
    500/502/503/504. In the first ``<table>`` of the page, a row is selected
    when its ninth ``<td>`` (index 8) holds an ``<img>`` whose ``title`` is
    exactly ``'contains link to final investigation report'``. For each
    selected row the first ``<a href=...>`` is returned, prefixed with
    ``https://asn.flightsafety.org``.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        list[str]: Prefixed detail-page URLs in table order. Empty when the
        request fails or the page contains no table.
    """
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"]
    headers = {"User-Agent": random.choice(user_agents)}
    retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])

    # The context manager closes the session's pooled connections.
    with requests.Session() as session:
        session.mount("http://", HTTPAdapter(max_retries=retries))
        session.mount("https://", HTTPAdapter(max_retries=retries))
        try:
            r = session.get(url, headers=headers, timeout=10)
            r.raise_for_status()
        except requests.RequestException as e:
            # The original only printed and then fell through to `r.text`
            # with `r` unbound; stop here with an empty result instead.
            print('请求失败', e)
            return []

    soup = BeautifulSoup(r.text, 'html.parser')
    table = soup.table
    if table is None:
        return []

    mainurl_str = 'https://asn.flightsafety.org'
    report_title = 'contains link to final investigation report'
    url_list = []
    # Walk real <tr> tags rather than `table.children`: children also yields
    # whitespace text nodes, which have no find_all(). Rows with fewer than
    # nine <td> cells are skipped by the length check, replacing the original
    # "skip the first two children" rule.
    for tr in table.find_all('tr'):
        cells = tr.find_all('td')
        if len(cells) < 9:
            continue
        img = cells[8].img
        if img is None or img.get('title') != report_title:
            continue
        link = tr.find('a', href=True)
        if link is not None:
            url_list.append(mainurl_str + link['href'])
    return url_list


In [None]:
def get_content(url):
    """Scrape one accident detail page into a flat dict of text fields.

    The page is requested with a randomly chosen User-Agent, a 10 second
    timeout and a ``Retry(total=3, backoff_factor=1)`` policy for HTTP
    500/502/503/504. Each row of the first ``<table>`` is read as a
    (label cell, value cell) pair, and the text of the first
    ``<span lang="en-US">`` is stored under ``'Narrative'``. Newlines, tabs
    and non-breaking spaces are removed from every stored value.

    Args:
        url: Address of the accident detail page.

    Returns:
        dict[str, str] | None: A dict that always contains the 22 template
        keys (``''`` where the page gave no value); labels that match no
        template key are kept under the label minus its last character, as
        before. ``None`` when the request fails or the page has no table.
    """
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"]
    headers = {"User-Agent": random.choice(user_agents)}
    retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])

    # The context manager closes the session's pooled connections.
    with requests.Session() as session:
        session.mount("http://", HTTPAdapter(max_retries=retries))
        session.mount("https://", HTTPAdapter(max_retries=retries))
        try:
            r = session.get(url, headers=headers, timeout=10)
            r.raise_for_status()
        except requests.RequestException as e:
            print('请求失败', e)
            return None

    soup = BeautifulSoup(r.text, 'html.parser')
    table = soup.table
    if table is None:
        # Treated like a failed request so the caller's None check handles it.
        print('请求失败', 'no <table> in', url)
        return None

    def _strip_layout(text):
        # Remove layout characters only; ordinary spaces are kept.
        return text.replace("\n", "").replace("\t", "").replace("\xa0", "")

    def _label(text):
        # Canonical label form: no surrounding whitespace, no trailing colon.
        return text.strip().rstrip(':').strip()

    fields = ['Date',
              'Time',
              'Type',
              'Owner/operator',
              'Registration',
              'MSN',
              'Year of manufacture',
              'Total airframe hrs',
              'Cycles',
              'Engine model',
              'Fatalities',
              'Other fatalities',
              'Aircraft damage',
              'Category',
              'Location',
              'Phase',
              'Nature',
              'Departure airport',
              'Destination airport',
              'Investigating agency:',
              'Confidence Rating',
              'Narrative']
    accident_dict = dict.fromkeys(fields, '')
    # Canonical label -> template key, so "Date:", "Date: " and
    # "Investigating agency:" all land on their template key. The original
    # dropped the last character blindly (key[:-1]).
    known = {_label(name): name for name in fields}

    for tr in table.find_all('tr'):
        label_cell = tr.td
        if label_cell is None:
            continue  # header/spacer row without <td>
        # find_next_sibling skips whitespace text nodes that a plain
        # .next_sibling would return in place of the value cell.
        value_cell = label_cell.find_next_sibling('td')
        if value_cell is None:
            continue  # single-cell row: nothing to record
        raw_key = label_cell.text
        key = known.get(_label(raw_key), raw_key[:-1])
        if not key:
            continue  # an empty label would only add a junk '' key
        accident_dict[key] = _strip_layout(value_cell.text)

    span = soup.find('span', attrs={'lang': 'en-US'})
    if span is not None:
        accident_dict['Narrative'] = _strip_layout(span.text)
    return accident_dict

In [None]:
def get_finalurl(url, start_year=1990, end_year=2023, delay=0.5):
    """Gather accident detail-page URLs across a range of yearly listings.

    For every year, the first listing page (``url + str(year)``) is scanned
    with ``get_urllist``. Each pagination link reported by ``get_suburl`` is
    then scanned the same way, pausing ``delay`` seconds after each of those
    sub-page requests.

    Args:
        url: Base address that the year number is appended to.
        start_year: First year to crawl (inclusive). Defaults to 1990.
        end_year: Last year to crawl (inclusive). Defaults to 2023, which
            reproduces the previously hard-coded ``range(1990, 2024)``.
        delay: Seconds to sleep after each sub-page request. Defaults to 0.5.

    Returns:
        list[str]: Detail-page URLs in discovery order, each listed once.
    """
    final_list = []
    seen = set()

    def _collect(urls):
        # Append unseen URLs only, so an accident reachable from more than
        # one listing page is not scraped (and exported) twice.
        for item in urls:
            if item not in seen:
                seen.add(item)
                final_list.append(item)

    for year in range(start_year, end_year + 1):
        year_url = url + str(year)
        _collect(get_urllist(year_url))
        for u in get_suburl(year_url):
            page_urls = get_urllist(u)
            time.sleep(delay)
            _collect(page_urls)
    return final_list

In [None]:
# Base address of the per-year listing pages; get_finalurl appends the year to it.
url='https://asn.flightsafety.org/database/year/'
# Crawl the yearly listings over the network and keep the detail-page URLs
# that get_finalurl returns. This is the slow step of the notebook.
fina_list=get_finalurl(url)

In [None]:
# Display how many detail-page URLs are held in fina_list.
len(fina_list)

In [None]:
from tqdm import tqdm

# One record (field name -> text) per scraped accident page. Dicts are stored
# instead of positional value lists so that pandas aligns the fields by name
# when the frame is built with `columns=`. An unexpected extra key on one
# page can then no longer shift or break the columns.
dict_list = []
# 1-based position in fina_list of the URL being processed; after an early
# stop it identifies the URL whose download failed.
point = 0
for point, u in enumerate(tqdm(fina_list), start=1):
    content_dict = get_content(u)
    time.sleep(0.5)  # fixed pause between page requests
    if content_dict is None:
        # get_content signals a failed download with None: report where the
        # run stopped and keep what was collected so far.
        print(point)
        break
    dict_list.append(content_dict)


In [None]:
# Column order of the exported sheet; the names match the field keys of the
# dict returned by get_content.
columns = ['Date',
           'Time',
           'Type',
           'Owner/operator',
           'Registration',
           'MSN',
           'Year of manufacture',
           'Total airframe hrs',
           'Cycles',
           'Engine model',
           'Fatalities',
           'Other fatalities',
           'Aircraft damage',
           'Category',
           'Location',
           'Phase',
           'Nature',
           'Departure airport',
           'Destination airport',
           'Investigating agency:',
           'Confidence Rating',
           'Narrative']
df = pd.DataFrame(dict_list, columns=columns)
# index=False is the documented way to omit the row index; the previous
# index=None only worked because None is falsy.
df.to_excel('Accidents_ASN.xlsx', index=False)