In [219]:
import requests,re
from unidecode import unidecode
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [220]:
url="https://en.wikipedia.org/wiki/List_of_deadliest_aircraft_accidents_and_incidents"
pg = requests.get(url)
soup = BeautifulSoup(pg.content, "lxml")

In [221]:
#Extracting airlines from the Incidence Column
def extract_airlines(incident):
    if 'Flight' in incident:
        matches = re.findall(r'([A-Za-z\s&\.\-]+?)\s*Flight', incident)
        airlines = [m.strip() for m in matches]
        return ' '.join(airlines) if airlines else incident
    else:
        return incident

In [222]:
#Obtaining the table of keys 
df_keys = []
tb = soup.find_all('td', class_='col-break')
for i, td in enumerate(tb):
    table = td.find('table', class_='wikitable')
    if table:
        rows = table.find_all('tr')
        for row in rows:
            cells = row.find_all(['th', 'td'])
            text = [cell.get_text(strip=True).replace('\xa0', ' ') for cell in cells]
            if len(text) > 2:
                text.pop(0)
            df_keys.append(text)
key = pd.DataFrame(df_keys)
key.columns = key.iloc[0]
key = key.drop(index=[0, 5, 6, 7, 8, 15]).reset_index(drop=True)      
key

Unnamed: 0,Abbreviation,Definition
0,Tot,Total
1,C,Crew
2,P,Passenger
3,G,Ground
4,COM,Commercial (accident/incident)
5,MIL,Military (accident/incident)
6,INB,Bombing
7,INH,Hijacking
8,EXG,Attacked using ground-based weapons
9,EXS,Attacked by other aircraft


In [223]:
data_table = soup.find("table", class_='wikitable sortable')
datarows = data_table.find_all('tr')
df = []
for index, row in enumerate(datarows):
    cells = row.find_all(['td','th'])
    rowval = [re.sub(r'\[[^\]]*\]', '',cell.get_text(strip = True)).replace('\xa0', ' ') for cell in cells]
    df.append(rowval)
df[0].pop(0)
header = df[1] + df[0]
df = df[2:-1]
df.insert(0,header)
data = pd.DataFrame(df)
data.columns = data.iloc[0]
data = data.drop(index = 0).reset_index(drop=True)
data 

Unnamed: 0,Tot,C,P,G,N,T,Incident,Aircraft,Location,Phase,Airport,Distance,Date
0,"est. 1,700",11,81,"est. 1,600",†,INH,American Airlines Flight 11,Boeing 767-223ER,"usnewyneNew York City, New York, U.S.",ENR,,,2001-09-11
1,"est. 1,000",9,56,est. 900,†,INH,United Airlines Flight 175,Boeing 767-222,"usnewyneNew York City, New York, U.S.",ENR,,,2001-09-11
2,583,23,560,0,‡,COM,Pan Am Flight 1736 andKLM Flight 4805,Boeing 747-121andBoeing 747-206Bspct,"Tenerife, Spain",TXI/TOF,TFN,,1977-03-27
3,520,15,505,0,,COM,Japan Air Lines Flight 123,Boeing 747SR-46,"juMount Takamagahara,Ueno, Japan",ENR,,,1985-08-12
4,349,33,316,0,†,COM,Saudia Flight 763 andKazakhstan Airlines Fligh...,Boeing 747-168BandIlyushin Il-76TD,"indicCharkhi Dadri, India",ENR,,,1996-11-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,50,4,46,0,†,COM,Capital Airlines Flight 20,Vickers Viscount 745D,"usvirhHoldcroft, Virginia, U.S.",ENR,,,1960-01-18
557,50,5,45,0,†,COM,Scottish Airlines (G-ANSY),Avro York,"maltzŻurrieq, Malta",ENR,,,1956-02-18
558,50,9,41,0,†,COM,Pan Am Flight 202,Boeing 377-10-26,"brsannearSantana do Araguaia, Brazil",ENR,,,1952-04-29
559,50,6,44,0,†,COM,United Air Lines Flight 615,Douglas DC-6B,"uscauUnion City, California, U.S.",APR,OAK,22 km (12 NM),1951-08-24


In [224]:
#Data clearning
data = data.replace('', np.nan)
data['Location'] = data['Location'].str.replace(r'^[a-z]+', '', regex=True)
data = data.replace({'\n': ''}, regex=True)
data = data.replace({r'\s+': ' '}, regex=True)
data = data.applymap(lambda x: x.strip() if isinstance(x, str) else x)
data['Aircraft'] = data['Aircraft'].str.replace(r'\s*and\s*', ' and ', regex=True).str.replace(r'\s+', ' ', regex=True).str.strip()
data['Incident'] = data['Incident'].str.replace(r'\s*and\s*', ' and ', regex=True).str.replace(r'\s+', ' ', regex=True).str.strip()
data['Tot'] = (
    data['Tot']
    .str.replace('est\.', '', regex=True)  
    .str.replace('\+', '', regex=True)      
    .str.replace(',', '', regex=True)       
    .str.strip()
)
data['G'] = (
    data['G']
    .str.replace('est\.', '', regex=True)  
    .str.replace('\+', '', regex=True)      
    .str.replace(',', '', regex=True)       
    .str.strip()
)
#Replace non-UTF characters with ASCII equivalents
data['Incident'] = data['Incident'].apply(lambda x: unidecode(x) if isinstance(x, str) else x)
data['Location'] = data['Location'].apply(lambda x: unidecode(x) if isinstance(x, str) else x)

#Extract Airline from incident column into a separate column
data['Airline'] = data['Incident'].apply(extract_airlines)

#Create dummies for categorical variables
N_dummies = pd.get_dummies(data['N'], dtype = int)
N_dummies = N_dummies.rename(columns={
    '†': 'no_survivor',
    '‡': 'prev_deadliest',
    '1*': 'sole_survivor',
    '‡, †': 'prev_deadliest_no_survivor'
})
#N_dummies = N_dummies.drop(columns=['prev_deadliest_no_survivor']) --> Include line to drop reference column
data = pd.concat([data, N_dummies], axis=1)
data = data.drop(columns=['N'])

T_dummies = pd.get_dummies(data['T'], dtype = int)
#T_dummies = T_dummies.drop(columns=['COM']) --> Include line to drop reference column
data = pd.concat([data, T_dummies], axis=1)
data = data.drop(columns=['T'])

#Converting data types
data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')
data['Tot'] = data['Tot'].astype(int)
data['C'] = data['C'].astype(float)
data['P'] = data['P'].astype(float)
data['G'] = data['G'].astype(float)
data

  .str.replace('est\.', '', regex=True)
  .str.replace('\+', '', regex=True)
  .str.replace('est\.', '', regex=True)
  .str.replace('\+', '', regex=True)
  data = data.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Unnamed: 0,Tot,C,P,G,Incident,Aircraft,Location,Phase,Airport,Distance,...,sole_survivor,no_survivor,prev_deadliest,prev_deadliest_no_survivor,COM,EXG,EXS,INB,INH,MIL
0,1700,11.0,81.0,1600.0,American Airlines Flight 11,Boeing 767-223ER,"New York City, New York, U.S.",ENR,,,...,0,1,0,0,0,0,0,0,1,0
1,1000,9.0,56.0,900.0,United Airlines Flight 175,Boeing 767-222,"New York City, New York, U.S.",ENR,,,...,0,1,0,0,0,0,0,0,1,0
2,583,23.0,560.0,0.0,Pan Am Flight 1736 and KLM Flight 4805,Boeing 747-121 and Boeing 747-206Bspct,"Tenerife, Spain",TXI/TOF,TFN,,...,0,0,1,0,1,0,0,0,0,0
3,520,15.0,505.0,0.0,Japan Air Lines Flight 123,Boeing 747SR-46,"Mount Takamagahara,Ueno, Japan",ENR,,,...,0,0,0,0,1,0,0,0,0,0
4,349,33.0,316.0,0.0,Saudia Flight 763 and Kazakhstan Airlines Flig...,Boeing 747-168B and Ilyushin Il-76TD,"Charkhi Dadri, India",ENR,,,...,0,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,50,4.0,46.0,0.0,Capital Airlines Flight 20,Vickers Viscount 745D,"Holdcroft, Virginia, U.S.",ENR,,,...,0,1,0,0,1,0,0,0,0,0
557,50,5.0,45.0,0.0,Scottish Airlines (G-ANSY),Avro York,"Zurrieq, Malta",ENR,,,...,0,1,0,0,1,0,0,0,0,0
558,50,9.0,41.0,0.0,Pan Am Flight 202,Boeing 377-10-26,"Santana do Araguaia, Brazil",ENR,,,...,0,1,0,0,1,0,0,0,0,0
559,50,6.0,44.0,0.0,United Air Lines Flight 615,Douglas DC-6B,"Union City, California, U.S.",APR,OAK,22 km (12 NM),...,0,1,0,0,1,0,0,0,0,0


In [225]:
data.dtypes

Tot                                    int64
C                                    float64
P                                    float64
G                                    float64
Incident                              object
Aircraft                              object
Location                              object
Phase                                 object
Airport                               object
Distance                              object
Date                          datetime64[ns]
Airline                               object
sole_survivor                          int64
no_survivor                            int64
prev_deadliest                         int64
prev_deadliest_no_survivor             int64
COM                                    int64
EXG                                    int64
EXS                                    int64
INB                                    int64
INH                                    int64
MIL                                    int64
dtype: obj

In [226]:
data.to_csv("crashes.csv")
with pd.ExcelWriter('crashes.xlsx') as writer:
    key.to_excel(writer, sheet_name='key', index=False)
    data.to_excel(writer, sheet_name='data', index=False)