In [None]:
#Imports

from bs4 import BeautifulSoup
import pandas as pd
import time
import urllib.request
import requests
import json
import numpy as np
from datetime import datetime, timezone, timedelta
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import undetected_chromedriver as uc


In [None]:
#Function to get the regular (scheduled) flight codes between two given airports

def get_regular_flights(origin, destination, timeout=30):
    """Return the ids of scheduled flights between two airports for the tracked airlines.

    Queries the flightradar24 search endpoint with ``<origin>-<destination>``
    and keeps every result of type ``'schedule'`` that matches one of the
    tracked airlines. Matching is attempted in this order, moving to the next
    step only when the previous field is missing or not sliceable:

    1. first three characters of ``result['detail']['operator']``
    2. first three characters of ``result['detail']['callsign']``
    3. first two characters of ``result['id']``

    Parameters
    ----------
    origin, destination : str
        Airport codes interpolated into the search query.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` gives up.
        Without a timeout a stalled connection would block the crawl forever.

    Returns
    -------
    list
        The ``id`` of every matching result, in response order.

    Raises
    ------
    KeyError
        If the response has no ``'results'`` key, or a result lacks ``'type'``
        (or ``'id'`` when the last fallback is reached).
    """
    airlines_codes = ['DLH', 'KLM', 'AEE', 'BAW', 'IBE', 'DAL', 'AFR', 'ITY', 'ELY']
    # NOTE(review): these are the first two letters of the three-letter codes
    # above, which is not necessarily each airline's two-letter designator --
    # confirm this is the intended fallback before relying on it.
    airlines_codes_short = ['DL', 'KL', 'AE', 'BA', 'IB', 'DA', 'AF', 'IT', 'EL']
    url = 'https://www.flightradar24.com/v1/search/web/find?query=%(origin)s-%(destination)s&limit=50' % {"origin": origin, "destination": destination}

    response = requests.get(url, timeout=timeout)
    json_flights = response.json()

    flights = []
    for result in json_flights['results']:
        if result['type'] != 'schedule':
            continue
        # Only "field missing" (KeyError) or "field not sliceable/subscriptable"
        # (TypeError) trigger a fallback; anything else, such as
        # KeyboardInterrupt, must propagate instead of being swallowed.
        try:
            if result['detail']['operator'][:3] in airlines_codes:
                flights.append(result['id'])
        except (KeyError, TypeError):
            try:
                if result['detail']['callsign'][:3] in airlines_codes:
                    flights.append(result['id'])
            except (KeyError, TypeError):
                if result['id'][:2] in airlines_codes_short:
                    flights.append(result['id'])
    return flights

In [None]:
#Function to get all regular flight codes

def get_all_reg_flights():
    """Gather the regular flight codes for every ordered pair of tracked airports.

    Each airport is paired with every *other* airport in the list (both
    directions), ``get_regular_flights`` is called once per pair, and the
    results are flattened into one list in call order. Duplicates are kept.
    """
    airports_codes = ['TLV', 'LHR', 'MAD', 'JFK', 'CDG', 'FCO', 'ATH', 'AMS', 'FRA', 'BKK', 'LIS', 'DXB', 'SAW']
    return [
        flight
        for origin_pos, origin in enumerate(airports_codes)
        for destination_pos, destination in enumerate(airports_codes)
        if origin_pos != destination_pos  # never pair an airport with itself
        for flight in get_regular_flights(origin, destination)
    ]

In [None]:
#Function to get data about the most recent flights of a single flight code

def get_data_for_flight(code, driver, max_wait_seconds=30):
    """Scrape the flight-history table shown for one flight code.

    Loads ``https://www.flightradar24.com/data/flights/<code>`` in ``driver``,
    parses the first ``<table>`` on the page and returns one row per table
    row. Each row holds ``code`` followed by the text of table cells 2..8.

    Rows are skipped when:

    * they have fewer than five cells in that slice (for example a
      placeholder row), which previously raised ``IndexError``;
    * the fifth cell of the slice contains an em dash.

    Parameters
    ----------
    code : str
        Flight code appended to the URL and stored in the first column.
    driver
        Browser driver exposing ``get(url)`` and ``page_source``.
    max_wait_seconds : int, optional
        Upper bound, in one-second polls, on how long to wait while the page
        text still contains "please wait" (compared case-insensitively).

    Returns
    -------
    pandas.DataFrame
        One row per kept table row, with positional integer column labels and
        a default 0..n-1 index; an empty frame when the page has no table
        body or no row qualifies.
    """
    url = 'https://www.flightradar24.com/data/flights/%s' % code
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # `'please wait' in soup` checks membership in the soup's direct children,
    # not the page text, so that test was never true. Look at the extracted
    # text instead, and cap the wait so a page that always contains the phrase
    # cannot stall the crawl.
    waited = 0
    while waited < max_wait_seconds and 'please wait' in soup.get_text().lower():
        time.sleep(1)
        waited += 1
        soup = BeautifulSoup(driver.page_source, 'html.parser')

    table_tag = soup.find('table')
    table = table_tag.tbody if table_tag is not None else None
    if table is None:
        return pd.DataFrame()

    rows = []
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')[2:9]
        if len(tds) < 5:
            # Too few cells to hold the column inspected below.
            continue
        if '—' in tds[4].text:
            continue
        rows.append([code] + [td.text for td in tds])

    # Build the frame once rather than concatenating column by column and
    # transposing at the end.
    return pd.DataFrame(rows)

In [None]:
#Function to get data for all our flight codes and add it to our dataframe

def get_data_all_flights(flights):
    """Scrape every flight code with one shared browser and stack the results.

    Starts a single ``uc.Chrome()`` session, calls ``get_data_for_flight`` for
    each code (pausing three seconds before each request) and concatenates the
    non-empty frames. The browser is always shut down, even when scraping one
    of the codes raises.

    Parameters
    ----------
    flights : sequence of str
        Flight codes to scrape, processed in order.

    Returns
    -------
    pandas.DataFrame
        All scraped rows with a fresh 0..n-1 index; an empty frame when no
        code produced any rows.
    """
    frames = []
    total = len(flights)
    driver = uc.Chrome()
    try:
        for position, code in enumerate(flights):
            # The original `len(flights) + '...'` added an int to a str and
            # raised TypeError on the first iteration; it also never counted
            # down.
            print('%d more flight codes remaining' % (total - position))
            time.sleep(3)  # pause between page loads
            df_for_code = get_data_for_flight(code, driver)
            if not df_for_code.empty:
                frames.append(df_for_code)
    finally:
        # Always release the browser process, including on failure.
        driver.quit()

    if not frames:
        # pd.concat raises on an empty list, so return the empty frame directly.
        return pd.DataFrame()
    return pd.concat(frames, axis=0, ignore_index=True)

In [None]:
#Main

# Collect the flight codes for every ordered pair of tracked airports
# (get_all_reg_flights issues one search request per pair).
flights = get_all_reg_flights()
# Trailing bare expression: displayed as the cell output.
len(flights)

In [None]:
# Names for the positional columns produced by the scraper, in column order.
features = dict(enumerate([
    'Code',
    'Date',
    'Origin',
    'Destination',
    'Aircraft',
    'Flight time',
    'Scheduled Departure',
    'Actual Departure',
]))
# Scrape every collected flight code, label the columns, and persist the raw table.
df = get_data_all_flights(flights).rename(columns=features)
df.to_csv('raw_data.csv')