In [2]:
import time
import os
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
from datetime import datetime, timedelta
from dotenv import load_dotenv

# Let python-dotenv populate the process environment (e.g. from a local .env
# file) before any variable is read below.
load_dotenv()

# MongoDB connection string; os.getenv returns None when MONGO_URI is not set.
mongoURI = os.getenv('MONGO_URI')

In [3]:
def _coerce_to_datetime(value):
    """Turn a date string into a ``datetime``; pass any other value through."""
    if not isinstance(value, str):
        return value
    # Numeric month ('20 04 2024') and abbreviated month name ('20 Apr 2024').
    for date_format in ('%d %m %Y', '%d %b %Y'):
        try:
            return datetime.strptime(value, date_format)
        except ValueError:
            continue
    raise ValueError(
        f"Unrecognised date string {value!r}; expected 'DD MM YYYY' or 'DD Mon YYYY'"
    )


def get_day_difference(start_date, end_date):
    """Return the day component of ``end_date - start_date``.

    Args:
        start_date: A ``datetime``, or a string in ``'DD MM YYYY'`` or
            ``'DD Mon YYYY'`` form.
        end_date: Same accepted forms as ``start_date``.

    Returns:
        int: ``(end_date - start_date).days``. ``timedelta.days`` rounds toward
        negative infinity, so an end that lies 23 hours *before* the start
        yields ``-1`` (not ``0``).

    Raises:
        ValueError: If a string argument matches neither accepted format.
    """
    start_date = _coerce_to_datetime(start_date)
    end_date = _coerce_to_datetime(end_date)
    return (end_date - start_date).days

In [4]:
def get_flight_rows(url, timeout=30):
    """Download ``url`` and return its flight-table rows.

    Args:
        url: Full page URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The result of ``find_all('tr', class_='tt-row')`` on the parsed page:
        every ``<tr>`` carrying the ``tt-row`` class, possibly empty.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never mistaken for a page without flights.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    page = BeautifulSoup(response.content, 'html.parser')

    # Rows holding the arrival/departure data
    return page.find_all('tr', class_='tt-row')

In [5]:
def get_day_url(airpot_code, year, month, day, mode, max_hours_back=72):
    """Find the paginated listing URL from which a given day can be scraped.

    Starting one hour before midnight of the requested date and stepping back
    one hour at a time, the listing is fetched until its first row belongs to
    the calendar day of the probed timestamp. Scraping from that URL therefore
    starts before the requested day and cannot miss its earliest flights.

    Args:
        airpot_code: Airport code used in the URL path (e.g. ``'bcn'``).
        year, month, day: Requested date as integers.
        mode: ``'arrivals'`` or ``'departures'``.
        max_hours_back: Upper bound on how many hourly steps are probed before
            giving up, so the search cannot loop (and hit the site) forever.

    Returns:
        str: URL ending in ``&page=``; the caller appends the page number.

    Raises:
        ValueError: If ``mode`` is not ``'arrivals'`` or ``'departures'``.
        RuntimeError: If a probed page has no flight rows, or no suitable
            starting point is found within ``max_hours_back`` hours.
    """
    if mode not in ('arrivals', 'departures'):
        raise ValueError(f"mode must be 'arrivals' or 'departures', got {mode!r}")

    requested_midnight = datetime(year, month, day)

    for hour_subtraction in range(1, max_hours_back + 1):
        converted_date = requested_midnight - timedelta(hours=hour_subtraction)

        # The URL needs epoch milliseconds. time.mktime treats the naive
        # datetime as local time, so the value depends on this machine's
        # timezone -- NOTE(review): confirm that is what the site expects.
        epoch_seconds = int(time.mktime(converted_date.timetuple()))
        url = (
            f'https://www.avionio.com/en/airport/{airpot_code}/{mode}'
            f'?ts={epoch_seconds}000&page='
        )

        flights = get_flight_rows(url)
        if not flights:
            raise RuntimeError(f'No flight rows found at {url}')

        # The table shows only day and month; the year is taken from the
        # probed timestamp, because only a first row on that same calendar
        # day counts as a match.
        first_row_day = flights[0].find('td', class_='tt-d').text.strip()
        first_flight_date = f'{first_row_day} {converted_date.year}'

        # converted_date is later than midnight of its own day, so a first row
        # on that same day yields a difference of -1: the listing starts on
        # the day before the requested one and is safe to scrape from.
        if get_day_difference(converted_date, first_flight_date) == -1:
            return url

    raise RuntimeError(
        f'No listing starting on the previous day found within '
        f'{max_hours_back} hours before {requested_midnight.date()}'
    )

In [6]:
def _resolve_row_date(day_month_text, reference_day):
    """Attach to a 'DD Mon' table date the year that puts it nearest ``reference_day``."""
    candidates = []
    for candidate_year in (reference_day.year - 1, reference_day.year, reference_day.year + 1):
        try:
            candidates.append(
                datetime.strptime(f'{day_month_text} {candidate_year}', '%d %b %Y')
            )
        except ValueError:
            # e.g. '29 Feb' does not exist in a non-leap candidate year
            continue
    if not candidates:
        raise ValueError(f'Cannot parse table date {day_month_text!r}')
    return min(candidates, key=lambda candidate: abs(candidate - reference_day))


def get_airport_data(airport_code,year,month,day,mode):
    """Scrape every flight listed for one airport on one calendar day.

    Pages are read in order starting from the URL returned by ``get_day_url``.
    Rows dated before the requested day are skipped, rows on the day are
    collected, and the first row dated after it ends the scrape.

    Args:
        airport_code: Airport code used in the URL path (e.g. ``'bcn'``).
        year, month, day: Requested date as integers.
        mode: ``'arrivals'`` or ``'departures'``.

    Returns:
        dict: ``{'date': 'YYYY-MM-DD', 'flights': [...]}`` where each flight is
        a dict with the keys ``time``, ``date``, ``iata``,
        ``arrival/departure``, ``flight``, ``airline`` and ``status``.
    """
    datetime_day = datetime(year, month, day)
    url = get_day_url(airport_code, year, month, day, mode)

    flights = []
    result = {'date': str(datetime_day.date()), 'flights': flights}
    page_num = 0

    while True:
        info_table = get_flight_rows(url + str(page_num))

        # An empty page means the listing is exhausted; stop here instead of
        # requesting ever-higher page numbers forever.
        if not info_table:
            return result

        for row in info_table:
            # Rows additionally flagged 'tt-child' are not recorded.
            if 'tt-child' in row['class']:
                continue

            # The table shows only day and month; choose the year that places
            # the row closest to the requested day so 31 Dec / 1 Jan work too.
            day_text = row.find('td', class_='tt-d').text.strip()
            row_day = _resolve_row_date(day_text, datetime_day)
            day_difference = (row_day - datetime_day).days

            # Still before the requested day: keep reading.
            if day_difference < 0:
                continue

            # First row after the requested day: the day is complete.
            if day_difference > 0:
                return result

            flights.append({
                'time': row.find('td', class_='tt-t').text.strip(),
                'date': f'{day_text} {row_day.year}',
                'iata': row.find('td', class_='tt-i').text.strip(),
                'arrival/departure': row.find('td', class_='tt-ap').text.strip(),
                'flight': row.find('td', class_='tt-f').text.strip(),
                'airline': row.find('td', class_='tt-al').text.strip(),
                'status': row.find('td', class_='tt-s').text.strip(),
            })

        page_num += 1

In [7]:
# Module-level MongoDB client built from mongoURI; insert_flights_barcelona
# reads this global instead of opening its own connection.
client = MongoClient(mongoURI)

def insert_flights_barcelona(day,month,year):
    """Store one day of Barcelona arrivals and departures in MongoDB.

    For each direction, the matching collection of the ``Flights`` database
    (``arrivals_barcelona`` / ``departures_barcelona``) is checked for a
    document with this date. If one exists the direction is skipped; otherwise
    the day is scraped with ``get_airport_data`` and inserted. A status line
    is printed either way.

    Args:
        day, month, year: Requested date. Note the order is day first, unlike
            the scraping helpers, which take year first.
    """
    parsed_day = datetime.strptime(f'{day} {month} {year}', '%d %m %Y')
    flight_day = str(parsed_day.date())

    database = client['Flights']

    # (scraper mode, capitalised label used in the "already stored" message)
    for mode, label in (('arrivals', 'Arrivals'), ('departures', 'Departures')):
        collection = database[f'{mode}_barcelona']

        if collection.find_one({'date': {'$eq': flight_day}}):
            print(f'{label} for day {flight_day} already in database')
            continue

        day_document = get_airport_data('bcn', year, month, day, mode)
        collection.insert_one(day_document)
        print(f"Inserted {mode} for day: {flight_day}")
        
        
# Scrape and store Barcelona flights for 20 April 2024 (arguments are day, month, year).
insert_flights_barcelona(20,4,2024)

Inserted arrivals for day: 2024-04-20
Inserted departures for day: 2024-04-20
