# Web Scraping Google Flights data

In [None]:
import csv
from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta

def scrape_listings(soup):
    """Return every element of ``soup`` matching the listing selector.

    ``soup`` is queried with the CSS selector ``li.pIav2d``; whatever
    ``soup.select`` returns is handed back unchanged.
    """
    listing_selector = 'li.pIav2d'
    matched_items = soup.select(listing_selector)
    return matched_items

def scrape_company_name(listing):
    """Return the airline text of a listing, or None when it is not found.

    The first descendant matching ``div.sSHqwe.tPgKwe.ogfYpf`` is looked up
    and its text is returned with surrounding whitespace stripped.
    """
    node = listing.select_one('div.sSHqwe.tPgKwe.ogfYpf')
    if not node:
        return None
    return node.text.strip()

def scrape_flight_duration(listing):
    """Return the duration text of a listing, or None when it is not found.

    Uses the first descendant matching ``div.gvkrdb.AdWm1c.tPgKwe.ogfYpf``
    and strips surrounding whitespace from its text.
    """
    node = listing.select_one('div.gvkrdb.AdWm1c.tPgKwe.ogfYpf')
    if node:
        raw_text = node.text
        return raw_text.strip()
    return None

def scrape_price(listing):
    """Return the price text of a listing, or None when it is not found.

    The price is read from the first ``span`` carrying a ``data-gs``
    attribute; the text is returned as-is apart from whitespace stripping.
    """
    node = listing.select_one('span[data-gs]')
    return None if not node else node.text.strip()

def scrape_co2_emission(listing):
    """Return the CO2 emission text of a listing, or None when it is not found.

    Reads the first descendant matching ``div.AdWm1c.lc3qH.ogfYpf.PtgtFe``
    and returns its whitespace-stripped text.
    """
    node = listing.select_one('div.AdWm1c.lc3qH.ogfYpf.PtgtFe')
    if not node:
        return None
    stripped = node.text.strip()
    return stripped

def scrape_flight_stops(listing):
    """Return the stops text of a listing, or None when it is not found.

    Uses the first ``span.ogfYpf`` nested under ``div.EfT7Ae.AdWm1c.tPgKwe``
    and returns its whitespace-stripped text.
    """
    node = listing.select_one('div.EfT7Ae.AdWm1c.tPgKwe span.ogfYpf')
    if node:
        return node.text.strip()
    return None

def scrape_departure_time(listing):
    """Return the departure time read from a listing's aria-label, or None.

    The first ``span`` with ``jscontroller="cNtv4b"`` whose ``aria-label``
    starts with "Departure time" is looked up. Its label is split on ``': '``
    and the second piece is returned (a label of the form
    ``'Departure time: 4:45 AM.'`` yields ``'4:45 AM.'``).

    Returns
    -------
    str or None
        None when no matching span exists, and also when the label contains
        no ``': '`` separator. The latter case used to raise IndexError,
        which would abort the whole scraping run for one malformed label.
    """
    departure_time_element = listing.select_one(
        'span[jscontroller="cNtv4b"][aria-label^="Departure time"]'
    )
    if not departure_time_element:
        return None

    label_parts = departure_time_element['aria-label'].split(': ')
    # Guard the index: the selector only guarantees the label's prefix,
    # not that a ': ' separator follows it.
    return label_parts[1] if len(label_parts) > 1 else None

def scrape_arrival_time(listing):
    """Return the arrival time read from a listing's aria-label, or None.

    The first ``span`` with ``jscontroller="cNtv4b"`` whose ``aria-label``
    starts with "Arrival time" is looked up. Its label is split on ``': '``
    and the second piece is returned (a label of the form
    ``'Arrival time: 7:50 AM.'`` yields ``'7:50 AM.'``).

    Returns
    -------
    str or None
        None when no matching span exists, and also when the label contains
        no ``': '`` separator. The latter case used to raise IndexError,
        which would abort the whole scraping run for one malformed label.
    """
    arrival_time_element = listing.select_one(
        'span[jscontroller="cNtv4b"][aria-label^="Arrival time"]'
    )
    if not arrival_time_element:
        return None

    label_parts = arrival_time_element['aria-label'].split(': ')
    # Guard the index: the selector only guarantees the label's prefix,
    # not that a ': ' separator follows it.
    return label_parts[1] if len(label_parts) > 1 else None

def scrape_departure_location(listing):
    """Return the departure location text of a listing, or None if absent.

    Takes the FIRST ``span[jscontroller="cNtv4b"]`` found under
    ``div.QylvBf`` and returns its whitespace-stripped text.
    """
    first_match = listing.select_one('div.QylvBf span[jscontroller="cNtv4b"]')
    if not first_match:
        return None
    return first_match.text.strip()

def scrape_destination(listing):
    """Return the destination text of a listing, or None if absent.

    Collects every ``span[jscontroller="cNtv4b"]`` under ``div.QylvBf`` and
    returns the whitespace-stripped text of the LAST one.
    """
    matches = listing.select('div.QylvBf span[jscontroller="cNtv4b"]')
    if not matches:
        return None
    last_match = matches[-1]
    return last_match.text.strip()

def generate_url(date_str, base_url):
    """Build the search URL for one date by appending a ``d`` parameter.

    The result is ``base_url`` followed by ``&d=`` and ``date_str``; no
    escaping or validation is applied.

    NOTE(review): the base URLs used in this notebook carry a ``tfs`` token
    that appears to embed a date of its own -- confirm that the appended
    ``d`` parameter really selects the date being requested.
    """
    return "{}&d={}".format(base_url, date_str)

def scrape_route(route_config, writer, start_date=None, end_date=None, timeout=30):
    """Scrape one route day by day and write one CSV row per flight listing.

    For every date from ``start_date`` to ``end_date`` (inclusive) the route's
    search URL is built with ``generate_url``, fetched with ``requests``,
    parsed with BeautifulSoup, and each listing returned by
    ``scrape_listings`` is written through ``writer``.

    Parameters
    ----------
    route_config : dict
        Needs ``'base_url'`` (URL the date parameter is appended to) and
        ``'route_name'`` (label stored in the ``route`` column).
    writer : csv.DictWriter
        Any object with a ``writerow(dict)`` method works. Rows use the keys
        date, route, company_name, flight_duration, price, co2_emission,
        stops, departure_time, arrival_time, departure_location, destination.
    start_date, end_date : datetime.datetime, optional
        Inclusive bounds of the scrape. They default to 2024-06-01 and
        2024-12-31, the range that was previously hard-coded.
    timeout : float, optional
        Seconds to wait for each HTTP response. Without a timeout
        ``requests.get`` can block indefinitely on a stalled connection.

    Notes
    -----
    A failed request (connection error, timeout, or non-2xx status) is
    reported with ``print`` and that date is skipped, so one bad response no
    longer aborts the remaining dates. Fields missing from a listing are
    written as None by the individual ``scrape_*`` helpers.
    """
    if start_date is None:
        start_date = datetime(2024, 6, 1)
    if end_date is None:
        end_date = datetime(2024, 12, 31)
    delta = timedelta(days=1)
    route_name = route_config['route_name']

    current_date = start_date
    while current_date <= end_date:
        date_str = current_date.strftime('%Y-%m-%d')
        url = generate_url(date_str, route_config['base_url'])

        try:
            response = requests.get(url, timeout=timeout)
            # Treat HTTP error pages as failures instead of parsing them
            # as if they were result pages.
            response.raise_for_status()
        except requests.exceptions.RequestException as exc:
            print(f"Skipping {route_name} on {date_str}: {exc}")
        else:
            soup = BeautifulSoup(response.text, 'html.parser')
            for listing in scrape_listings(soup):
                writer.writerow({
                    'date': date_str,
                    'route': route_name,
                    'company_name': scrape_company_name(listing),
                    'flight_duration': scrape_flight_duration(listing),
                    'price': scrape_price(listing),
                    'co2_emission': scrape_co2_emission(listing),
                    'stops': scrape_flight_stops(listing),
                    'departure_time': scrape_departure_time(listing),
                    'arrival_time': scrape_arrival_time(listing),
                    'departure_location': scrape_departure_location(listing),
                    'destination': scrape_destination(listing)
                })

        current_date += delta

def main():
    """Scrape every configured route into ``google_flights_data_year.csv``.

    The output file is (re)created in the current working directory, a header
    row is written, and ``scrape_route`` is then called once per entry in
    ``routes``; all routes share the same CSV writer, so their rows end up in
    one file distinguished by the ``route`` column.
    """
    routes = [
        {
            'base_url': 'https://www.google.com/travel/flights/search?tfs=CBwQAhojEgoyMDI0LTA4LTI0agwIAhIIL20vMDljMTdyBwgBEgNJWENAAUgBcAGCAQsI____________AZgBAg',
            'route_name': 'BLR-IXC'
        },
        {
            'base_url': 'https://www.google.com/travel/flights/search?tfs=CBwQAhojEgoyMDI0LTA4LTI0agwIAhIIL20vMDljMTdyBwgBEgNKQUlAAUgBcAGCAQsI____________AZgBAg',
            'route_name': 'BLR-JAI'
        },
        {
            'base_url': 'https://www.google.com/travel/flights/search?tfs=CBwQAhooEgoyMDI0LTA4LTE4agwIAhIIL20vMDljMTdyDAgDEggvbS8wZGx2MEABSAFwAYIBCwj___________8BmAEC',
            'route_name': 'BLR-DEL'
        }
    ]

    # Column order of the CSV; the keys must match the dicts that
    # scrape_route passes to writer.writerow.
    fieldnames = ['date', 'route', 'company_name', 'flight_duration', 'price', 'co2_emission', 'stops',
                  'departure_time', 'arrival_time', 'departure_location', 'destination']

    # encoding is set explicitly: the platform default is not UTF-8
    # everywhere, and scraped page text may contain non-ASCII characters.
    # The file is later read back with pd.read_csv without an encoding
    # argument. newline='' is what the csv module expects for files it writes.
    with open('google_flights_data_year.csv', 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

        for route in routes:
            scrape_route(route, writer)

# Entry point: run the full scrape when this code is executed as the main module.
if __name__ == "__main__":
    main()


- I have scraped data for three routes:
 - BLR - DEL
 - BLR - JAI
 - BLR - IXC
- I will use this data for a comparative analysis of these three routes.

In [2]:
import pandas as pd

In [3]:
# Load the CSV produced by the scraping cell (same file name that main() writes).
data = pd.read_csv('google_flights_data_year.csv')

In [5]:
# Preview the first 10 rows; values are still raw scraped strings (e.g. '$95', '4:45 AM.').
data.head(10)

Unnamed: 0,date,route,company_name,flight_duration,price,co2_emission,stops,departure_time,arrival_time,departure_location,destination
0,2024-06-01,BLR-IXC,IndiGo,3 hr 5 min,$95,135 kg CO2e,Nonstop,4:45 AM.,7:50 AM.,BLR,IXC
1,2024-06-01,BLR-IXC,IndiGoOperated by Operated Under Wet Lease Fro...,4 hr 45 min,$104,172 kg CO2e,1 stop,8:00 AM.,12:45 PM.,BLR,IXC
2,2024-06-01,BLR-IXC,IndiGo,2 hr 45 min,$127,131 kg CO2e,Nonstop,3:55 PM.,6:40 PM.,BLR,IXC
3,2024-06-01,BLR-IXC,IndiGo,2 hr 45 min,$127,159 kg CO2e,Nonstop,8:30 PM.,11:15 PM.,BLR,IXC
4,2024-06-01,BLR-IXC,Vistara,3 hr 5 min,,171 kg CO2e,Nonstop,11:30 AM.,2:35 PM.,BLR,IXC
5,2024-06-01,BLR-IXC,IndiGo,6 hr 5 min,$104,163 kg CO2e,1 stop,3:25 PM.,9:30 PM.,BLR,IXC
6,2024-06-01,BLR-IXC,IndiGo,5 hr 20 min,$107,166 kg CO2e,1 stop,1:00 PM.,6:20 PM.,BLR,IXC
7,2024-06-01,BLR-IXC,IndiGo,5 hr 30 min,$114,159 kg CO2e,1 stop,10:30 AM.,4:00 PM.,BLR,IXC
8,2024-06-01,BLR-IXC,IndiGo,3 hr 55 min,$139,162 kg CO2e,1 stop,7:40 AM.,11:35 AM.,BLR,IXC
9,2024-06-01,BLR-IXC,Air India,6 hr 35 min,,198 kg CO2e,1 stop,3:25 PM.,10:00 PM.,BLR,IXC


In [6]:
# Last 10 rows, to check the end of the scraped date range.
data.tail(10)

Unnamed: 0,date,route,company_name,flight_duration,price,co2_emission,stops,departure_time,arrival_time,departure_location,destination
13300,2024-12-31,BLR-DEL,IndiGo,3 hr,$120,122 kg CO2e,Nonstop,8:30 PM.,11:30 PM.,BLR,DEL
13301,2024-12-31,BLR-DEL,Air India,2 hr 50 min,$125,156 kg CO2e,Nonstop,11:45 AM.,2:35 PM.,BLR,DEL
13302,2024-12-31,BLR-DEL,Air India,3 hr 5 min,$125,150 kg CO2e,Nonstop,3:25 PM.,6:30 PM.,BLR,DEL
13303,2024-12-31,BLR-DEL,Air India,2 hr 50 min,$125,156 kg CO2e,Nonstop,5:15 PM.,8:05 PM.,BLR,DEL
13304,2024-12-31,BLR-DEL,Air India,2 hr 50 min,$125,156 kg CO2e,Nonstop,7:30 PM.,10:20 PM.,BLR,DEL
13305,2024-12-31,BLR-DEL,Air India,3 hr,$125,180 kg CO2e,Nonstop,8:55 PM.,11:55 PM.,BLR,DEL
13306,2024-12-31,BLR-DEL,Air India,2 hr 55 min,$202,173 kg CO2e,Nonstop,7:10 AM.,10:05 AM.,BLR,DEL
13307,2024-12-31,BLR-DEL,SpiceJet,2 hr 45 min,$255,,Nonstop,6:00 AM.,8:45 AM.,BLR,DEL
13308,2024-12-31,BLR-DEL,Vistara,2 hr 40 min,$350,154 kg CO2e,Nonstop,11:30 AM.,2:10 PM.,BLR,DEL
13309,2024-12-31,BLR-DEL,Air India,2 hr 50 min,$398,134 kg CO2e,Nonstop,1:20 PM.,4:10 PM.,BLR,DEL


In [7]:
# (rows, columns) of the loaded frame.
data.shape

(13310, 11)

In [8]:
# Column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13310 entries, 0 to 13309
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   date                13310 non-null  object
 1   route               13310 non-null  object
 2   company_name        13310 non-null  object
 3   flight_duration     13310 non-null  object
 4   price               10341 non-null  object
 5   co2_emission        13071 non-null  object
 6   stops               13310 non-null  object
 7   departure_time      13310 non-null  object
 8   arrival_time        13310 non-null  object
 9   departure_location  13310 non-null  object
 10  destination         13310 non-null  object
dtypes: object(11)
memory usage: 1.1+ MB


In [9]:
# Every column is object dtype here, so describe() reports count/unique/top/freq
# rather than numeric statistics.
data.describe()

Unnamed: 0,date,route,company_name,flight_duration,price,co2_emission,stops,departure_time,arrival_time,departure_location,destination
count,13310,13310,13310,13310,10341,13071,13310,13310,13310,13310,13310
unique,214,3,8,20,32,30,2,46,52,1,3
top,2024-10-17,BLR-DEL,IndiGo,2 hr 50 min,$112,122 kg CO2e,Nonstop,3:25 PM.,2:35 PM.,BLR,DEL
freq,102,8126,6411,4302,930,3107,10710,951,479,13310,8126


In [10]:
# Number of missing values in each column.
data.isnull().sum()

Unnamed: 0,0
date,0
route,0
company_name,0
flight_duration,0
price,2969
co2_emission,239
stops,0
departure_time,0
arrival_time,0
departure_location,0
