# Web Scraper

In [None]:
class ENV:
    """Named integer constants identifying where the notebook is being run."""

    # Local workstation run = 1, Colab run = 2.
    Local, Colab = 1, 2

DF_COLS = (['Date', 'Source','Destination','Airline',
            'Price'])


current_env = ENV.Colab # ENV.Local

if current_env == ENV.Colab :
  # !apt update
  !apt install chromium-chromedriver
  !pip install selenium

In [5]:
from selenium import webdriver
from bs4 import BeautifulSoup

def driversetup(chromedriver_path=None):
    """Create and return a Selenium Chrome driver for the configured environment.

    The environment is read from the module-level ``current_env``.

    Args:
        chromedriver_path: Optional path to the chromedriver executable. Only
            used when ``current_env`` is ``ENV.Local``; when omitted, the
            machine-specific default path below is used.

    Returns:
        A started ``webdriver.Chrome`` instance.

    Raises:
        ValueError: If ``current_env`` is neither ``ENV.Colab`` nor
            ``ENV.Local`` (previously this surfaced as an UnboundLocalError).
    """
    if current_env == ENV.Colab:
        options = webdriver.ChromeOptions()
        colab_flags = (
            '--headless',                 # run Selenium in headless mode
            '--no-sandbox',
            '--disable-dev-shm-usage',    # overcome limited resource problems
            'lang=en',
            'start-maximized',            # open browser in maximized mode
            'disable-infobars',           # disable infobars
            '--disable-extensions',       # disable extensions
            '--incognito',
            '--disable-blink-features=AutomationControlled',
        )
        for flag in colab_flags:
            options.add_argument(flag)

        driver = webdriver.Chrome(options=options)

        # Make navigator.webdriver read as undefined on the current page.
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined});")
        return driver

    if current_env == ENV.Local:
        if chromedriver_path is None:
            # Machine-specific default; pass chromedriver_path to override it.
            chromedriver_path = r"C:\Users\Rahil Bhensdadia\Downloads\Optimization project\Web Scraper\chromedriver-win64\chromedriver.exe"
        chrome_options = webdriver.ChromeOptions()
        chrome_services = webdriver.ChromeService(executable_path=chromedriver_path)
        # Keyword arguments keep this independent of the constructor's positional order.
        return webdriver.Chrome(options=chrome_options, service=chrome_services)

    raise ValueError("Unsupported environment: {!r}".format(current_env))

In [6]:
from time import sleep, strftime,time
from random import randint
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import smtplib
from email.mime.multipart import MIMEMultipart
from selenium.webdriver.common.by import By
import datetime
from openpyxl import Workbook

In [None]:

def page_scrape(driver, city_from, city_to, date_start, wait_seconds=30, max_results=3):
    """Scrape the first few Kayak flight prices for one route and date.

    Loads the Kayak results page (the URL requests ``sort=price_a`` and a fixed
    airline filter), waits, then reads price and operator elements by CSS
    class name.

    Args:
        driver: Selenium WebDriver used to load the page.
        city_from: Origin code inserted into the URL (e.g. ``"DEL"``).
        city_to: Destination code inserted into the URL.
        date_start: Travel date string inserted into the URL
            (callers in this notebook pass ``YYYY-MM-DD``).
        wait_seconds: Seconds to sleep after requesting the page.
        max_results: How many leading price elements to consider.

    Returns:
        A DataFrame with the ``DF_COLS`` columns plus ``'data timestamp'``;
        it is empty when no non-blank prices are found.

    Raises:
        ValueError: If a non-blank price text cannot be converted to ``int``.
    """
    kayak_url = ('https://www.kayak.co.in/flights/' + city_from + '-' + city_to +
                 '/' + date_start + '?sort=price_a&fs=airlines=-flylocal,SG,S5,MULT,X1,QP')
    print(kayak_url)
    driver.get(kayak_url)
    # Fixed pause before reading the DOM.
    sleep(wait_seconds)

    price_elements = driver.find_elements(By.CLASS_NAME, 'f8F1-price-text')[:max_results]
    carrier_texts = [el.text for el in driver.find_elements(By.CLASS_NAME, "J0g6-operator-text")]

    # Build both columns together so they always have the same length; the
    # DataFrame constructor raises ValueError on unequal column lengths.
    prices_list = []
    airlines = []
    for position, price_element in enumerate(price_elements):
        price_text = price_element.text
        if price_text == '':
            continue
        prices_list.append(int(price_text.replace('₹', '').replace(',', '')))
        # Pair by position; keep the price even if no operator element matched.
        airlines.append(carrier_texts[position] if position < len(carrier_texts) else None)
    print("Prices :", len(prices_list))

    flights_df = pd.DataFrame({'Date': date_start,
                               'Source': city_from,
                               'Destination': city_to,
                               'Airline': airlines,
                               'Price': prices_list})[DF_COLS]

    flights_df['data timestamp'] = strftime("%Y%m%d-%H%M")  # so we can know when it was scraped

    return flights_df


In [None]:
def start_kayak(driver, venues, match_dates, start_index=4):
    """Scrape flight prices for venue pairs over all dates and save them to Excel.

    For each source venue from ``start_index`` onward, every later venue in
    ``venues`` is used as a destination and ``page_scrape`` is called once per
    date. Each source gets a workbook ``FlightData_<source>.xlsx`` with one
    sheet per destination; a combined ``FlightData_<timestamp>.xlsx`` is
    written at the end.

    Args:
        driver: Selenium WebDriver passed through to ``page_scrape``.
        venues: Sequence of venue codes; pairs are taken as (earlier, later).
        match_dates: Iterable of date objects supporting ``strftime``.
        start_index: Index in ``venues`` of the first source to process.
            Defaults to 4, the value previously hard-coded in the loop;
            pass 0 to cover every source.

    Returns:
        None. Results are written to disk.
    """
    print('Finding cheapest results.....')
    all_buckets = []

    for si in range(start_index, len(venues)):
        src = venues[si]
        workbook_name = 'FlightData_{}.xlsx'.format(src)
        # ExcelWriter(mode='a') needs an existing file, so create an empty workbook first.
        Workbook().save(workbook_name)

        for des in venues[si + 1:]:
            # Collect per-date frames and concatenate once instead of
            # re-copying a growing DataFrame on every iteration.
            route_frames = []
            for match_date in match_dates:
                start = time()
                flight_date = match_date.strftime("%Y-%m-%d")
                route_frames.append(page_scrape(driver, src, des, flight_date))
                end = time()
                print(end-start,"Seconds elaspes for last record")

            if route_frames:
                df_flights_cheap_bucket = pd.concat(route_frames, axis=0)
            else:
                df_flights_cheap_bucket = pd.DataFrame(columns=DF_COLS)

            with pd.ExcelWriter(workbook_name, mode='a', if_sheet_exists='overlay') as writer:
                df_flights_cheap_bucket.to_excel(writer, sheet_name=f"{des}", index=False)
            all_buckets.append(df_flights_cheap_bucket)

    if all_buckets:
        df_flights_cheap = pd.concat(all_buckets, axis=0)
    else:
        df_flights_cheap = pd.DataFrame(columns=DF_COLS)

    df_flights_cheap.to_excel('FlightData_{}.xlsx'.format(strftime("%Y%m%d-%H%M")), index=False)
    print('saved df.....')

In [None]:

# Airport codes for the less obvious venues:
#   Chennai -> MAA, Kolkata -> CCU, Mohali/Chandigarh -> IXC,
#   Bangalore -> BLR, Dharamsala -> DHM, Jaipur -> JAI
venues = ["DEL","BOM","HYD","MAA","CCU","IXC","BLR","DHM","JAI"]

# 54 consecutive dates starting on 2024-05-24.
base_date = datetime.datetime(year=2024,month = 5, day=24)
match_dates = [(base_date+datetime.timedelta(days=i)).date() for i in range(54)] #54

driver = driversetup()
try:
    start_kayak(driver,venues, match_dates)
    driver.save_screenshot('pythonscraping.png')
finally:
    # quit() ends the WebDriver session and its driver process even when the
    # scrape raises; close() only closes the current window.
    driver.quit()

Finding cheapest results.....
https://www.kayak.co.in/flights/CCU-IXC/2024-05-24?sort=price_a&fs=airlines=-flylocal,SG,S5,MULT,X1,QP
Prices : 0
31.0479953289032 Seconds elaspes for last record
https://www.kayak.co.in/flights/CCU-IXC/2024-05-25?sort=price_a&fs=airlines=-flylocal,SG,S5,MULT,X1,QP
Prices : 0
30.717918872833252 Seconds elaspes for last record
https://www.kayak.co.in/flights/CCU-IXC/2024-05-26?sort=price_a&fs=airlines=-flylocal,SG,S5,MULT,X1,QP
Prices : 0
30.963717222213745 Seconds elaspes for last record
https://www.kayak.co.in/flights/CCU-IXC/2024-05-27?sort=price_a&fs=airlines=-flylocal,SG,S5,MULT,X1,QP
Prices : 0
30.734219789505005 Seconds elaspes for last record
https://www.kayak.co.in/flights/CCU-IXC/2024-05-28?sort=price_a&fs=airlines=-flylocal,SG,S5,MULT,X1,QP
Prices : 0
30.822548627853394 Seconds elaspes for last record
https://www.kayak.co.in/flights/CCU-IXC/2024-05-29?sort=price_a&fs=airlines=-flylocal,SG,S5,MULT,X1,QP
Prices : 0
30.681247234344482 Seconds elaspe

# Hotel Prices

In [16]:
# Column order used for the hotel-price DataFrames.
DF_HOTEL_COLS = ['Date', 'Venue', 'Hotel Name', 'Price']

In [22]:
def get_prices(driver, venue, checkin_date, checkout_date, wait_seconds=30, max_results=10):
    """Scrape the first few Kayak hotel prices for one venue and one night.

    Loads the Kayak hotel results page (the URL requests ``sort=price_a`` with
    a fixed star and hotel-chain filter for 2 adults), waits, then reads price
    and hotel-name elements by CSS class name.

    Args:
        driver: Selenium WebDriver used to load the page.
        venue: Kayak location slug inserted into the URL.
        checkin_date: Check-in date string inserted into the URL
            (callers in this notebook pass ``YYYY-MM-DD``).
        checkout_date: Check-out date string inserted into the URL.
        wait_seconds: Seconds to sleep after requesting the page.
        max_results: How many leading price elements to consider.

    Returns:
        A DataFrame with the ``DF_HOTEL_COLS`` columns plus
        ``'data timestamp'``; it is empty when no non-blank prices are found.

    Raises:
        ValueError: If a non-blank price text cannot be converted to ``int``.
    """
    kayak_url = ('https://www.kayak.co.in/hotels/' + venue +
                 '/' + checkin_date + '/' + checkout_date + '/2adults;map?sort=price_a&fs=stars=5;hotelchain=br-146,br-411')
    print(kayak_url)
    driver.get(kayak_url)
    # Fixed pause before reading the DOM.
    sleep(wait_seconds)

    price_elements = driver.find_elements(By.CLASS_NAME, 'D8J--price')[:max_results]
    name_texts = [el.text for el in driver.find_elements(By.CLASS_NAME, "IirT-header")[:max_results]]

    # Build both columns together so they always have the same length; the
    # DataFrame constructor raises ValueError on unequal column lengths.
    prices_list = []
    hotel_names = []
    for position, price_element in enumerate(price_elements):
        price_text = price_element.text
        if price_text == '':
            continue
        prices_list.append(int(price_text.replace('₹', '').replace(',', '')))
        # Pair by position; keep the price even if no name element matched.
        hotel_names.append(name_texts[position] if position < len(name_texts) else None)

    hotels_df = pd.DataFrame({'Date': checkin_date,
                              'Venue': venue,
                              'Hotel Name': hotel_names,
                              'Price': prices_list})[DF_HOTEL_COLS]

    hotels_df['data timestamp'] = strftime("%Y%m%d-%H%M")  # so we can know when it was scraped

    return hotels_df

In [23]:
def start_hotel_scraping(driver, venues, match_dates, output_dir='datastore'):
    """Scrape nightly hotel prices for every venue and save them to Excel.

    For each venue, ``get_prices`` is called once per pair of consecutive
    dates (check-in, check-out). Each venue's rows are written to
    ``<output_dir>/HotelData_<venue>.xlsx`` and the combined rows to
    ``<output_dir>/HotelData_<timestamp>.xlsx``.

    Args:
        driver: Selenium WebDriver passed through to ``get_prices``.
        venues: Iterable of Kayak location slugs.
        match_dates: Sequence of date objects supporting ``strftime``; N dates
            yield N-1 nights.
        output_dir: Directory for the workbooks; created if missing.

    Returns:
        None. Results are written to disk.
    """
    from pathlib import Path

    print('Finding cheapest results.....')

    # Path joins work on every OS; the previous '.\datastore\...' literals
    # relied on Windows separators and contained invalid escape sequences.
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    venue_frames = []
    for venue in venues:
        # Collect per-night frames and concatenate once per venue.
        night_frames = []
        for checkin, checkout in zip(match_dates, match_dates[1:]):
            start = time()
            checkin_date = checkin.strftime("%Y-%m-%d")
            checkout_date = checkout.strftime("%Y-%m-%d")
            night_frames.append(get_prices(driver, venue, checkin_date, checkout_date))
            end = time()
            print(end-start,"Seconds elaspes for last record")

        if night_frames:
            df_hotel_bucket = pd.concat(night_frames, axis=0)
        else:
            # Hotel frames use the hotel column set, not the flight DF_COLS.
            df_hotel_bucket = pd.DataFrame(columns=DF_HOTEL_COLS)

        # Written once: an earlier create-and-append step targeted this same
        # file and was immediately overwritten by this call, so it is omitted.
        df_hotel_bucket.to_excel(out_dir / 'HotelData_{}.xlsx'.format(venue), index=False)
        venue_frames.append(df_hotel_bucket)

    if venue_frames:
        df_hotel = pd.concat(venue_frames, axis=0)
    else:
        df_hotel = pd.DataFrame(columns=DF_HOTEL_COLS)

    df_hotel.to_excel(out_dir / 'HotelData_{}.xlsx'.format(strftime("%Y%m%d-%H%M")), index=False)
    print('saved df.....')

In [None]:
venues = ["DEL","BOM","HYD","MAA","CCU","IXC","BLR","DHM","JAI"]

# Airport code -> Kayak hotel-search location slug used in the hotels URL.
venues_directore = {
                    "DEL":"New-Delhi,National-Capital-Territory-of-India,India-p14282",
                    "BOM":"Mumbai,Maharashtra,India-p15926",
                    "HYD":"Hyderabad,Telangana,India-p15297",
                    "MAA":"Chennai,Tamil-Nadu,India-p14679",
                    "CCU":"Kolkata,West-Bengal,India-p15904",
                    "IXC":"Chandigarh,India-p15870",
                    "BLR":"Bengaluru,Karnataka,India-p16183",
                    "DHM" : "Dharamshala,Himachal-Pradesh,India-p15662",
                    "JAI" : "Jaipur,Rajasthan,India-p15247"
                    }
base_date = datetime.datetime(year=2024,month = 5, day=24)
MATCH_DAYS = 54
# One extra date so each of the MATCH_DAYS nights has a following check-out date.
match_dates = [(base_date+datetime.timedelta(days=i)).date() for i in range(MATCH_DAYS+1)]

driver = driversetup()
try:
    start_hotel_scraping(driver,list(venues_directore.values()), match_dates)
finally:
    # quit() ends the WebDriver session and its driver process even when the
    # scrape raises; close() only closes the current window.
    driver.quit()

Finding cheapest results.....
https://www.kayak.co.in/hotels/New-Delhi,National-Capital-Territory-of-India,India-p14282/2024-05-24/2024-05-25/2adults;map?sort=price_a&fs=stars=5;hotelchain=br-146,br-411
31.405624628067017 Seconds elaspes for last record
https://www.kayak.co.in/hotels/New-Delhi,National-Capital-Territory-of-India,India-p14282/2024-05-25/2024-05-26/2adults;map?sort=price_a&fs=stars=5;hotelchain=br-146,br-411
30.81840968132019 Seconds elaspes for last record
https://www.kayak.co.in/hotels/New-Delhi,National-Capital-Territory-of-India,India-p14282/2024-05-26/2024-05-27/2adults;map?sort=price_a&fs=stars=5;hotelchain=br-146,br-411
30.978936672210693 Seconds elaspes for last record
https://www.kayak.co.in/hotels/New-Delhi,National-Capital-Territory-of-India,India-p14282/2024-05-27/2024-05-28/2adults;map?sort=price_a&fs=stars=5;hotelchain=br-146,br-411
30.745112657546997 Seconds elaspes for last record
https://www.kayak.co.in/hotels/New-Delhi,National-Capital-Territory-of-Indi