# Scraping AirBnB Listing Images with Selenium WebDriver

Prerequisites:


*   CSV file (`listings.csv`) containing all AirBnB Listing IDs to scrape



In [None]:
!pip install kora -q

import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
import bs4 as soup
from kora.selenium import wd
import csv

# Preparation

Load the listing IDs from `listings.csv` into a list

In [None]:
# Read the listing IDs from the semicolon-delimited CSV file.
# csv.reader yields every row as a list of fields; store only the ID string
# (assumed to be the first field of each row -- TODO confirm against the
# actual layout of listings.csv) so it can later be appended to a URL and used
# as a dictionary key. Blank lines, which come back as empty rows, are skipped.
with open('listings.csv', newline='') as f:
    id_list = [row[0] for row in csv.reader(f, delimiter=';') if row]

# Image Scraping

(**NOTE:** This will take several hours!)

In [9]:
# Let the driver wait up to 3 seconds for elements before giving up.
wd.implicitly_wait(3)
# AirBnB base URL; the listing ID is appended to build each page URL.
airbnb_base_url = "https://www.airbnb.co.in/rooms/"

# Maps each listing ID to the list of unique image URLs found on its page.
airbnb_images = {}

for entry in id_list:
    # Entries may be plain IDs or whole CSV rows (csv.reader yields lists of
    # fields). For a row, use its first field as the ID so that the URL is
    # well-formed and the ID is hashable; empty rows are skipped.
    if isinstance(entry, (list, tuple)):
        if not entry:
            continue
        listing_id = entry[0]
    else:
        listing_id = entry

    wd.get(airbnb_base_url + str(listing_id))
    # The images are exposed through the "srcset" attribute of <source> tags.
    # find_elements(by, value) is used because the find_elements_by_* helpers
    # are no longer available in recent Selenium releases.
    sources = wd.find_elements("css selector", "source")
    if not sources:
        # Listings without any <source> element are left out of the result.
        continue

    image_urls = []
    for source in sources:
        try:
            img_url = source.get_attribute('srcset')
        except Exception as exc:
            # Best effort: keep what was collected so far for this listing.
            # Catching Exception (not a bare except) keeps Ctrl-C working
            # during the long-running scrape.
            print(f"Listing {listing_id}: stopped reading sources ({exc!r})")
            break
        if img_url is not None:
            # Keep only the first 76 characters of the srcset value --
            # presumably the bare image URL without size parameters;
            # NOTE(review): verify against the current AirBnB markup.
            image_urls.append(img_url[:76])

    # Drop duplicates once per listing, keeping first-seen order.
    airbnb_images[listing_id] = list(dict.fromkeys(image_urls))

Save to CSV file

In [11]:
# Write one row per listing: the listing ID followed by a single field that
# holds all of its image URLs joined by commas.
# newline='' hands line-ending control to the csv module, which otherwise
# produces blank rows between records on Windows.
with open('output.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(
        [key, ",".join(val)] for key, val in airbnb_images.items()
    )

# Cleanup

Load into dataframe and inspect

In [None]:
# Load the scraped results back from disk for cleanup.
# NOTE(review): output.csv is written above without a header row, so read_csv
# will use the first line of the file as column names, while the cells below
# expect a column called 'id' -- confirm the file really has that header.
df_img = pd.read_csv("output.csv")

In [None]:
# Show summary statistics of the loaded data as the cell output.
df_img.describe()

In [None]:
# Preview the first rows of the raw, freshly loaded data.
df_img.head()

Remove the quotes and separate the image links with spaces instead of commas

In [None]:
# Turn the commas in front of each image link into spaces, then drop all
# double quotes. Both patterns are literal text, so regex matching is switched
# off explicitly instead of relying on the default of `regex`, which differs
# between pandas versions.
df_img['id'] = (
    df_img['id']
    .str.replace(',https', ' https', regex=False)
    .str.replace('"', '', regex=False)
)

In [None]:
# Preview the rows after the quote/comma replacements.
df_img.head()

Row correction: split each row into separate `id` and `images` columns (before this step the data was not in a proper CSV format)

In [None]:
# Split every row at its first comma: the text before it becomes 'id' and the
# remainder becomes 'images'. n=1 caps the result at two columns, so a stray
# extra comma in a row cannot break the two-column assignment.
df_img[['id', 'images']] = df_img['id'].str.split(pat=',', n=1, expand=True)

In [None]:
# Preview the rows after splitting into the 'id' and 'images' columns.
df_img.head()

Write the cleaned data to the output CSV file

In [None]:
# Save the cleaned dataframe to 'nycairbnb_imgs.csv'; index=False leaves the
# dataframe index out of the file.
df_img.to_csv('nycairbnb_imgs.csv', index=False)