In this notebook, we scrape rented apartments from airbnb and carpages.ca and we will deal with multiple pages.

# airbnb

In [42]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

In [55]:
text = requests.get('https://www.airbnb.com/s/Honolulu--HI--United-States/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_lengths%5B%5D=one_week&query=Honolulu%2C%20HI%2C%20USA&place_id=ChIJTUbDjDsYAHwRbJen81_1KEs&date_picker_type=calendar&checkin=2022-06-22&checkout=2022-06-29&source=structured_search_input_header&search_type=user_map_move&ne_lat=21.284659962868464&ne_lng=-157.7829019882003&sw_lat=21.239866562829675&sw_lng=-157.84943793895508&zoom=14&search_by_map=true').text
soup = BeautifulSoup(text, 'lxml')

In [56]:
columns = ['title', 'night', 'total', 'bed', 'rating']
df = pd.DataFrame(columns=columns)

table = soup.find_all(class_='c4mnd7m dir dir-ltr') 
while True:

    
    for post in table:
        sleep(0.1)
        title = post.find('meta', {'itemprop':'name'}).get('content')
        night = post.find(class_='_tyxjp1').text
        total = post.find(class_='_tt122m').text
        bed = post.find(class_='f15liw5s s1cjsi4j dir dir-ltr').text
        rating = post.find(class_='ru0q88m dir dir-ltr').text
        
        new_index = len(df)
        df.loc[new_index] = [title, night, total, bed, rating]  
    
    main = 'https://www.airbnb.com'
    sleep(0.5)
    try:
        next_url = soup.find('a', {'aria-label':'Next'}).get('href')
    except AttributeError:
        print('It is done!')
        break
    url = main + next_url
    
    
    text = requests.get(url).text
    soup = BeautifulSoup(text, 'lxml')
    
    
    
        
        
        

It is done!


In [193]:
df_copy = df

Now remove the dollar symbol and 'total' from night and total columns, and convert night, total, and rating columns into float

In [194]:
def convert(entry):
    remove = ['$', 'total', ',']
    for item in remove:
        entry = entry.replace(item, '')
        entry = entry.replace('New', 0)
    return float(entry)

In [196]:
clean = ['night', 'total', 'rating']
for col in clean:
    df[col] = df[col].apply(lambda x: convert(x))

df

Unnamed: 0,title,night,total,bed,rating
0,"Waikiki Shore 515 (elegant upgraded) ""oceanvi...",206.0,1570.0,1 bed,4.71
1,"Hale Paradiso|Stylish, Boho Studio by the Ocean",234.0,1779.0,2 beds,New
2,🤙WAIKIKI BEACH-Legal Rental🤙,152.0,1167.0,1 queen bed,4.68
3,Waikiki Grand Hotel with Private Balcony!,148.0,1186.0,1 bed,New
4,"Pool View, 1 King Bed OR 2 Doubles, Pagoda Wai...",112.0,833.0,1 king bed,4.29
...,...,...,...,...,...
295,Heart of Waikiki Ocean View 4U!,146.0,1193.0,,4.6
296,Waikiki Partial Ocean Kitchenette w/ 1 Queen Bed,218.0,1523.0,2 beds,4.48
297,Perfect for Pairs! Modern Beach Studio w/Kitch...,283.0,2173.0,1 king bed,4.85
298,Bamboo Hotel - Privately owned Queen Studio wi...,149.0,1193.0,1 queen bed,4.5


In [199]:
df.describe()

Unnamed: 0,night,total
count,300.0,300.0
mean,180.35,1391.5
std,55.62681,410.180664
min,82.0,673.0
25%,147.5,1159.75
50%,158.5,1244.0
75%,222.75,1742.25
max,283.0,2173.0


In [224]:
# Find the place with the highest rating
df[df.rating != 'New'].sort_values(by='rating', ascending=False).head(1)

Unnamed: 0,title,night,total,bed,rating
266,THE TIKI SUITE at WAIKIKI BANYAN,264.0,2000.0,2 beds,4.93


In [216]:
df[df.rating != 'New']

Unnamed: 0,title,night,total,bed,rating
0,"Waikiki Shore 515 (elegant upgraded) ""oceanvi...",206.0,1570.0,1 bed,4.71
2,🤙WAIKIKI BEACH-Legal Rental🤙,152.0,1167.0,1 queen bed,4.68
4,"Pool View, 1 King Bed OR 2 Doubles, Pagoda Wai...",112.0,833.0,1 king bed,4.29
5,Female only Waikiki Ocean View One Bedroom,82.0,673.0,2 queen beds,4.6
6,THE TIKI SUITE at WAIKIKI BANYAN,264.0,2000.0,2 beds,4.93
...,...,...,...,...,...
295,Heart of Waikiki Ocean View 4U!,146.0,1193.0,,4.6
296,Waikiki Partial Ocean Kitchenette w/ 1 Queen Bed,218.0,1523.0,2 beds,4.48
297,Perfect for Pairs! Modern Beach Studio w/Kitch...,283.0,2173.0,1 king bed,4.85
298,Bamboo Hotel - Privately owned Queen Studio wi...,149.0,1193.0,1 queen bed,4.5


In [213]:
df.rating[0]

'4.71'

# carpages.ca

Now, crawl a Canadian car dealer website called carpages.ca and pull the data about cars, including its price, model, and the link where we can buy the specified model.

In [325]:
text = requests.get('https://www.carpages.ca/used-cars/search/?fueltype_id%5B0%5D=3&fueltype_id%5B1%5D=7').text
soup = BeautifulSoup(text, 'lxml')

columns = ['model', 'price', 'color', 'km', 'sales', 'address', 'link']
car_df = pd.DataFrame(columns=columns)

while len(car_df) < 2500:
    tables = soup.find_all(class_='media soft push-none rule')
    for post in tables:
            model = post.find(class_='hN').find('a').get('title')
            link = 'https://www.carpages.ca/' + post.find(class_='hN').find('a').get('href')
            price = post.find('strong', class_='delta').text.strip().replace('$', '')
            if 'CALL' in price: # so that we have cars whose prices are specified (or null)
                break
            color = post.find_all(class_='grey l-column l-column--small-6 l-column--medium-4')[1].text.strip()
            km = post.find_all(class_='grey l-column l-column--small-6 l-column--medium-4')[0].text.strip()
            sales = post.find_all(class_='vehicle__card--dealerInfo')[0].h5.text
            address = post.find_all(class_='vehicle__card--dealerInfo')[0].p.text
            df_index = len(car_df)
            car_df.loc[df_index] = [model, price, color, km, sales, address, link]
    
    url = 'https://www.carpages.ca/'
    try:
        new_path = soup.find_all(class_='nextprev')[-1].get('href')
    except AttributeError:
        print('It is done!')
        break
    new_url = url + new_path
    
    text = requests.get(new_url).text
    soup = BeautifulSoup(text, 'lxml')
    
        
        

In [327]:
car_df.to_csv(r'C:\Users\Lenovo\Desktop\datasets\carpages.csv')

In [322]:
car_df

Unnamed: 0,model,price,color,km,sales,address,link
0,2022 Audi Q8 TECHNIK,,Glacier White Metallic,90KM,Audi Winnipeg,"Winnipeg, MB",https://www.carpages.ca//new-cars/manitoba/win...
1,2022 Audi A5 Sportback Technik,,Ibis White,90KM,Audi Winnipeg,"Winnipeg, MB",https://www.carpages.ca//new-cars/manitoba/win...
2,2022 Audi A3 Sedan Technik,,Mythos Black Metallic,90KM,Audi Winnipeg,"Winnipeg, MB",https://www.carpages.ca//new-cars/manitoba/win...
3,2018 Porsche Panamera 4,,JET BLACK METALLIC,"14,000KM",Steele Auto Group,"Halifax, NS",https://www.carpages.ca//used-cars/nova-scotia...
4,2022 Ford Transit Cargo Van,,Oxford White,16KM,Kelleher Ford,"Brandon, MB",https://www.carpages.ca//new-cars/manitoba/bra...
...,...,...,...,...,...,...,...
2525,2020 Ford Escape Titanium Hybrid,,Gray,"32,180KM",Lincoln Heights Ford,"Ottawa, ON",https://www.carpages.ca//used-cars/ontario/ott...
2526,2022 RAM 1500,,Silver,CALL,Columbia Chrysler,"Richmond, BC",https://www.carpages.ca//new-cars/british-colu...
2527,2022 Land Rover Range Rover SPORT,,,2KM,Jaguar-Land Rover Edmonton,"Edmonton, AB",https://www.carpages.ca//new-cars/alberta/edmo...
2528,2022 Kia Sorento HEV EX,,Runway Red,90KM,Plaza Kia,"Richmond Hill, ON",https://www.carpages.ca//new-cars/ontario/rich...
