# Web scrape LA Apartments

In [1]:
#import get to call a get request on the site
from requests import get

#get the first page of the east bay housing prices
response = get('https://losangeles.craigslist.org/search/lgb/apa') #get rid of those lame-o's that post a housing option without a pic using their filter

from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, 'html.parser')

#get the macro-container for the housing posts
posts = html_soup.find_all('li', class_= 'result-row')
print(type(posts)) #to double check that I got a ResultSet
print(len(posts)) #to double check I got 120 (elements/page)

<class 'bs4.element.ResultSet'>
120


In [2]:
#grab the first post
post_one = posts[0]

In [3]:
#grab the price of the first post
post_one_price = post_one.a.text
post_one_price.strip()

'$1,750'

In [4]:
#grab the time of the post in datetime format to save on cleaning efforts
post_one_time = post_one.find('time', class_= 'result-date')
post_one_datetime = post_one_time['datetime']

In [5]:
#title is a and that class, link is grabbing the href attribute of that variable
post_one_title = post_one.find('a', class_='result-title hdrlnk')
post_one_link = post_one_title['href']

#easy to grab the post title by taking the text element of the title variable
post_one_title_text = post_one_title.text

In [6]:
#grabs the whole segment of housing details. We will need missing value handling in the loop as this kind of detail is not common in posts
#the text can be split, and we can use indexing to grab the elements we want. number of bedrooms is the first element.
#sqft is the third element

post_one_num_bedrooms = post_one.find('span', class_ = 'housing').text.split()[0]


In [7]:
#the neighborhood is grabbed by finding the span class 'result-hood' and pulling the text element from that
post_one_hood = posts[0].find('span', class_='result-hood').text

In [25]:
#build out the loop
from time import sleep
import re
from random import randint #avoid throttling by not sending too many requests one after the other
from warnings import warn
from time import time
from IPython.core.display import clear_output
import numpy as np

#find the total number of posts to find the limit of the pagination
results_num = html_soup.find('div', class_= 'search-legend')
results_total = int(results_num.find('span', class_='totalcount').text) #pulled the total count of posts as the upper bound of the pages array

#each page has 119 posts so each new page is defined as follows: s=120, s=240, s=360, and so on. So we need to step in size 120 in the np.arange function
pages = np.arange(0, results_total+1, 120)

iterations = 0

post_timing = []
post_hoods = []
post_title_texts = []
bedroom_counts = []
sqfts = []
post_links = []
post_prices = []

for page in pages:
    
    #get request
    response = get("https://losangeles.craigslist.org/search/lgb/apa?" 
                   + "s=" #the parameter for defining the page number 
                   + str(page) #the page number in the pages array from earlier
                 #  + "&hasPic=1"
                  # + "&availabilityMode=0"
                  )

    sleep(randint(1,5))
     
    #throw warning for status codes that are not 200
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        
    #define the html text
    page_html = BeautifulSoup(response.text, 'html.parser')
    
    #define the posts
    posts = page_html.find_all('li', class_= 'result-row')
        
    #extract data item-wise
    for post in posts:

        if post.find('span', class_ = 'result-hood') is not None:

            #posting date
            #grab the datetime element 0 for date and 1 for time
            post_datetime = post.find('time', class_= 'result-date')['datetime']
            post_timing.append(post_datetime)

            #neighborhoods
            post_hood = post.find('span', class_= 'result-hood').text
            post_hoods.append(post_hood)

            #title text
            post_title = post.find('a', class_='result-title hdrlnk')
            post_title_text = post_title.text
            post_title_texts.append(post_title_text)

            #post link
            post_link = post_title['href']
            post_links.append(post_link)
            
            #removes the \n whitespace from each side, removes the currency symbol, and turns it into an int
            post_price = post.a.text.strip().replace("$", "")
            post_prices.append(post_price)
            
            if post.find('span', class_ = 'housing') is not None:
                
                #if the first element is accidentally square footage
                if 'ft2' in post.find('span', class_ = 'housing').text.split()[0]:
                    
                    #make bedroom nan
                    bedroom_count = np.nan
                    bedroom_counts.append(bedroom_count)
                    
                    #make sqft the first element
                    sqft = int(post.find('span', class_ = 'housing').text.split()[0][:-3])
                    sqfts.append(sqft)
                    
                #if the length of the housing details element is more than 2
                elif len(post.find('span', class_ = 'housing').text.split()) > 2:
                    
                    #therefore element 0 will be bedroom count
                    bedroom_count = post.find('span', class_ = 'housing').text.replace("br", "").split()[0]
                    bedroom_counts.append(bedroom_count)
                    
                    #and sqft will be number 3, so set these here and append
                    sqft = int(post.find('span', class_ = 'housing').text.split()[2][:-3])
                    sqfts.append(sqft)
                    
                #if there is num bedrooms but no sqft
                elif len(post.find('span', class_ = 'housing').text.split()) == 2:
                    
                    #therefore element 0 will be bedroom count
                    bedroom_count = post.find('span', class_ = 'housing').text.replace("br", "").split()[0]
                    bedroom_counts.append(bedroom_count)
                    
                    #and sqft will be number 3, so set these here and append
                    sqft = np.nan
                    sqfts.append(sqft)                    
                
                else:
                    bedroom_count = np.nan
                    bedroom_counts.append(bedroom_count)
                
                    sqft = np.nan
                    sqfts.append(sqft)
                
            #if none of those conditions catch, make bedroom nan, this won't be needed    
            else:
                bedroom_count = np.nan
                bedroom_counts.append(bedroom_count)
                
                sqft = np.nan
                sqfts.append(sqft)
            #    bedroom_counts.append(bedroom_count)
                
            #    sqft = np.nan
            #    sqfts.append(sqft)
                
    iterations += 1
    print("Page " + str(iterations) + " scraped successfully!")

print("\n")

print("Scrape complete!")

  from IPython.core.display import clear_output


Page 1 scraped successfully!
Page 2 scraped successfully!
Page 3 scraped successfully!
Page 4 scraped successfully!
Page 5 scraped successfully!
Page 6 scraped successfully!
Page 7 scraped successfully!
Page 8 scraped successfully!
Page 9 scraped successfully!
Page 10 scraped successfully!
Page 11 scraped successfully!
Page 12 scraped successfully!
Page 13 scraped successfully!
Page 14 scraped successfully!
Page 15 scraped successfully!
Page 16 scraped successfully!
Page 17 scraped successfully!
Page 18 scraped successfully!
Page 19 scraped successfully!
Page 20 scraped successfully!


Scrape complete!


In [28]:
count = 0
l1=[]
for item in post_timing:
    if item not in l1:
        count += 1
        l1.append(item)
        
print("No of unique items are:", count)

No of unique items are: 2158


In [26]:
import pandas as pd

lgb_apts = pd.DataFrame({'posted': post_timing,
                       'neighborhood': post_hoods,
                       'post title': post_title_texts,
                       'number bedrooms': bedroom_counts,
                        'sqft': sqfts,
                        'URL': post_links,
                       'price': post_prices})
lgb_apts.head()

Unnamed: 0,posted,neighborhood,post title,number bedrooms,sqft,URL,price
0,2022-05-22 22:54,(long beach / 562),"2/bd, Controlled-entry access, Long Beach CA",2,1075.0,https://losangeles.craigslist.org/lgb/apa/d/lo...,3241
1,2022-05-22 22:17,(long beach / 562),"2bd 2ba, Resident social lounge, Controlled-en...",2,971.0,https://losangeles.craigslist.org/lgb/apa/d/lo...,3038
2,2022-05-22 22:09,"(8121 Broadway Blvd, Los Angeles, CA)","Wall A/C units, Cable Ready, Intercom system",2,811.0,https://losangeles.craigslist.org/lgb/apa/d/wh...,2420
3,2022-05-22 21:34,"(5910 Orange Ave, Apt.# D, Long Beach, CA 90805)",Will work with bad credit - 1 br/ 1 ba,1,,https://losangeles.craigslist.org/lgb/apa/d/lo...,1750
4,2022-05-22 21:31,(long beach / 562),1 BED 1 BATH MOVE IN READY,1,700.0,https://losangeles.craigslist.org/lgb/apa/d/no...,1891


In [27]:
lgb_apts['post title'].nunique()

1626

In [13]:
la_apts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3120 entries, 0 to 3119
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   posted           3120 non-null   object 
 1   neighborhood     3120 non-null   object 
 2   post title       3120 non-null   object 
 3   number bedrooms  2366 non-null   object 
 4   sqft             2340 non-null   float64
 5   URL              3120 non-null   object 
 6   price            3120 non-null   object 
dtypes: float64(1), object(6)
memory usage: 170.8+ KB


In [30]:
# #import requests
# def get_latlong(url):
#     r = requests.get(url)
#     soup = BeautifulSoup(r.content)
#     if 'data-latitude' in r.text:
#    # link = soup.find('div', class_='viewposting')
#     lat = link['data-latitude']
#     lon = link['data-longitude']
#     return [lat, lon]
#     #else:
#       #  return [None, None]

# #la_apts['lat'] = la_apts.head(10).URL.apply(get_latlong)

In [23]:
la_apts.head(10)

Unnamed: 0,posted,neighborhood,post title,number bedrooms,sqft,URL,price,latlon,lat
0,2022-05-18 12:56,(Los Angeles),2 BR/1BR cottage house for rent,2.0,,https://losangeles.craigslist.org/lac/apa/d/lo...,2600,,"[None, None]"
1,2022-05-18 12:56,"(400 S Main St Los Angeles, CA)",Be Ready To Be Wowed. Huge Urban Loft. Great C...,,1420.0,https://losangeles.craigslist.org/lac/apa/d/lo...,3230,,"[None, None]"
2,2022-05-18 12:56,(Hollywood),Spacious 2 Bedroom/2 Bath ~ Coming June!,2.0,1131.0,https://losangeles.craigslist.org/lac/apa/d/lo...,3327,,"[None, None]"
3,2022-05-18 12:54,"(909 W. Temple St., Los Angeles, CA)",City view and plenty of amenities!!!,2.0,948.0,https://losangeles.craigslist.org/lac/apa/d/lo...,2995,,"[None, None]"
4,2022-05-18 12:49,(North Westlake),"Wood Inspired Floors, Gym, Newly Built 1 Bedro...",1.0,1075.0,https://losangeles.craigslist.org/lac/apa/d/lo...,2300,,"[None, None]"
5,2022-05-18 12:48,(CSUN / Northridge / Granada Hills),Start Summer off in your new home!,2.0,1185.0,https://losangeles.craigslist.org/lac/apa/d/no...,2885,,"[None, None]"
6,2022-05-18 12:48,(Los Angeles),Preleasing Available for move-in June 4th 2022,,,https://losangeles.craigslist.org/lac/apa/d/lo...,1295,,"[None, None]"
7,2022-05-18 12:48,(CSUN / Northridge / Granada Hills),One Bedroom Coming Available Soon! Great Locat...,1.0,837.0,https://losangeles.craigslist.org/lac/apa/d/no...,2310,,"[None, None]"
8,2022-05-18 12:47,(Los Angeles),Vinyl Plank Floors || Renovated 2+1 in West Ad...,2.0,1000.0,https://losangeles.craigslist.org/lac/apa/d/lo...,2350,,"[None, None]"
9,2022-05-18 12:47,(Highland Park/Eagle Rock),Brand New 1 Bed 1.5 BA in Eagle Rock Adj*Reces...,1.0,744.0,https://losangeles.craigslist.org/lac/apa/d/lo...,3000,,"[None, None]"


In [7]:
lgb_apts.to_csv('Data/lgb_listings.csv')