In [8]:
import requests
import pandas as pd
import re
import numpy as np
from tqdm import tqdm
from bs4 import BeautifulSoup

In [9]:
def webpage(page_number):
    return 'https://www.renthop.com/search/washington-dc?max_price=50000&min_price=0&page=' + str(page_number) + '&sort=hopscore&q=&search=0'

In [11]:
#This one gets all the pages with 20 listings on each page (must iterate over these)

pages = []

for page in tqdm(range(1,699)):
    url = webpage(str(page))
    pages.append(url)
print("done")


  0%|          | 0/698 [00:00<?, ?it/s][A
100%|██████████| 698/698 [00:00<00:00, 220138.67it/s]

done


[A

In [13]:
#This one creates a list with all the apartment listing links (the individual links for each apt)

duplinks = []
addressdups = []
latitudes = []
longitudes = []
listing_id = []
bed_baths = []
sqft = []
prices = []

for page in tqdm(pages):
    url = page
    r = requests.get(url)
    html_doc = r.text
    soup = BeautifulSoup(html_doc, 'html.parser')
    for element1 in soup.find_all('a', href=re.compile('^https://www.renthop.com/apartments/')):
        link = element1.get('href')
        duplinks.append(link)
    for element2 in soup.find_all('a', href=re.compile('^https://www.renthop.com/apartments/')):
        address_name = element2.get_text(strip=True)
        addressdups.append(address_name)
    for element3 in soup.find_all('div', latitude=re.compile('.')):
        lats = element3.get('latitude')
        latitudes.append(lats)
    for element4 in soup.find_all('div', longitude=re.compile('.')):
        longs = element4.get('longitude')
        longitudes.append(longs)
    for element5 in soup.find_all('div', listing_id=re.compile('.')):
        id_number = element5.get('listing_id')
        listing_id.append(id_number)
    for element6 in soup.find_all('td', attrs={"class":"font-size-10 bold"}): #this gets bed/bath
        bb = element6.get_text("|", strip=True)
        bb1 = bb.split("|")
        bed_baths.append(bb1)
    for element7 in soup.find_all('div', attrs={"style":"margin: 10px 0px 0px 0px;"}):
        area = element7.get_text(strip=True)
        area1 = area.replace(",", "")
        
        area2 = re.findall('\d+', area1)
        if len(area2) > 0:
            sqft.append(area2[0])
        else:
            sqft.append(float('nan'))
    for element8 in soup.find_all('td', attrs={"class":"font-size-11 bold color-fg-green"}):
        pr = element8.get_text(strip=True)
        prices.append(pr)
print("done")


  0%|          | 0/698 [00:00<?, ?it/s][A
  0%|          | 1/698 [00:00<11:31,  1.01it/s][A
  0%|          | 2/698 [00:01<10:46,  1.08it/s][A
  0%|          | 3/698 [00:02<09:49,  1.18it/s][A
  1%|          | 4/698 [00:03<09:16,  1.25it/s][A
  1%|          | 5/698 [00:04<10:20,  1.12it/s][A
  1%|          | 6/698 [00:05<12:51,  1.11s/it][A
  1%|          | 7/698 [00:07<14:07,  1.23s/it][A
  1%|          | 8/698 [00:08<13:25,  1.17s/it][A
  1%|▏         | 9/698 [00:09<11:42,  1.02s/it][A
100%|██████████| 698/698 [10:11<00:00,  1.47it/s]

done





In [14]:
#Parse the bed and beds to get a column for each

BB1 = [item[0] for item in bed_baths]

beds = BB1[0::2]
baths = BB1[1::2]

In [15]:
masterlistlinks = duplinks[1::2] #links of individual pages
address_titles = list(addressdups[1::2]) #title address of individual pages

In [17]:
#Create the dataframe

cols = [masterlistlinks, address_titles, latitudes, 
        longitudes, listing_id, sqft, prices, beds, baths]
column_names = ['listlinks', 'title_address', 'lats', 
                'longs', 'ids', 'sqft', 'rent_price', 'bedrooms', 'bathrooms']

df = pd.DataFrame(np.column_stack(cols), columns=column_names)

#Remove dollar sign, so eventually can use in machine learning model
df['rent_price'] = df['rent_price'].replace({'\$': '', ',': ''}, regex=True)

In [18]:
#Scraping an individual apartment's page:

all_amenities = []

df.set_index('listlinks', inplace=True)
df.loc[:, 'amenities'] = None # creating the "placeholder" for amenities

df.drop_duplicates(inplace=True)

for item in tqdm(masterlistlinks):
    
    item_amenities = []
    
    site = item
    r1 = requests.get(site)
    html_doc1 = r1.text
    sitesoup = BeautifulSoup(html_doc1, 'html.parser')
    for element in sitesoup.find_all('div', attrs={"style":"margin: 0px 0px 5px 0px;"}):
        amen = element.get_text(strip=True)
        item_amenities.append(amen)
    
    df.loc[item, 'amenities'] = item_amenities
    
    all_amenities += item_amenities
    
df.reset_index(inplace=True)    
print("done")

100%|██████████| 13652/13652 [2:01:03<00:00,  2.26it/s]   

done





In [20]:
#Turn all of the amenities into dummy columns. However, the duplicates must be removed

dum_col = list(set(all_amenities))

In [21]:
# Creates a placeholder for dummy variable column

for col in dum_col:
    df[col] = None
print('done')

done


In [22]:
# Creating the dummy variables for actual dataframe

row_index = 0

for feats in tqdm(df['amenities']): #gets individual list in feature column  
    try:
        for feature in feats: #gets individual element of each list in feature column
            colpos = df.columns.get_loc(feature) #gets the column where each element occurs
            df.iloc[row_index,colpos] = 1
    except Exception as e:
        print(e)
        print(nitem)    
    row_index += 1
    

100%|██████████| 11725/11725 [00:19<00:00, 616.29it/s]


In [23]:
df.loc[:, dum_col] = df.loc[:, dum_col].fillna(0) #fill missing values with 0

In [25]:
# Make all values in bedroom numeric -- Changed studio to 0.5
df['bedrooms'] = df['bedrooms'].replace('Studio', 0.5)

In [26]:
#Replace empty string with numpy NaN value
df = df.replace('', np.nan).replace('nan', np.nan)

In [None]:
df.head()

In [30]:
df.describe()

Unnamed: 0,Dogs Allowed,Cable/Satellite Ready,Art Deco,Mid rise,9 Ft. Ceilings,Upgraded Flooring,French Doors,Abundant shopping and dining,Basketball court,Spacious club lounge,...,Park-like setting,Military housing,Smoke Free Community,Eat-In Kitchens,Pet rent of $50,Lovely Rock Creek Park Views,24-Hour Front Desk Attendant,Vaulted Ceilings,Generous closets space,Handrails
count,11725.0,11725.0,11725.0,11725.0,11725.0,11725.0,11725.0,11725.0,11725.0,11725.0,...,11725.0,11725.0,11725.0,11725.0,11725.0,11725.0,11725.0,11725.0,11725.0,11725.0
mean,0.030362,0.000256,0.001535,0.004179,8.5e-05,0.000256,8.5e-05,0.000171,0.000512,0.000171,...,0.000256,0.000256,0.000256,0.000597,0.000171,0.000171,0.000171,0.000171,0.000171,0.000426
std,0.17159,0.015994,0.039153,0.064514,0.009235,0.015994,0.009235,0.01306,0.022617,0.01306,...,0.015994,0.015994,0.015994,0.024428,0.01306,0.01306,0.01306,0.01306,0.01306,0.020647
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [33]:
#Turn latitude and longitude columns into numeric values

df['longs'] = pd.to_numeric(df['longs'])
df['lats'] = pd.to_numeric(df['lats'])

In [34]:
df.head()

Unnamed: 0,listlinks,title_address,lats,longs,ids,sqft,rent_price,bedrooms,bathrooms,amenities,...,Park-like setting,Military housing,Smoke Free Community,Eat-In Kitchens,Pet rent of $50,Lovely Rock Creek Park Views,24-Hour Front Desk Attendant,Vaulted Ceilings,Generous closets space,Handrails
0,https://www.renthop.com/apartments/2400-pennsy...,2400 Pennsylvania Ave Nw,38.9029,-77.0518,17137566,,1630,0.5,1.0,[],...,0,0,0,0,0,0,0,0,0,0
1,https://www.renthop.com/apartments/4905-southl...,4905 Southland Ave,38.818,-77.1479,2007984,1160.0,1975,3.0,2.0,"[Featured, Cats Allowed, Dogs Allowed, 24 Hour...",...,0,0,0,0,0,0,0,0,0,0
2,https://www.renthop.com/apartments/9704-clark-...,9704 Clark Pl,38.7434,-77.4669,2059874,515.0,1102,1.0,1.5,"[Featured, Cats Allowed, Dogs Allowed, Accepts...",...,0,0,0,0,0,0,0,0,0,0
3,https://www.renthop.com/apartments/1002-kenneb...,1002 Kennebec Street,38.8194,-76.9922,3427733,989.0,1529,3.0,2.5,"[Featured, Cats Allowed, Dogs Allowed, Air con...",...,0,0,0,0,0,0,0,0,0,0
4,https://www.renthop.com/apartments/5601-regenc...,5601 Regency Park Court,38.8387,-76.9098,2096152,989.0,1299,2.0,1.5,"[Featured, Air conditioning, Business center o...",...,0,0,0,0,0,0,0,0,0,0


In [37]:
#Save DataFrame to Excel

from pandas import ExcelWriter
writer = ExcelWriter('Rent_Hop_DC2.xlsx')
df.to_excel(writer)
writer.save()