In [13]:
ymd  = '20250615'
city = 'London'
host = 'https://orca.casa.ucl.ac.uk'
url  = f'{host}/~jreades/data/{ymd}-{city}-listings.csv.gz'

In [14]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

from pathlib import Path
from requests import get
from functools import wraps

def check_cache(f):
    @wraps(f)
    def wrapper(src:str, dst:str, min_size=100) -> Path:
        if src.find('?') == -1:
            url = Path(src)
        else:
            url = Path(src[:src.find('?')])
        fn  = url.name  # Extract the filename
        dsn = Path(f"{dst}/{fn}") #Â Destination filename
        if dsn.is_file() and dsn.stat().st_size > min_size:
            print(f"+ {dsn} found locally!")
            return(dsn)
        else:
            print(f"+ {dsn} not found, downloading!")
            return(f(src, dsn))
    return wrapper

@check_cache
def cache_data(src:Path, dst:Path) -> str:
    """Downloads a remote file.
    
    The function sits between the 'read' step of a pandas or geopandas
    data frame and downloading the file from a remote location. The idea
    is that it will save it locally so that you don't need to remember to
    do so yourself. Subsequent re-reads of the file will return instantly
    rather than downloading the entire file for a second or n-th itme.
    
    Parameters
    ----------
    src : str
        The remote *source* for the file, any valid URL should work.
    dst : str
        The *destination* location to save the downloaded file.
        
    Returns
    -------
    str
        A string representing the local location of the file.
    """
      
    # Create any missing directories in dest(ination) path
    # -- os.path.join is the reverse of split (as you saw above)
    # but it doesn't work with lists... so I had to google how
    # to use the 'splat' operator! os.makedirs creates missing
    # directories in a path automatically.
    if not dst.parent.exists():
        dst.parent.mkdir(parents=True, exist_ok=True)
        
    # Download and write the file
    with dst.open(mode='wb') as file:
        response = get(src)
        file.write(response.content)
        
    print(' + Done downloading...')

    return dst.resolve()

In [12]:
ddir = Path("data/raw")                  # use relative path
listings_file = ddir / f"{ymd}-{city}-listings.csv.gz"   # MUST match remote extension

In [15]:
#  Download + Load CSV into DataFrame
listings_path = cache_data(url, listings_file)

+ data/raw/20250615-London-listings.csv.gz/20250615-London-listings.csv.gz not found, downloading!
 + Done downloading...


In [16]:
listings = pd.read_csv(listings_path)
print("Listings loaded:")
print(listings.head())
print("Shape:", listings.shape)

Listings loaded:
       id                          listing_url       scrape_id last_scraped  \
0  264776  https://www.airbnb.com/rooms/264776  20250610032232   2025-06-11   
1  264777  https://www.airbnb.com/rooms/264777  20250610032232   2025-06-11   
2  264778  https://www.airbnb.com/rooms/264778  20250610032232   2025-06-11   
3  264779  https://www.airbnb.com/rooms/264779  20250610032232   2025-06-11   
4  264780  https://www.airbnb.com/rooms/264780  20250610032232   2025-06-11   

        source                                             name  \
0  city scrape                      Huge Four Bedroom Apartment   
1  city scrape                            One Bedroom Apartment   
2  city scrape          Two Bedroom Newly Refurbished Apartment   
3  city scrape                Refurbished Two Bedroom Apartment   
4  city scrape  Spacious refurbished 2 bedroom apt with balcony   

                                         description  \
0  An extremely large and sunny four bedroom grou

In [17]:
cols = [
    "id", "host_id", "room_type",
    "availability_365", "neighbourhood_cleansed",
    "latitude", "longitude"
]

df = listings[cols].copy()    
print("Columns loaded:", df.columns)

Columns loaded: Index(['id', 'host_id', 'room_type', 'availability_365',
       'neighbourhood_cleansed', 'latitude', 'longitude'],
      dtype='object')


In [18]:
# calculate host's listing 
host_stats = df.groupby("host_id").agg(
    total_listings = ('id','count'),
    entire_homes   = ('room_type', lambda x: (x=="Entire home/apt").sum()),
    avail_over_90  = ('availability_365', lambda x: (x>90).sum())
).reset_index()

In [19]:
host_stats["is_PL"] = (
    ((host_stats.total_listings >= 3) | (host_stats.entire_homes >= 2))
    & (host_stats.avail_over_90 > 0)
)

final_hosts = host_stats[host_stats.is_PL].host_id.unique()

In [30]:
Q2 = len(final_hosts)
print("Q2 Number of professional landlords =", Q2)

Q2 Number of professional landlords = 5345


In [27]:
df["is_PL_listing"] = df["host_id"].isin(final_hosts)
Q31 = df["is_PL_listing"].sum()
print("Q3.1 Number of Properties Owned by Professional Landlords =", Q31)

Q3.1 Number of Properties Owned by Professional Landlords = 37609


In [29]:
total_listings = len(df)
Q32 = Q31 / total_listings
print("Q3.2 Proportion of properties listed by professional landlords =", round(Q32,4))

Q3.2 Proportion of properties listed by professional landlords = 0.3891
