# Notebook to build baseline datasets, in the UK and US

In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import *
from shapely.prepared import prep

## In the UK

### Choosing the cities

In [None]:
# UK cities included in the baseline; each name is used to build the
# "<city>_listings.csv" file name when the listings are loaded.
cities = [
    "bristol",
    "manchester",
    "london",
]

### Loading the listings

In [None]:
# Read every city's Airbnb export, keep only the columns used further down,
# and stack the per-city tables into a single frame with a fresh 0..n-1 index.
listing_columns = ['id', 'name', 'room_type', 'latitude', 'longitude']
frames = [
    pd.read_csv("../original_data/airbnb_data/" + city + "_listings.csv")[listing_columns]
    for city in cities
]
listings = pd.concat(frames).reset_index(drop=True)

### Filtering the data
 - Keeping only entire properties
 - Dropping duplicates

In [None]:
# Keep only whole-property listings, then drop repeated listing names and
# renumber the rows from 0 so later cells can address them by position.
# NOTE(review): de-duplication is keyed on `name` alone, so distinct listings
# that share a name (and all listings with a missing name) collapse into one
# row — confirm this is the intended definition of "duplicate".
is_entire_home = listings['room_type'] == 'Entire home/apt'
listings_ = (
    listings.loc[is_entire_home]
    .drop_duplicates(subset='name')
    .reset_index(drop=True)
)

### Adding wards and counting properties per area

In [None]:
# Load the ward boundary shapefile and reproject it to EPSG:4326, presumably so
# the polygons share a coordinate system with the listings' longitude/latitude.
wards_path = '../original_data/UK_wards_2017/Wards__December_2017__Boundaries_in_GB.shp'
gdf = gpd.read_file(wards_path)
gdf = gdf.to_crs(epsg=4326)

In [None]:
# Tag every listing with the code (`wd17cd`) of the first ward polygon that
# contains it; listings that fall inside no ward get None.
#
# The ward columns are pulled out of the GeoDataFrame once and each polygon is
# "prepared" once, so the inner search no longer pays for a pandas label
# lookup and a cold geometry test on every (listing, ward) pair.
prepared_wards = [
    (prep(ward_shape), ward_code)
    for ward_shape, ward_code in zip(gdf["geometry"], gdf["wd17cd"])
]

ward_to_add = []
size = listings_.shape[0]
for i, (lon, lat) in enumerate(zip(listings_["longitude"], listings_["latitude"])):
    # Progress indicator, overwritten in place on the same output line.
    print("{}/{}".format(i, size), end='\r')
    point = Point(lon, lat)
    # next(...) stops at the first containing ward, like the original `break`.
    value = next(
        (ward_code for ward_shape, ward_code in prepared_wards if ward_shape.contains(point)),
        None,
    )
    ward_to_add.append(value)
listings_["ward"] = ward_to_add

In [None]:
# Number of listings per ward, most frequent first, as a two-column table
# ("ward", "count"). Listings without a ward are left out, because
# value_counts() ignores missing values by default.
#
# The columns are named explicitly instead of renaming whatever
# value_counts()/reset_index() happen to produce: from pandas 2.0 on the
# result is already labelled "ward"/"count", and the old rename map
# {"index": "ward", "ward": "count"} would then yield two "count" columns.
ward_count = (
    listings_['ward']
    .value_counts()
    .rename_axis('ward')
    .reset_index(name='count')
)

### Saving to .csv

In [None]:
# Write the per-ward counts to disk; index=False keeps the row index out of
# the CSV so the file holds only the data columns.
uk_output_path = "../data/uk_metrics_baseline.csv"
ward_count.to_csv(uk_output_path, index=False)

## In the US

### Choosing the cities

In [None]:
# One entry per US city: (city name, tract shapefile path relative to the
# original_data folder, whether the tract ids from that file need the
# "1400000US" prefix added when listings are tagged).
_us_sources = [
    ("san_francisco", "california_tracts_2018/cb_2018_06_tract_500k.shp", False),
    ("chicago", "illinois_tracts_2018/tl_2018_17_tract.shp", True),
]

# The loading cell reads three parallel lists, so split the table by column.
cities, tracts, modif = (list(column) for column in zip(*_us_sources))

### Loading the listings

In [None]:
# Load the listings of every US city and tag each one with the identifier of
# the first census tract polygon that contains it (None when no tract does),
# then stack all cities into one frame with a fresh 0..n-1 index.
#
# NOTE(review): these paths have no "../" prefix, whereas the UK cells of this
# notebook read from "../original_data/..." — confirm which working directory
# is intended before running both halves in the same session.
frames = []
for city, tract_file, needs_prefix in zip(cities, tracts, modif):
    new_listings = pd.read_csv("original_data/airbnb_data/" + city + "_listings.csv")[['id','name','room_type','latitude','longitude']]

    gdf = gpd.read_file("original_data/" + tract_file).to_crs(epsg=4326)

    # Files flagged in `modif` are identified through GEOID (prefixed below),
    # the others through AFFGEOID, which is used as-is.
    tract_ids = gdf["GEOID"] if needs_prefix else gdf["AFFGEOID"]

    # Pull the columns out of the GeoDataFrame once and prepare each polygon
    # once, instead of doing a pandas label lookup and a cold geometry test
    # for every (listing, tract) pair.
    prepared_tracts = [
        (prep(tract_shape), tract_id)
        for tract_shape, tract_id in zip(gdf["geometry"], tract_ids)
    ]

    tract_to_add = []
    for lon, lat in zip(new_listings["longitude"], new_listings["latitude"]):
        point = Point(lon, lat)
        # next(...) stops at the first containing tract, like the original `break`.
        value = next(
            (tract_id for tract_shape, tract_id in prepared_tracts if tract_shape.contains(point)),
            None,
        )
        if needs_prefix and value is not None:
            value = "1400000US" + value
        tract_to_add.append(value)
    new_listings["tract"] = tract_to_add

    frames.append(new_listings)
listings = pd.concat(frames).reset_index(drop=True)

### Filtering the data
 - Keeping only entire properties
 - Dropping duplicates

In [None]:
# Keep only whole-property listings, then drop repeated listing names and
# renumber the rows from 0.
# NOTE(review): de-duplication is keyed on `name` alone, so distinct listings
# that share a name (and all listings with a missing name) collapse into one
# row — confirm this is the intended definition of "duplicate".
is_entire_home = listings['room_type'] == 'Entire home/apt'
listings_ = (
    listings.loc[is_entire_home]
    .drop_duplicates(subset='name')
    .reset_index(drop=True)
)

### Counting properties per area

In [None]:
# Number of listings per census tract, most frequent first, as a two-column
# table ("tract", "count"). Listings without a tract are left out, because
# value_counts() ignores missing values by default.
#
# The columns are named explicitly instead of renaming whatever
# value_counts()/reset_index() happen to produce: from pandas 2.0 on the
# result is already labelled "tract"/"count", and the old rename map
# {"index": "tract", "tract": "count"} would then yield two "count" columns.
tract_count = (
    listings_['tract']
    .value_counts()
    .rename_axis('tract')
    .reset_index(name='count')
)

### Saving to .csv

In [None]:
# Write the per-tract counts to disk; index=False keeps the row index out of
# the CSV so the file holds only the data columns.
# NOTE(review): the UK counts in this notebook are written under "../data/",
# this file under "data/" — confirm which output directory is intended.
us_output_path = "data/us_metrics_baseline.csv"
tract_count.to_csv(us_output_path, index=False)