# Notebook to build an Airbnb dataset in the UK

In [None]:
import csv
import pandas as pd
import geopandas as gpd
from shapely.geometry import *
from langdetect import detect
import textstat
from fuzzywuzzy import fuzz

from scripts import clean_text

### Choose cities

In [None]:
# City slugs to process; each one is expected to have a matching
# "<city>_listings.csv" file in the Airbnb input directory.
cities = [
    "bristol",
    "manchester",
    "london",
]

### Loading listings
 - Loading the dataset
 - Concatenating the description fields
 - Doing some data cleaning

In [None]:
# Columns pulled from every city's raw listings file.
source_columns = ['id','summary','space','description','neighborhood_overview','notes','interaction','house_rules','host_about','host_id','latitude','longitude']

# Read one frame per city and stack them into a single table with a fresh index.
per_city = [
    pd.read_csv("../original_data/airbnb_data/" + city + "_listings.csv")[source_columns]
    for city in cities
]
listings = pd.concat(per_city).reset_index(drop=True)

# Free-text fields that are merged, in this order, into one description.
columns = ['summary','space','description','neighborhood_overview','notes','interaction','house_rules','host_about']

# For each listing, clean every text field that is actually a string (missing
# values are skipped), concatenate the pieces, and drop the final character of
# the result.
# NOTE(review): the final-character trim presumably removes a separator that
# clean_text appends -- confirm against scripts.clean_text.
listings['full_description'] = [
    "".join(clean_text(field) for field in record if isinstance(field, str))[:-1]
    for record in listings[columns].itertuples(index=False, name=None)
]

# Only the merged description, the coordinates and the ids are needed downstream.
listings = listings[['id', 'full_description', 'latitude', 'longitude', 'host_id']]

### First round of data cleaning
 - Removing duplicate descriptions
 - Removing descriptions that are too short (two sentences or fewer)
 - Removing rows with non-English description

In [None]:
# Keep a single row per distinct description text, then renumber the rows.
listings = (
    listings
    .drop_duplicates(subset='full_description')
    .reset_index(drop=True)
)

In [None]:
# Keep only the listings whose description has more than two sentences
# according to textstat, then renumber the rows.
sentence_counts = listings['full_description'].map(textstat.sentence_count)
listings = listings[sentence_counts > 2].reset_index(drop=True)

In [None]:
from langdetect.lang_detect_exception import LangDetectException

# Detect the language of every description and keep only the English ones.
# Only langdetect's own failure is treated as "unknown language": a bare
# `except:` would also swallow KeyboardInterrupt (making this long loop
# impossible to stop) and hide genuine bugs.
detected_languages = []
for text in listings['full_description']:
    try:
        language = detect(text)
    except LangDetectException:
        # Placeholder that never equals "en", so the row is filtered out below.
        language = "None"
    detected_languages.append(language)
listings['lang'] = detected_languages
listings = listings[listings['lang'] == "en"].reset_index(drop=True).drop('lang', axis=1)

### Adding the wards to the listings
 - Loading the shapefile for UK Wards
 - Using coordinates to determine wards

In [None]:
# Ward boundary shapefile, reprojected to EPSG:4326 so the polygons can be
# tested against points built from the listings' longitude/latitude columns.
wards_shapefile = '../original_data/UK_wards_2017/Wards__December_2017__Boundaries_in_GB.shp'
gdf = gpd.read_file(wards_shapefile)
gdf = gdf.to_crs(epsg=4326)

In [None]:
# For every listing, find the code of the first ward polygon that contains its
# coordinates; listings that fall inside no polygon get None.
ward_shapes = gdf["geometry"]
ward_codes = gdf["wd17cd"]
size = listings.shape[0]
ward_to_add = []
for position, (lon, lat) in enumerate(zip(listings["longitude"], listings["latitude"])):
    # Progress indicator, rewritten in place on a single line.
    print("{}/{}".format(position, size), end='\r')
    location = Point(lon, lat)
    containing_ward = next(
        (code for shape, code in zip(ward_shapes, ward_codes) if shape.contains(location)),
        None,
    )
    ward_to_add.append(containing_ward)
listings["ward"] = ward_to_add

### Final cleaning of the dataset
 - Removing listings with no matching ward (coordinates not found in any ward, in the water, etc.)
 - Removing 'Fuzzy' duplicate descriptions

In [None]:
# Discard listings for which no ward was found, then renumber the rows.
listings = listings.dropna(subset=['ward']).reset_index(drop=True)

In [None]:
# Fuzzy de-duplication: within each (ward, host) pair, keep a listing only if
# its description is not too similar (fuzz.ratio > 60) to a listing already
# kept for that pair, so the earliest listing of each near-duplicate set wins.
#
# Each row is compared only against rows that precede it, and rows are dropped
# once at the end. Dropping rows while scanning by position could shift the
# frame underneath the cursor and skip a listing; a single grouped pass also
# avoids rebuilding a whole-frame mask and re-indexing on every iteration.
# Rows whose host_id is missing belong to no group and are left untouched.
to_remove = []
processed = 0
total = listings.shape[0]
for _, descriptions in listings.groupby(['ward', 'host_id'], sort=False)['full_description']:
    # Progress indicator, rewritten in place on a single line.
    print("{}/{}".format(processed, total), end='\r')
    kept = []
    for label, text in descriptions.items():
        if any(fuzz.ratio(reference, text) > 60 for reference in kept):
            to_remove.append(label)
        else:
            kept.append(text)
    processed += len(descriptions)
listings = listings.drop(index=to_remove).reset_index(drop=True)

### Saving the dataset to .csv

In [None]:
# Final export table: listing id, merged description and ward code only.
listings_out = listings.loc[:, ['id', 'full_description', 'ward']]

In [None]:
# Write the export table to disk without the row index.
output_path = "../data/airbnb_listings_description/uk_listings_description_ward.csv"
listings_out.to_csv(output_path, index=False)