### The purpose of this notebook is to read both Listings & Reviews (.gz zipped) Airbnb files, then clean and merge them.

The inputs for this notebook are files downloaded from Inside Airbnb (http://insideairbnb.com/get-the-data.html). The files of interest are listings.csv.gz and reviews.csv.gz. Note that the files are supposed to be cumulative, so a recent file should contain all previous reviews/listings. However, some listings drop off the Airbnb site, so they may only appear for a short time (i.e., only in some of the downloaded files). Hence, we download files from multiple years, combine the data, and then de-duplicate for a more complete dataset.

#### Import necessary modules

In [None]:
import os
import pandas as pd
import numpy as np
import langdetect

#### Set input parameters below:

In [None]:
# Folder that is scanned for the downloaded .gz input files.
directory = "data"
# City to process; compared case-insensitively against the input file names.
city = "SanFrancisco"
# Beginning of the time horizon of interest (reviews are filtered against it).
beginDate = "2018-01-01"
# Reviews dated after this day are labelled 'Post-' COVID, the rest 'Pre-'.
COVIDdate = "2020-03-11"

#### Create listing dataframe for one city by combining files over multiple years, while de-duplicating

In [None]:
# Build one de-duplicated listings dataframe for `city` from every matching
# listings file found in `directory`.
listingColumns = ['id','neighbourhood_cleansed','price','zipcode','property_type','room_type','review_scores_rating']
listingRenames = {'id':'listing_id','neighbourhood_cleansed':'neighborhood','property_type':'property','room_type':'room','review_scores_rating':'rating'}

listingFrames = []
records = 0
# os.listdir returns names in arbitrary order; sorting makes the run
# reproducible, because drop_duplicates(keep='first') below keeps the row from
# whichever file is processed first.
# NOTE(review): if the file names embed the download date, sorted order means
# the earliest file wins for a duplicated listing - confirm this is intended.
for f in sorted(os.listdir(directory)):
    if (f != ".DS_Store") and ("gz" in f) and (city.lower() in f.lower()) and ('list' in f):
        print(f)
        df = pd.read_csv(os.path.join(directory, f), compression = "gzip")
        print(len(df), 'records.')
        records += len(df)
        # select only relevant columns; note 2021 omits zipcode
        if 'zipcode' in df.columns:
            wanted = listingColumns
        else:
            wanted = [c for c in listingColumns if c != 'zipcode']
        # rename columns for later use; rename() returns a new frame, which
        # avoids mutating a slice of df in place (SettingWithCopyWarning)
        listingFrames.append(df[wanted].rename(columns = listingRenames))

if listingFrames:
    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing the frame (and re-deduplicating it) on every iteration.
    # drop duplicate listings; earlier files contain zipcode, but 2021 file has most current price
    cityList = pd.concat(listingFrames).drop_duplicates(subset='listing_id', keep='first')
else:
    # no matching files: keep the original behaviour of an empty dataframe
    cityList = pd.DataFrame()
print('\n')
print(records, 'total listings processed...')
print('de-duplicated, combined Listing dataframe for ', city,':', sep='')
cityList

#### Create review dataframe for one city by combining files over multiple years, while de-duplicating

In [None]:
# Build one de-duplicated reviews dataframe for `city` from every matching
# reviews file found in `directory`, restrict it to the time horizon and label
# each review as pre- or post-COVID.
reviewColumns = ['listing_id', 'id', 'date', 'comments']
reviewFrames = []
records = 0
# sorted() gives a reproducible processing order (os.listdir order is arbitrary)
for f in sorted(os.listdir(directory)):
    if (f != ".DS_Store") and ("gz" in f) and (city.lower() in f.lower()) and ('review' in f):
        print(f)
        df = pd.read_csv(os.path.join(directory, f), compression = "gzip")
        print(len(df), 'records.')
        records += len(df)
        # select only relevant columns and convert date; .copy() so the
        # assignment below does not write into a slice of the raw frame
        df = df[reviewColumns].copy()
        df['date'] = pd.to_datetime(df['date'])
        reviewFrames.append(df)

if reviewFrames:
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    # Null comments are dropped BEFORE de-duplicating so that a review whose
    # first copy has no text is not lost when another copy carries the text.
    cityReviews = (
        pd.concat(reviewFrames)
        .dropna(subset=['comments'])
        .drop_duplicates(subset='id')
    )
else:
    # no matching files: empty frame with the expected columns so the
    # filtering below still works
    cityReviews = pd.DataFrame(columns=reviewColumns).astype({'date': 'datetime64[ns]'})

print('\n')
print(records, 'total reviews processed...')
# filter based on date to obtain time horizon; >= so that reviews written on
# beginDate itself (the first day of the horizon) are kept
cityReviews = cityReviews[cityReviews['date'] >= beginDate].copy()
timeReviews = len(cityReviews)
print(timeReviews, 'de-duplicated reviews within time horizon of', beginDate)
# determine percentage of Pre/Post-COVID data
cityReviews['COVID'] = np.where(cityReviews['date'] > COVIDdate, 'Post-', 'Pre-')
print('> COVID data breakdown:')
postReviews = int((cityReviews['COVID'] == 'Post-').sum())
# guard against an empty time horizon (would otherwise divide by zero)
COVIDper = round(100*postReviews/timeReviews, 2) if timeReviews else 0.0
print(COVIDper, '% of the reviews for',city,'are post-COVID.')
cityReviews

#### Merge listings & reviews into one dataframe for one city

In [None]:
# Attach the listing attributes to every review through the shared
# 'listing_id' key; the inner join keeps only ids present in both frames.
city_data = cityList.merge(cityReviews, how='inner', on='listing_id')
city_data

#### Cleanse by filtering out non-English reviews

In [None]:
def modifieddetect(comment):
    try:
        return langdetect.detect(comment)
    except:
        return "Error"
%time city_data["language"] = city_data["comments"].apply(modifieddetect)
city_data = city_data.loc[city_data['language']=='en'].drop(['language','id'],axis=1)
print('\n')
print('>> ',len(city_data),' reviews from ',city,', beginning ',beginDate,', are ready for analysis!', sep='')
city_data

#### Write output file in .csv format, compressed (.gz zipped)

In [None]:
# Output name is the city followed by '.csv.gz'; the file is written
# gzip-compressed and without the dataframe index.
file = city + '.csv.gz'
city_data.to_csv(path_or_buf=file, compression='gzip', index=False)