Created by: [SmirkyGraphs](http://smirkygraphs.github.io/). Code: [Github](https://github.com/SmirkyGraphs/Python-Notebooks). Source: [insideairbnb.com](http://insideairbnb.com/get-the-data.html).
<hr>

# Rhode Island Airbnb Data Cleaning

This notebook contains the code used to combine all of the data snapshots from insideairbnb.com. Combining every snapshot ensures that we capture not only the current listings, but also any that have "dropped out" and no longer list their location on Airbnb. It also adds some extra fields to the reviews, such as the listing's city and a review sentiment label (positive, negative or neutral) computed with TextBlob.

A Tableau project based on the final cleaned dataset is available [here](https://ivizri.com/posts/2020/07/rhode-island-airbnb/).
<hr>

In [1]:
import pandas as pd
from pathlib import Path
from textblob import TextBlob

In [2]:
def combine_csv(files, filter_cols=None):
    """Combine several CSV snapshots into one de-duplicated DataFrame.

    Every file is read with pandas and tagged with a ``file_name`` column
    holding the last 10 characters of its parent directory path, which must
    be a date written as ``MM_DD_YYYY``. The frames are concatenated, sorted
    by that date, and de-duplicated so that the row from the most recent
    snapshot is the one that is kept.

    Parameters
    ----------
    files : iterable of path-like
        CSV files to combine (``pathlib.Path`` objects or plain strings).
    filter_cols : list of str, optional
        Columns that identify a duplicate row. Defaults to every column
        except ``file_name``.

    Returns
    -------
    pandas.DataFrame
        Combined rows sorted by snapshot date, with ``file_name`` converted
        to a datetime column.

    Raises
    ------
    ValueError
        If ``files`` is empty, or a parent directory name cannot be parsed
        as an ``MM_DD_YYYY`` date.
    """
    frames = []
    for f in files:
        df = pd.read_csv(f)
        # the snapshot date is encoded in the name of the containing folder
        df['file_name'] = str(Path(f).parent)[-10:]
        frames.append(df)

    if not frames:
        # pd.concat would raise a far less helpful ValueError here
        raise ValueError('combine_csv: no input files to combine')

    # combine files
    df = pd.concat(frames, sort=True)

    # convert filename to datetime & sort values
    # (pd.datetime was removed in pandas 2.0; pd.to_datetime is the supported API)
    df['file_name'] = pd.to_datetime(df['file_name'], format='%m_%d_%Y')
    # stable sort keeps rows from the same snapshot in their original order,
    # so keep='last' below is deterministic
    df = df.sort_values('file_name', kind='mergesort')

    # filter columns for duplicates
    if filter_cols is None:
        filter_cols = [col for col in df.columns if col != 'file_name']

    return df.drop_duplicates(subset=filter_cols, keep='last')

In [3]:
# Columns kept from the combined listings data, in output order.
want_cols = [
    # listing / host identifiers and host attributes
    'id', 'host_id',
    'host_acceptance_rate', 'host_is_superhost',
    'host_response_rate', 'host_response_time',
    'host_since', 'host_location',
    # property description
    'property_type', 'room_type',
    'accommodates', 'bathrooms', 'bedrooms',
    # location
    'neighbourhood_cleansed', 'neighbourhood_group_cleansed',
    'zipcode', 'latitude', 'longitude',
    # pricing
    'price', 'security_deposit', 'cleaning_fee',
    # reviews
    'number_of_reviews', 'number_of_reviews_ltm',
    'first_review', 'last_review', 'review_scores_rating',
    # remaining fields ('file_name' is the snapshot column added by combine_csv)
    'availability_365', 'bed_type', 'file_name',
    'name', 'host_name', 'minimum_nights',
]

In [4]:
# combine & clean all listings
listing_files = Path('./data/raw/').glob('**/listings.csv')
list_df = combine_csv(listing_files, filter_cols=['id'])
# explicit copy: a column is added below, and writing into a column-subset
# of the combined frame can trigger pandas' chained-assignment warning
list_df = list_df[want_cols].copy()

# flag (rows are kept, not dropped) listings whose host location looks like
# Rhode Island; missing or country-only locations are also counted as in-state
raw_loc = list_df['host_location']
host_loc = raw_loc.fillna('').astype(str).str.lower().str.strip()
list_df['in_ri'] = (
    raw_loc.isna()
    | host_loc.str.contains('rhode island', regex=False)
    # match "ri" only as a whole token ("providence, ri", "ri"), so that
    # locations such as "puerto rico" or "costa rica" are not flagged
    | host_loc.str.contains(r'\bri\b', regex=True)
    | host_loc.isin(['nan', 'usa', 'us', 'united states'])
)

# make sure the output folder exists before writing
Path('./data/clean/').mkdir(parents=True, exist_ok=True)
list_df.to_csv('./data/clean/listings_clean.csv', index=False)

In [5]:
def blob_sentiment(review):
    """Classify a review as 'positive', 'negative' or 'neutral'.

    The review is coerced to ``str`` and scored with TextBlob. A polarity
    above 0.02 is 'positive', below -0.02 is 'negative', and anything in
    between (inclusive) is 'neutral'.
    """
    polarity = TextBlob(str(review)).sentiment.polarity

    if polarity > 0.02:
        return 'positive'
    if polarity < -0.02:
        return 'negative'
    return 'neutral'
    
# combine & clean all reviews
review_files = Path('./data/raw/').glob('**/reviews.csv')
rev_df = combine_csv(review_files, filter_cols=['id'])

# remove null comments & automatic postings
rev_df = rev_df[rev_df['comments'].notna()]
# regex=False: match the phrase literally, otherwise the trailing '.' is a
# regex wildcard that matches any character
is_automated = rev_df['comments'].str.contains('This is an automated posting.', regex=False)
rev_df = rev_df[~is_automated]

# attach each listing's neighbourhood_cleansed (used as the city) to its reviews;
# the listings id column is renamed so both frames share the 'listing_id' key
list_df = list_df.rename(columns={'id': 'listing_id'})
rev_df = rev_df.merge(list_df[['listing_id', 'neighbourhood_cleansed']], how='left', on='listing_id')

# add sentiment from textblob
rev_df['blob_sentiment'] = rev_df['comments'].apply(blob_sentiment)

# make sure the output folder exists before writing
Path('./data/clean/').mkdir(parents=True, exist_ok=True)
rev_df.to_csv('./data/clean/reviews_clean.csv', index=False)