# Data preparation - Seattle AirBnB

In [98]:
# Importing standard libraries

%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns

In [99]:
# Loading the three AirBnB CSV files (paths are relative to the working directory)

calendar_df, listing_df, reviews_df = (
    pd.read_csv(csv_path)
    for csv_path in ('calendar.csv', 'listings.csv', 'reviews.csv')
)

As we've seen in the __Data understanding__ step: 

- Some of the attributes from the Seattle dataset have missing data
    - Luckily for us, only a small proportion of the attributes we are going to use in the model have missing values, and those are missing only around 10% of their values. The rest of the data is complete.
- We've identified highly correlated attributes from the same categories
    - Those attributes will be excluded from the training model 

It does not make sense to use every categorical and numerical attribute in our data model, so we will focus on the most reasonable predictors. For numerical variables, for instance, we will exclude all highly correlated attributes; this will save us from multicollinearity issues later on (see Data Understanding - correlation heatmap).

As for categorical variables, we will have to apply a bit of common sense and exclude attributes such as state, city, picture_url, name, etc.

## Removing less useful attributes

In [100]:
# Keep only the listings that have a price recorded

listing_df = listing_df[listing_df['price'].notna()]


In [101]:
# Cleaning categorical variables: remove the columns we will not use as predictors.
# The labels are grouped by name purely for readability; drop order does not matter.

listing_df = listing_df.drop(
    [
        # links and pictures
        'listing_url', 'picture_url', 'thumbnail_url', 'medium_url', 'xl_picture_url',
        'host_url', 'host_picture_url', 'host_thumbnail_url',
        # descriptive text
        'name', 'summary', 'space', 'description', 'neighborhood_overview', 'notes',
        'host_about', 'amenities', 'host_verifications',
        # location details
        'street', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country',
        'country_code', 'neighbourhood_cleansed', 'jurisdiction_names', 'is_location_exact',
        # host identifiers
        'host_id', 'host_name', 'host_has_profile_pic',
        # scrape, calendar and booking-policy fields
        'scrape_id', 'calendar_updated', 'calendar_last_scraped', 'experiences_offered',
        'cancellation_policy', 'require_guest_profile_picture',
    ],
    axis=1,
)

listing_df.columns

Index(['id', 'last_scraped', 'transit', 'host_since', 'host_location',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_group_cleansed', 'latitude', 'longitude',
       'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'bed_type', 'square_feet', 'price', 'weekly_price',
       'monthly_price', 'security_deposit', 'cleaning_fee', 'guests_included',
       'extra_people', 'minimum_nights', 'maximum_nights', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'number_of_reviews', 'first_review', 'last_review',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       're

In [102]:
# Cleaning numerical variables: remove the numeric columns we decided not to keep.

listing_df = listing_df.drop(
    [
        'host_response_rate',
        'accommodates', 'bedrooms', 'guests_included',
        'host_listings_count', 'host_total_listings_count',
        'availability_60', 'availability_90', 'availability_365',
        'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
        'review_scores_communication', 'review_scores_value',
    ],
    axis=1,
)
listing_df.columns

Index(['id', 'last_scraped', 'transit', 'host_since', 'host_location',
       'host_response_time', 'host_acceptance_rate', 'host_is_superhost',
       'host_neighbourhood', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_group_cleansed', 'latitude', 'longitude',
       'property_type', 'room_type', 'bathrooms', 'beds', 'bed_type',
       'square_feet', 'price', 'weekly_price', 'monthly_price',
       'security_deposit', 'cleaning_fee', 'extra_people', 'minimum_nights',
       'maximum_nights', 'has_availability', 'availability_30',
       'number_of_reviews', 'first_review', 'last_review',
       'review_scores_rating', 'review_scores_location', 'requires_license',
       'license', 'instant_bookable', 'require_guest_phone_verification',
       'calculated_host_listings_count', 'reviews_per_month'],
      dtype='object')

In [103]:
# Which attributes have more than 30% of their values missing?

missing_30_pct_values = (
    listing_df.isnull()
    .mean()                              # share of missing values per column
    .loc[lambda share: share > 0.3]      # keep only the columns above the 30% threshold
    .index.tolist()
)

missing_30_pct_values

['square_feet', 'weekly_price', 'monthly_price', 'security_deposit', 'license']

In [104]:
# Drop the columns with more than 30% missing values, together with six further
# columns that are not on the >30% list printed above.

listing_df = listing_df.drop(
    [
        # reported above as more than 30% missing
        'square_feet', 'weekly_price', 'monthly_price', 'security_deposit', 'license',
        # additionally removed here
        'first_review', 'last_review', 'reviews_per_month',
        'host_acceptance_rate', 'neighbourhood', 'transit',
    ],
    axis=1,
)

## Data Cleaning

### Numerical functions

In [105]:
# defining functions for cleaning numerical values

# Function for converting string to datetime
import datetime
def str_to_date(x):
    '''Parse a "YYYY-MM-DD" string and return the matching datetime.date.'''
    parsed = datetime.datetime.strptime(x, "%Y-%m-%d")
    return parsed.date()

# Function for filling missing values with the mode - meant for columns with few (around 10%) missing values
def fill_mode(x):
    '''Return a copy of Series `x` with its missing values replaced by the mode.

    Series.mode() returns a Series of modes labelled 0..k-1. Passing that Series
    straight to fillna() aligns it on the index, so only NaNs sitting at labels
    0..k-1 would be filled. The first mode is therefore extracted as a scalar.
    When several values tie, the first one returned by mode() is used.

    x - pandas Series to impute
    returns - new Series; `x` itself is not modified
    '''
    modes = x.mode()
    if modes.empty:
        # Every value is missing, so there is nothing to impute with
        return x.copy()
    return x.fillna(modes.iloc[0])
    
# Same helper as in the data exploration step: turns "$1,234.00"-style text into a float
def financial_to_float(x):
    '''Convert a money string (optional "$", optional thousands commas) to float.'''
    without_commas = x.replace(",", "")
    amount_text = without_commas.strip('$')
    return float(amount_text)


### Cleaning categorical variables

In [106]:
# defining functions for cleaning categorical values

# Here we will replace missing categorical variables with the most frequent value - mode 
def impute_nan_most_frequent_category(DataFrame,ColName):
    '''Add a "<ColName>_Imputed" column in which missing values are replaced by the mode.

    DataFrame - frame to modify in place (a new column is added)
    ColName - name of the categorical column to impute
    returns - None; the original column is left unchanged

    The new column is assigned in a single statement. The previous form,
    DataFrame[col].fillna(..., inplace=True), operates on a temporary object and
    has no effect on the frame when pandas Copy-on-Write is active.
    '''
    modes = DataFrame[ColName].mode()
    if modes.empty:
        # Column holds no non-missing value: nothing to impute with, copy it as is
        DataFrame[ColName + "_Imputed"] = DataFrame[ColName]
        return

    # first entry of mode() is the most frequent category
    most_frequent_category = modes.iloc[0]
    DataFrame[ColName + "_Imputed"] = DataFrame[ColName].fillna(most_frequent_category)

### Assembling main function

In [107]:
# defining main cleaning function

# Column groups consumed by clean_data() below.
# 'price_x' only exists after the merge inside clean_data: both inputs carry a 'price'
# column, so pandas suffixes the left-hand (calendar) one with '_x'.
financials = ['price_x','cleaning_fee','extra_people']
# NOTE: intentionally a list holding ONE list - clean_data loops over `numerics` and uses
# each element to select a sub-frame, so all these columns are imputed in a single pass.
numerics = [['bathrooms','beds','cleaning_fee','review_scores_rating','review_scores_location','extra_people']]
categorical_varaibles = ['host_response_time','host_is_superhost','host_identity_verified','property_type']

def clean_data (listing_df,calendar_df):
    '''
    Merge listings with the calendar and clean the merged frame.

    INPUT
    listing_df - dataframe containing listings data (its `id` column is the listing key)
    calendar_df - dataframe with availability and prices (keyed by `listing_id`)

    OUTPUT
    full_df - merged dataframe restricted to calendar rows that have a price, with
              dates parsed, money columns converted to float, missing numerics
              imputed and "<column>_Imputed" columns added for the categorical columns
    '''
    # merging listings and calendar at this step
    listing_df = listing_df.rename(columns={"id": "listing_id"})
    full_df = pd.merge(calendar_df, listing_df, on='listing_id')

    # subsetting only rows with price from calendar - price on available dates.
    # .copy() makes the subset an independent frame, so the column assignments below
    # do not trigger SettingWithCopyWarning.
    full_df = full_df[full_df["price_x"].notna()].copy()

    # converting last_scraped into datetime.date objects
    full_df['last_scraped'] = full_df['last_scraped'].apply(str_to_date)

    # we fill nan for host_since with '2016-01-04' - the scrape date
    full_df['host_since'] = full_df['host_since'].fillna('2016-01-04')
    full_df['host_since'] = full_df['host_since'].astype(str).apply(str_to_date)
    full_df['host_since_year'] = pd.DatetimeIndex(full_df['host_since']).year
    full_df['host_since_year'] = full_df['host_since_year'].astype(int)

    # formatting financial values; a missing amount is treated as $0
    for money_col in financials:
        full_df[money_col] = full_df[money_col].fillna('$0').apply(financial_to_float)

    # Imputing the remaining missing numerics column by column with fill_mode.
    # cleaning_fee and extra_people were already filled with 0 in the financial step
    # above, so this step leaves them unchanged.
    for numeric_cols in numerics:
        full_df[numeric_cols] = full_df[numeric_cols].apply(fill_mode, axis=0)

    # Imputing missing categorical values with the most frequent category
    # (adds a "<column>_Imputed" column next to each original)
    for cat_col in categorical_varaibles:
        impute_nan_most_frequent_category(full_df, cat_col)

    return full_df

In [108]:
full_df = clean_data (listing_df,calendar_df)

In [109]:
full_df[['bathrooms','beds','host_response_time','host_is_superhost']]

Unnamed: 0,bathrooms,beds,host_response_time,host_is_superhost
0,1.0,1.0,within a few hours,f
1,1.0,1.0,within a few hours,f
9,1.0,1.0,within a few hours,f
10,1.0,1.0,within a few hours,f
14,1.0,1.0,within a few hours,f
...,...,...,...,...
1393207,1.5,1.0,within a day,f
1393208,1.5,1.0,within a day,f
1393211,1.5,1.0,within a day,f
1393212,1.5,1.0,within a day,f


In [None]:
# NOTE(review): no earlier cell in this notebook defines `listings_clean_df`, so on a
# fresh kernel this cell raised NameError. Assumption to confirm: the remaining cells
# are meant to continue from the frame returned by clean_data(), i.e. `full_df`.
listings_clean_df = full_df.copy()

listings_clean_df.columns

In [None]:
listings_clean_df.head(5)

In [None]:
# Now we can drop last_scraped & host_since
listings_clean_df = listings_clean_df.drop(['last_scraped', 'host_since'], axis=1)

listings_clean_df.head()

In [None]:
listings_clean_df.info()

## Data Cleaning

### Cleaning numerical values

In [None]:
# Now let's see what are the columns that have missing values

listings_clean_df.columns[listings_clean_df.isna().any()].tolist()

In [None]:
# The original call used `fill_mean`, which is not defined anywhere in this notebook;
# `fill_mode` is the imputation helper defined in the "Numerical functions" section.
# NOTE(review): confirm that mode (rather than mean) imputation is what is wanted here.
listings_clean_df['bathrooms'] = fill_mode(listings_clean_df['bathrooms'])

In [None]:
listings_clean_df[['bathrooms','beds','cleaning_fee','review_scores_rating','review_scores_location']].head(5)

### Cleaning categorical variables

In [None]:
# Remove the two host location columns
listings_clean_df = listings_clean_df.drop(columns=['host_location', 'host_neighbourhood'])

In [None]:
# Here we will replace missing categorical variables with the most frequent value - mode 
# Let's create a function for that

# NOTE: this re-defines the function of the same name declared earlier in the notebook;
# once this cell runs, this later definition is the one in effect.
def impute_nan_most_frequent_category(DataFrame,ColName):
    '''Add a "<ColName>_Imputed" column in which missing values are replaced by the mode.

    DataFrame - frame to modify in place (a new column is added)
    ColName - name of the categorical column to impute
    returns - None; the original column is left unchanged

    The new column is assigned in a single statement. The previous form,
    DataFrame[col].fillna(..., inplace=True), operates on a temporary object and
    has no effect on the frame when pandas Copy-on-Write is active.
    '''
    modes = DataFrame[ColName].mode()
    if modes.empty:
        # Column holds no non-missing value: nothing to impute with, copy it as is
        DataFrame[ColName + "_Imputed"] = DataFrame[ColName]
        return

    # first entry of mode() is the most frequent category
    most_frequent_category = modes.iloc[0]
    DataFrame[ColName + "_Imputed"] = DataFrame[ColName].fillna(most_frequent_category)

In [None]:
# Categorical columns whose gaps are filled with their most frequent value
categorical_varaibles = [
    'host_response_time',
    'host_is_superhost',
    'host_identity_verified',
    'property_type',
]

for column_name in categorical_varaibles:
    impute_nan_most_frequent_category(listings_clean_df, column_name)
    

In [None]:
listings_clean_df.head(5)

In [None]:
# Remove the four original categorical columns from the frame
listings_clean_df = listings_clean_df.drop(
    ['host_response_time', 'host_is_superhost', 'host_identity_verified', 'property_type'],
    axis=1,
)

In [None]:
listings_clean_df.columns.to_list()

In [None]:
# One-hot encode every object-dtype column
cat_vars = listings_clean_df.select_dtypes(include=['object']).columns

for cat_col in cat_vars:
    # swap the original column for its dummy columns (first level dropped),
    # named "<column>_d<level>"
    listings_clean_df = pd.concat(
        [
            listings_clean_df.drop(columns=cat_col),
            pd.get_dummies(listings_clean_df[cat_col], prefix=cat_col, prefix_sep='_d', drop_first=True),
        ],
        axis=1,
    )

In [None]:
listings_clean_df.head(5)

In [None]:
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
listings_clean_df.info()

listings_clean_df.columns.to_list()

In [None]:
# Write the prepared frame to listings_clean.csv; index=False leaves the row index out of the file
listings_clean_df.to_csv('listings_clean.csv',index=False)