In [1]:
from pathlib import Path
import os
import glob
import pandas as pd

%matplotlib inline

In [577]:
# Project root: two directory levels above the current working directory.
HOME = Path.cwd().resolve().parent.parent
# Input folder, kept as a plain string ending in a slash so that file names
# and glob patterns can simply be appended to it.
data_dir = ''.join([str(HOME), '/data/meus dados/'])

In [578]:
# Load every gzip-compressed CSV found in `data_dir` (adjust `data_dir` to
# your own path) and stack them into a single frame.
#
# glob returns matches in arbitrary, OS-dependent order; sorting keeps the row
# order of `frame` (and of everything derived from it) reproducible.
all_files = sorted(glob.glob(data_dir + "*.csv.gz"))
if not all_files:
    # Fail early and name the searched folder instead of letting pd.concat
    # raise its generic "No objects to concatenate" error.
    raise FileNotFoundError("No '*.csv.gz' files found in " + data_dir)

# low_memory=False parses each file in one pass, avoiding the mixed-dtype
# inference that chunked parsing can produce.
li = [
    pd.read_csv(filename, index_col=None, header=0, low_memory=False)
    for filename in all_files
]

# ignore_index=True discards the per-file row labels and gives the combined
# frame a fresh, unique 0..n-1 index.
frame = pd.concat(li, axis=0, ignore_index=True)

In [579]:
# Dimensions of the concatenated frame as a (rows, columns) tuple
frame.shape

(144067, 106)

### 1. Removing the columns pointed out by Pol:

In [580]:
# Columns discarded before any further processing.
# Note: 'bathrooms_text' and 'number_of_reviews_l30d' were removed from this
# list and are therefore no longer part of cols2drop.
cols2drop = [
    "listing_url",
    "scrape_id",
    "name",
    "description",
    "neighborhood_overview",
    "picture_url",
    "host_id",
    "host_url",
    "host_name",
    "host_about",
    "host_response_time",
    "host_response_rate",
    "host_acceptance_rate",
    "host_is_superhost",
    "host_thumbnail_url",
    "host_picture_url",
    "host_listings_count",
    "host_total_listings_count",
    "host_verifications",
    "host_has_profile_pic",
    "host_identity_verified",
    "property_type",
    "room_type",
    "accommodates",
    "bathrooms",
    "amenities",
    "minimum_nights",
    "maximum_nights",
    "minimum_minimum_nights",
    "maximum_minimum_nights",
    "minimum_maximum_nights",
    "maximum_maximum_nights",
    "minimum_nights_avg_ntm",
    "maximum_nights_avg_ntm",
    "host_since",
    "neighbourhood",
    "calendar_updated",
    "review_scores_communication",
    "review_scores_location",
    "license",
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
    "host_neighbourhood",
    "neighbourhood_cleansed",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_value",
    "number_of_reviews_ltm",
    "first_review",
    "last_review",
    "instant_bookable",
]

In [581]:
# Independent copy of the raw frame without the columns listed in cols2drop
listings = frame.drop(cols2drop, axis='columns').copy()

### 2. Removing columns with more than 50% of missing values

In [582]:
# Names of the columns in which more than half of the rows are missing
na_per_column = listings.isna().sum()
half_of_rows = len(listings) / 2
too_many_na = na_per_column[na_per_column > half_of_rows].index.values

In [583]:
# Extend the drop list with the mostly-empty columns. dict.fromkeys removes
# repeated names while keeping first-seen order, so re-running this cell does
# not keep appending the same columns to cols2drop.
cols2drop = list(dict.fromkeys(cols2drop + list(too_many_na)))

In [584]:
# Rebuild `listings` from the raw frame, this time dropping the extended list
listings = frame.drop(list(cols2drop), axis='columns')

### 3. Dealing with the remaining missing values

In [585]:
# Missing values per column; show only the affected columns, worst first
count_na = listings.isnull().sum()
count_na.loc[count_na.gt(0)].sort_values(ascending=False)

access                  70098
interaction             62921
transit                 57857
house_rules             48283
security_deposit        42335
space                   39413
review_scores_rating    33485
reviews_per_month       31928
cleaning_fee            29686
summary                  7773
zipcode                  4085
state                    3407
host_location            1121
beds                      781
bedrooms                  139
market                    115
city                       32
dtype: int64

In [586]:
# Fill the remaining missing values.
#
# The result is assigned back to `listings` instead of calling
# `listings[col].fillna(..., inplace=True)`: that chained in-place form is
# deprecated by pandas and leaves `listings` unchanged when Copy-on-Write is
# enabled.

# Constant replacement per column; columns not listed here are left untouched.
fill_values = {
    'access': 'Not informed',
    'interaction': 'Not informed',
    'transit': 'Not informed',
    'house_rules': 'Not informed',
    'space': 'Not informed',
    'summary': 'Not informed',
    'security_deposit': '$0.00',
    'cleaning_fee': '$0.00',
    'reviews_per_month': 0.0,
    'state': 'Catalunya',
    'zipcode': '08001',
    'city': 'Barcelona',
    'market': 'Barcelona',
}
listings = listings.fillna(value=fill_values)

# review_scores_rating is filled per listing id:
#   * if the id has at least one rating, its missing values get the median of
#     that id's ratings;
#   * if the id has no rating at all, its median is NaN as well, so the second
#     fillna sets those rows to 0.0.
# transform('median') returns one scalar per row, aligned with `listings`,
# which replaces the row-by-row `.loc` lookup on a grouped frame.
median_by_listing = listings.groupby('id')['review_scores_rating'].transform('median')
listings['review_scores_rating'] = (
    listings['review_scores_rating'].fillna(median_by_listing).fillna(0.0)
)

In [587]:
# Re-check after filling: columns that still contain missing values, worst first
count_na = listings.isnull().sum()
count_na.loc[count_na.gt(0)].sort_values(ascending=False)

host_location    1121
beds              781
bedrooms          139
dtype: int64

### 4. Removing the rows with missing values in columns that have less than 1% of missing values

In [588]:
# When a column has less than 1% of missing values and there is no trivial way
# to fill them, the affected rows are removed: every row that still contains
# at least one missing value is dropped.
listings = listings.dropna(axis=0, how='any')

### 5. Save processed dataset

In [589]:
# Write the cleaned listings into the data folder, without the index column
output_path = data_dir + 'listings_notna_1908-2002.csv'
listings.to_csv(output_path, index=False)