In [25]:
from pathlib import Path
import pandas as pd

import warnings
# Suppress every warning for the rest of the session so cell outputs stay clean.
# NOTE(review): this is a blanket filter, so deprecation warnings are hidden too.
warnings.filterwarnings('ignore')

from IPython.display import display
# None removes the column limit: wide DataFrames are displayed without truncation.
pd.options.display.max_columns = None

%matplotlib inline

In [26]:
# Two directory levels above the current working directory.
HOME = Path().resolve().parent.parent
# Kept as a plain string with a trailing slash: file names are appended with `+`.
data_dir = str(HOME) + '/data/processed/'

# pandas infers gzip compression from the `.gz` suffix.
df = pd.read_csv(filepath_or_buffer=data_dir + 'listings_agg.csv.gz')

rows, cols = df.shape
print('The original dataset has {} rows and {} columns.'.format(*df.shape))

The original dataset has 550766 rows and 25 columns.


In [29]:
# Work on an independent deep copy so `df` keeps the data exactly as loaded.
listings = df.copy(deep=True)

## Dealing with missing values

In [30]:
# Missing values per column; keep only columns that have any, largest count first.
count_na = df.isna().sum()
count_na.loc[count_na.gt(0)].sort_values(ascending=False)

bathrooms                       174165
review_scores_rating            146655
reviews_per_month               140309
bedrooms                          6484
beds                              6466
CasosCovMes                         14
CasosCovMesAnt                      14
availability_60                      7
amenities                            7
calendar_last_scraped                7
availability_365                     7
availability_30                      7
latitude                             7
price                                7
availability_90                      7
number_of_reviews                    7
room_type                            7
accommodates                         7
longitude                            7
has_availability                     7
neighbourhood_group_cleansed         7
property_type                        7
dtype: int64

### 1. Remove the rows with missing values in columns where less than 1% of the values are missing

These 7 rows with missing values are the ones we already saw during the aggregation process. By removing them now, we do not have to deal with them during the rest of the cleaning process.

In [31]:
# Coerce both columns to numeric in one vectorized call each (instead of one
# Python-level call per row). Values that cannot be parsed become NaN because of
# errors='coerce'; rows whose `id` ends up NaN are removed by the dropna below,
# since 'id' is part of its subset.
listings['id'] = pd.to_numeric(listings['id'], errors='coerce')
listings['review_scores_rating'] = pd.to_numeric(listings['review_scores_rating'], errors='coerce')

# When a column has less than 1% of missing values and there is no trivial way
# to fill them, the affected rows are simply removed.
# `.copy()` makes `listings` an independent frame rather than a view of the old one.
listings = listings.dropna(
    subset=['latitude', 'price',
            'number_of_reviews', 'longitude', 'availability_365',
            'availability_60', 'file_date', 'has_availability', 'id',
            'availability_30', 'neighbourhood_group_cleansed',
            'availability_90', 'calendar_last_scraped', 'last_scraped']).copy()

Count how many missing values remain in the dataset:

In [32]:
# Recount the missing values per column after the rows were dropped;
# only columns that still have gaps are listed, largest count first.
count_na = listings.isna().sum()
count_na.loc[count_na.gt(0)].sort_values(ascending=False)

bathrooms               174156
review_scores_rating    146648
reviews_per_month       140295
bedrooms                  6477
beds                      6459
dtype: int64

Most values of ```bathrooms```, ```bedrooms``` and ```beds``` are 1, so we fill their missing values with 1:

In [33]:
# Replace the missing values of the three room-related columns with 1;
# every other column is left untouched.
listings = listings.fillna({'bathrooms': 1, 'bedrooms': 1, 'beds': 1})

For ```review_scores_rating``` and ```reviews_per_month```, we need a different approach:
 + Fill in with zero when the listing has no reviews at all
 + Otherwise, fill in with the median of the reviews for the same listing

In [34]:
def fill_reviews(df, identif, review):
    """Fill the missing values of a review column, listing by listing.

    Two rules are applied to the rows grouped by ``identif``:

    1. If a group has no non-missing ``review`` value at all, every row of
       that group gets ``0.0``.
    2. Otherwise, each missing ``review`` value is replaced by the median of
       the non-missing values of the same group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``identif`` and ``review`` columns. It is
        modified in place.
    identif : str
        Name of the column that identifies a listing (the grouping key).
    review : str
        Name of the numeric column whose missing values are filled.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    Rows whose ``identif`` is missing belong to no group and keep their
    missing ``review`` value.
    """
    reviews_by_listing = df.groupby(identif)[review]

    # Both statistics are broadcast back to one value per row, so they can be
    # used directly as a mask / fill value without any per-row lookup.
    # The median is computed before anything is overwritten.
    listing_median = reviews_by_listing.transform('median')
    has_no_reviews = reviews_by_listing.transform('count').eq(0)

    # Case 1: listings without a single review get 0.0.
    df.loc[has_no_reviews, review] = 0.0

    # Case 2: remaining gaps take the scalar median of their own listing.
    df[review] = df[review].fillna(listing_median)

    return df

Filling in the missing values in the ```review_scores_rating``` column:

In [35]:
# Fill the rating gaps per listing, grouping the rows by their `id`.
listings = fill_reviews(df=listings, identif='id', review='review_scores_rating')

Filling the missing values in the ```reviews_per_month``` column:

In [36]:
# Fill the reviews-per-month gaps per listing, grouping the rows by their `id`.
listings = fill_reviews(df=listings, identif='id', review='reviews_per_month')

The dataset now has no missing values

In [37]:
# Final check: list any column that still contains missing values.
count_na = listings.isna().sum()
count_na.loc[count_na.gt(0)].sort_values(ascending=False)

Series([], dtype: int64)

### 4. Save processed dataset

In [38]:
# Write the cleaned frame without the index column; gzip is inferred from `.gz`.
listings.to_csv(path_or_buf=data_dir + 'listings_notna.csv.gz', index=False)