In [1]:
from pathlib import Path
import pandas as pd
import re

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation warnings;
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

from IPython.display import display
# Do not cap the number of columns shown when a DataFrame is rendered.
pd.options.display.max_columns = None

In [2]:
# Project root: two directory levels above the current working directory.
HOME = Path('.').resolve().parent.parent
# Kept as a plain string (with trailing slash) because file names are
# appended to it with `+` further down.
data_dir = f'{HOME}/data/processed/'

In [5]:
# Load the processed listings, parsing the scrape date as a datetime.
df = pd.read_csv(
    data_dir + 'listings_notna.csv.gz',
    parse_dates=['last_scraped'],
)
df.shape

(548412, 24)

In [17]:
# Work on an independent copy so the freshly loaded `df` stays untouched.
listings = df.copy(deep=True)

In [18]:
# Preview the first five rows.
listings.head(n=5)

Unnamed: 0,beds,bedrooms,number_of_reviews,file_date,amenities,calendar_last_scraped,longitude,room_type,bathrooms,price,host_location,id,reviews_per_month,availability_60,accommodates,review_scores_rating,property_type,last_scraped,latitude,availability_30,neighbourhood_group_cleansed,availability_90,availability_365,has_availability
0,4.0,2.0,1.0,1901,"{TV,Internet,Wifi,""Air conditioning"",""Wheelcha...",2019-01-15,2.185545,Entire home/apt,1.0,$130.00,"Barcelona, Cataluña, Spain",18666.0,0.03,0.0,6.0,80.0,Apartment,2019-01-15,41.408886,0.0,Sant Martí,0.0,6,t
1,6.0,3.0,5.0,1901,"{TV,Internet,Wifi,""Air conditioning"",""Wheelcha...",2019-01-14,2.173058,Entire home/apt,2.0,$60.00,"Barcelona, Cataluña, Spain",18674.0,0.07,27.0,8.0,85.0,Apartment,2019-01-14,41.404197,12.0,Eixample,55.0,326,t
2,2.0,1.0,4.0,1901,"{Wifi,""Air conditioning"",Kitchen,""Paid parking...",2019-01-14,2.170701,Private room,1.0,$42.00,"Barcelona, Cataluña, Spain",31377.0,0.09,23.0,2.0,95.0,Apartment,2019-01-14,41.410969,14.0,Horta-Guinardó,30.0,184,t
3,2.0,1.0,39.0,1901,"{Wifi,""Air conditioning"",Kitchen,""Paid parking...",2019-01-14,2.170819,Private room,1.0,$53.00,"Barcelona, Cataluña, Spain",31380.0,0.9,38.0,3.0,87.0,Apartment,2019-01-14,41.4109,24.0,Horta-Guinardó,47.0,204,t
4,1.0,1.0,151.0,1901,"{TV,Wifi,""Air conditioning"",Kitchen,Elevator,H...",2019-01-14,2.159376,Entire home/apt,1.0,$60.00,"Barcelona, Cataluña, Spain",31958.0,1.57,52.0,4.0,91.0,Apartment,2019-01-14,41.409498,25.0,Gràcia,80.0,342,t


### 1. Dropping useless columns

In [19]:
# Neither column carries information:
#   - 'has_availability' is 't' for every row;
#   - 'calendar_last_scraped' holds the same date as 'last_scraped'.
cols2drop = ['has_availability', 'calendar_last_scraped']
listings = listings.drop(cols2drop, axis=1)

### 2. Dealing with numerical columns

In [20]:
# Names of the columns that are treated as numeric in this section.
numerical_features = [
    'beds', 'bedrooms', 'number_of_reviews', 'price',
    'longitude', 'latitude', 'bathrooms', 'id', 'reviews_per_month',
    'availability_30', 'availability_60', 'availability_90', 'availability_365',
    'accommodates', 'review_scores_rating',
]

In [22]:
# Columns that hold whole numbers but are stored as floats.
float2int = ['beds', 'bedrooms', 'number_of_reviews', 'bathrooms', 'id', 'availability_30',
             'availability_60', 'availability_90', 'availability_365', 'accommodates']

# Vectorised cast instead of `applymap(lambda x: int(x))`: the element-wise
# version makes one Python call per cell and relies on `DataFrame.applymap`,
# which is deprecated in pandas >= 2.1. `astype` truncates toward zero like
# `int()` and still raises if a column contains NaN or infinite values.
listings[float2int] = listings[float2int].astype('int64')

Remove every character other than digits and '.' from the float-valued numerical columns:

In [50]:
ncols2clean = ['price', 'reviews_per_month', 'review_scores_rating']

# Matches every character that is neither a digit nor a decimal point.
regex = re.compile(r'[^\d.]')


def _to_float(column):
    """Return `column` converted to floats.

    Text columns have every character other than digits and '.' removed
    before parsing, so '$1,200.00' becomes 1200.0. The previous pattern
    r'(\d+\.\d{1,2})' picked up only '200.00' from such a value, failed on
    values without a decimal point, and cut off anything past two decimals.
    Columns that are already numeric are only cast to float; running them
    through the string clean-up could mangle their text representation.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column.astype(float)
    cleaned = column.astype(str).str.replace(regex, '', regex=True)
    # Force float so integer-looking strings ('80') do not yield an int column.
    return pd.to_numeric(cleaned).astype(float)


listings[ncols2clean] = listings[ncols2clean].apply(_to_float)

### 3. Removing listings with no availability

In [54]:
# A listing counts as unavailable when all four availability windows are 0.
availability_cols = ['availability_30', 'availability_60',
                     'availability_90', 'availability_365']
no_availability = listings[availability_cols].eq(0).all(axis=1)
idx2drop = listings.index[no_availability]

In [55]:
# Report how many rows are about to be removed.
print('Removing {} rows from the dataset.'.format(idx2drop.size))

Removing 91425 rows from the dataset.


In [56]:
# Keep the unavailable rows in their own frame, then remove them from `listings`.
unavailable_listings = listings.loc[idx2drop]
listings = listings.drop(index=idx2drop)

### 4. Save clean data

In [57]:
# Write the cleaned listings next to the input file, without the index column.
listings.to_csv(
    path_or_buf=data_dir + 'listings_clean.csv.gz',
    index=False,
)