In [6]:
from pathlib import Path
import pandas as pd
import re

import warnings
# NOTE(review): this silences every warning for the whole session, including
# the ones pandas itself emits (e.g. dtype or deprecation warnings).
warnings.filterwarnings('ignore')

from IPython.display import display
# Do not elide columns when a wide DataFrame is displayed.
pd.options.display.max_columns = None

In [7]:
# The project root is taken to be two levels above the current working directory.
working_dir = Path().resolve()
HOME = working_dir.parent.parent

# Kept as a plain string (with a trailing slash) because later cells build
# file paths by concatenating a file name onto it.
data_dir = ''.join([str(HOME), '/data/processed/'])

In [8]:
# Load the processed listings file, parsing the scrape date as a datetime column.
listings_path = data_dir + 'listings_notna.csv.gz'
df = pd.read_csv(listings_path, parse_dates=['last_scraped'])
df.shape

(550752, 25)

In [9]:
# Work on an independent copy so the frame loaded from disk stays untouched.
listings = df.copy(deep=True)

In [10]:
# Preview the first five rows.
listings.head(n=5)

Unnamed: 0,availability_60,CasosCovMesAnt,neighbourhood_group_cleansed,has_availability,longitude,beds,id,last_scraped,bedrooms,accommodates,room_type,number_of_reviews,CasosCovMes,file_date,price,availability_90,amenities,latitude,availability_30,availability_365,review_scores_rating,calendar_last_scraped,bathrooms,reviews_per_month,property_type
0,0.0,0.0,Sant Martí,t,2.185545,4.0,18666.0,2019-01-15,2.0,6.0,Entire home/apt,1.0,0.0,1901,$130.00,0.0,"{TV,Internet,Wifi,""Air conditioning"",""Wheelcha...",41.408886,0.0,6,80.0,2019-01-15,1.0,0.03,Apartment
1,27.0,0.0,Eixample,t,2.173058,6.0,18674.0,2019-01-14,3.0,8.0,Entire home/apt,5.0,0.0,1901,$60.00,55.0,"{TV,Internet,Wifi,""Air conditioning"",""Wheelcha...",41.404197,12.0,326,85.0,2019-01-14,2.0,0.07,Apartment
2,23.0,0.0,Horta-Guinardó,t,2.170701,2.0,31377.0,2019-01-14,1.0,2.0,Private room,4.0,0.0,1901,$42.00,30.0,"{Wifi,""Air conditioning"",Kitchen,""Paid parking...",41.410969,14.0,184,95.0,2019-01-14,1.0,0.09,Apartment
3,38.0,0.0,Horta-Guinardó,t,2.170819,2.0,31380.0,2019-01-14,1.0,3.0,Private room,39.0,0.0,1901,$53.00,47.0,"{Wifi,""Air conditioning"",Kitchen,""Paid parking...",41.4109,24.0,204,87.0,2019-01-14,1.0,0.9,Apartment
4,52.0,0.0,Gràcia,t,2.159376,1.0,31958.0,2019-01-14,1.0,4.0,Entire home/apt,151.0,0.0,1901,$60.00,80.0,"{TV,Wifi,""Air conditioning"",Kitchen,Elevator,H...",41.409498,25.0,342,91.0,2019-01-14,1.0,1.57,Apartment


### 1. Dropping useless columns

In [11]:
# Columns that carry no information for this dataset:
#   - 'has_availability': all values are 't'
#   - 'calendar_last_scraped': the date is equal to 'last_scraped'
cols2drop = ['has_availability', 'calendar_last_scraped']

listings = listings.drop(cols2drop, axis=1)

### 2. Dealing with numerical columns

In [12]:
# Names of the columns treated as numerical features, in the original order.
numerical_features = [
    'beds',
    'bedrooms',
    'number_of_reviews',
    'price',
    'longitude',
    'latitude',
    'bathrooms',
    'id',
    'reviews_per_month',
    'availability_30',
    'availability_60',
    'availability_90',
    'availability_365',
    'accommodates',
    'review_scores_rating',
]

In [13]:
# Columns that are loaded as floats (e.g. 18666.0) and are converted to integers here.
float2int = ['beds','bedrooms', 'number_of_reviews', 'bathrooms', 'id', 'availability_30', 
             'availability_60', 'availability_90', 'availability_365', 'accommodates']

# A single vectorised cast replaces the element-wise applymap(int) call, which
# runs a Python function per cell and is deprecated in recent pandas releases.
# int64 is requested explicitly so the width does not depend on the platform
# default. As with int(x), fractional parts are truncated and missing values
# raise an error instead of being converted silently.
listings[float2int] = listings[float2int].astype('int64')

Remove every character other than digits and '.' from the numerical float columns:

In [14]:
ncols2clean = ['price', 'reviews_per_month', 'review_scores_rating']

# Matches every character that is neither a digit nor a decimal point
# (currency symbols, thousands separators, whitespace, ...).
regex = re.compile(r'[^\d.]')


def _strip_to_number(column):
    """Convert one column to numbers after stripping symbols from its text values.

    Only ``str`` elements are rewritten; elements that are already numeric are
    handed to ``pd.to_numeric`` unchanged, so the cell works whether a column
    was loaded as text, as floats, or as a mix of both.

    Removing the unwanted characters (instead of searching for the first
    ``digits.digits`` group) keeps the thousands part of values such as
    "$1,200.00" and accepts values written without a decimal point.
    Leftovers that still cannot be parsed make ``pd.to_numeric`` raise.
    """
    stripped = column.map(
        lambda value: regex.sub('', value) if isinstance(value, str) else value
    )
    return pd.to_numeric(stripped)


# Column-wise conversion: one to_numeric call per column rather than one per cell.
listings[ncols2clean] = listings[ncols2clean].apply(_strip_to_number)

### 3. Removing listings with no availability

In [15]:
# A row is selected only when all four availability windows are zero.
availability_cols = ['availability_30', 'availability_60', 'availability_90', 'availability_365']
never_available = listings[availability_cols].eq(0).all(axis=1)
idx2drop = listings.index[never_available]

In [16]:
# Report how many rows were selected for removal.
n_rows_to_drop = len(idx2drop)
print('Removing {} rows from the dataset.'.format(n_rows_to_drop))

Removing 91957 rows from the dataset.


In [17]:
# Set the selected rows aside first, then remove them from the working frame.
unavailable_listings = listings.loc[idx2drop]
listings = listings.drop(index=idx2drop)

### 4. Save clean data

In [18]:
# Write the cleaned listings without the DataFrame index.
clean_path = data_dir + 'listings_clean.csv.gz'
listings.to_csv(clean_path, index=False)

In [19]:
# Write the rows that were set aside to their own file, also without the index.
unavailable_path = data_dir + 'unavailable_listings.csv.gz'
unavailable_listings.to_csv(unavailable_path, index=False)