In [1]:
from pathlib import Path
import pandas as pd
import re

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation warnings — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

from IPython.display import display
# Do not truncate the column list when a DataFrame is rendered.
pd.options.display.max_columns = None

In [2]:
# Project root: two directory levels above the current working directory.
HOME = Path().resolve().parent.parent
# Kept as a plain string ending in '/' because file names are appended with `+`.
data_dir = f'{HOME}/data/meus dados/'

In [3]:
# Read the listings CSV from data_dir, parsing 'last_scraped' as dates.
listings_path = data_dir + 'listings_notna_1908-2002.csv'
listings = pd.read_csv(listings_path, parse_dates=['last_scraped'])
# (rows, columns) of the freshly loaded table.
listings.shape

(142050, 44)

### 1. Dropping useless columns

In [4]:
# Columns removed because they carry no information in this dataset:
#   experiences_offered       -> every value is 'none'
#   requires_license          -> every value is 't'
#   is_business_travel_ready  -> every value is 'f'
#   has_availability          -> every value is 't'
#   calendar_last_scraped     -> same date as 'last_scraped'
cols2drop = [
    'experiences_offered',
    'requires_license',
    'is_business_travel_ready',
    'has_availability',
    'calendar_last_scraped',
]
listings = listings.drop(cols2drop, axis=1)

### 2. Dealing with numerical columns

Remove the dollar sign and the thousands separators from the monetary columns and convert them to floats:

In [5]:
# Monetary columns whose string values are converted to floats in the next cell.
ncols2clean = [
    'price',
    'security_deposit',
    'cleaning_fee',
    'extra_people',
]

In [6]:
def _money_to_float(col):
    """Convert one column of '$'-prefixed, ','-separated strings to float.

    Missing values stay NaN. A column that is already numeric (for example
    because this cell was re-run) is only cast to float.
    """
    if pd.api.types.is_numeric_dtype(col):
        return col.astype(float)
    # Inside a character class '$' is a literal, so this strips '$' and ','.
    return col.str.replace(r'[$,]', '', regex=True).astype(float)


# Column-wise, vectorised replacement for the former element-wise `applymap`
# call, which is deprecated in recent pandas and raised AttributeError on NaN.
listings[ncols2clean] = listings[ncols2clean].apply(_money_to_float)

Strip non-numeric characters from the string entries of the 'review_scores_rating' column and convert them to numbers:

In [7]:
# This regex matches every character that is neither a digit nor a '.'.
# The previous pattern r'[^\d^\.]' had a second '^' inside the character class,
# which is a literal there, so '^' characters were not stripped.
regex = r'[^\d.]'

# Only string entries are cleaned and parsed; any other value (numbers, NaN)
# is passed through unchanged.
listings['review_scores_rating'] = listings['review_scores_rating'].map(
    lambda x: pd.to_numeric(re.sub(regex, '', x)) if isinstance(x, str) else x)

### 3. Removing listings with no availability

In [8]:
# Index labels of the rows where all four availability columns are 0.
availability_cols = ['availability_30', 'availability_60',
                     'availability_90', 'availability_365']
never_available = listings[availability_cols].eq(0).all(axis=1)
idx2drop = listings.loc[never_available].index

In [9]:
# Remove the rows whose index labels were collected in idx2drop.
listings = listings.drop(index=idx2drop)

### 4. Replacing strings with numerical values

In [10]:
# 'f' becomes 0 and 't' becomes 1; `replace` leaves any other value untouched.
bool_mapper = {'f': 0, 't': 1}
bool_cols = [
    'require_guest_profile_picture',
    'require_guest_phone_verification',
    'is_location_exact',
]

listings[bool_cols] = listings[bool_cols].replace(bool_mapper)

### 5. Save clean data

In [11]:
# Write the cleaned table as CSV into data_dir; the index is not written.
clean_path = data_dir + 'listings_clean_1908-2002.csv'
listings.to_csv(clean_path, index=False)