# Common Functions 

Importiamo e puliamo i dati in un unico notebook.

In questo modo non è necessario ripetere ogni volta le stesse operazioni negli altri notebook.

- Import dei dati
  - listings
  - reviews
  - calendar
- Pulizia dei dati
- Salvataggio finale

In [27]:
import pandas as pd
import numpy as np
import os

In [28]:
# Utility to inspect the structure of a DataFrame at a glance
def check_df(dataframe, sample=False):
    """Print a short structural summary of a DataFrame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to describe; its ``shape``, ``columns`` and ``head`` are used.
    sample : bool, default False
        When truthy, the first five rows are printed as well.

    Returns
    -------
    None
        The summary is written to stdout only.
    """
    n_rows, n_cols = dataframe.shape
    print(f"Dataframe Shape: {dataframe.shape} with rows: {n_rows} and columns: {n_cols}")
    print(f"\nDF Columns: \n{list(dataframe.columns)}")
    # Plain truthiness check rather than comparing against the True singleton.
    if sample:
        print(f"\nData:\n{dataframe.head(5)}")
    

# Import dei dati

In [29]:
# Project root: the parent of the current working directory.
folder_path = os.path.abspath("..")
# Dataset folder, relative to the project root.
data_dir = 'dataset/rome-airbnb'

# File names of the three raw CSV files.
filename_listings = 'listings.csv'
filename_reviews = 'reviews.csv'
filename_calendar = 'calendar.csv'

# Full path of each CSV, built on the shared dataset folder.
dataset_path = os.path.join(folder_path, data_dir)
filepath_listings = os.path.join(dataset_path, filename_listings)
filepath_reviews = os.path.join(dataset_path, filename_reviews)
filepath_calendar = os.path.join(dataset_path, filename_calendar)


### Import Listings

In [30]:
%%time
#Import listings
listings_raw = pd.read_csv(filepath_listings)
check_df(listings_raw)

Dataframe Shape: (3818, 92) with rows: 3818 and columns: 92

DF Columns: 
['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary', 'space', 'description', 'experiences_offered', 'neighborhood_overview', 'notes', 'transit', 'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'street', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market', 'smart_location', 'country_code', 'country', 'latitude', 'longitude', 'is_location_exact', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet', 'price', 

### Import Reviews

In [31]:
%time
#Import reviews
sample_dim = 100000

reviews_raw = pd.read_csv(filepath_reviews).head(sample_dim)
check_df(reviews_raw)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.29 µs
Dataframe Shape: (84849, 6) with rows: 84849 and columns: 6

DF Columns: 
['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments']


### Import Calendar

In [32]:
%time
calendar_raw = pd.read_csv(filepath_calendar).head(sample_dim)
check_df(calendar_raw)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.77 µs
Dataframe Shape: (100000, 4) with rows: 100000 and columns: 4

DF Columns: 
['listing_id', 'date', 'available', 'price']


# Pulizia Dati

## Pulizia Listings

In [33]:
# Restrict the raw listings to the columns of interest
columns_to_keep = [
    'id', 'name', 'longitude', 'latitude',
    'listing_url',
    'instant_bookable',
    'host_response_time',
    'review_scores_rating',
    'property_type',
    'room_type', 'accommodates',
    'bathrooms', 'bedrooms', 'beds', 'reviews_per_month', 'amenities',
    'number_of_reviews',
    'price',
]
listings = listings_raw[columns_to_keep]
listings.head()

Unnamed: 0,id,name,longitude,latitude,listing_url,instant_bookable,host_response_time,review_scores_rating,property_type,room_type,accommodates,bathrooms,bedrooms,beds,reviews_per_month,amenities,number_of_reviews,price
0,241032,Stylish Queen Anne Apartment,-122.371025,47.636289,https://www.airbnb.com/rooms/241032,f,within a few hours,95.0,Apartment,Entire home/apt,4,1.0,1.0,1.0,4.07,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",207,$85.00
1,953595,Bright & Airy Queen Anne Apartment,-122.365666,47.639123,https://www.airbnb.com/rooms/953595,f,within an hour,96.0,Apartment,Entire home/apt,4,1.0,1.0,1.0,1.48,"{TV,Internet,""Wireless Internet"",Kitchen,""Free...",43,$150.00
2,3308979,New Modern House-Amazing water view,-122.369483,47.629724,https://www.airbnb.com/rooms/3308979,f,within a few hours,97.0,House,Entire home/apt,11,4.5,5.0,7.0,1.15,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",20,$975.00
3,7421966,Queen Anne Chateau,-122.369279,47.638473,https://www.airbnb.com/rooms/7421966,f,,,Apartment,Entire home/apt,3,1.0,0.0,2.0,,"{Internet,""Wireless Internet"",Kitchen,""Indoor ...",0,$100.00
4,278830,Charming craftsman 3 bdm house,-122.372471,47.632918,https://www.airbnb.com/rooms/278830,f,within an hour,92.0,House,Entire home/apt,6,2.0,3.0,3.0,0.89,"{TV,""Cable TV"",Internet,""Wireless Internet"",Ki...",38,$450.00


Conversione della colonna `price` da stringa (es. `$1,250.00`) a numero reale.

In [34]:
def _price_to_float(label):
    """Convert a single price label such as ``'$1,250.00'`` to a float.

    Strings have ``'$'`` and ``','`` stripped before conversion. Missing
    values (``None``/``NaN``/``pd.NA``) become ``NaN``; any other value is
    passed to ``float`` unchanged, so already-numeric prices are accepted.
    """
    if isinstance(label, str):
        return float(label.replace('$', '').replace(',', ''))
    return np.nan if pd.isna(label) else float(label)


# Element-wise version of `_price_to_float`: accepts a scalar, array or Series
# and returns a float ndarray. `otypes` pins the output dtype, which also lets
# the function handle empty inputs (np.vectorize cannot infer a dtype there).
remove_dollar = np.vectorize(_price_to_float, otypes=[float])

# Parse the price from the untouched raw column so this cell can be re-run:
# once `listings.price` has been converted it no longer holds strings.
# `listings` is a column subset of `listings_raw`, so rows match positionally.
listings = listings.assign(price=remove_dollar(listings_raw['price']))
listings[['price']]

Unnamed: 0,price
0,85.0
1,150.0
2,975.0
3,100.0
4,450.0
...,...
3813,359.0
3814,79.0
3815,93.0
3816,99.0


## Pulizia Reviews

In [35]:
# Parse the `date` column as datetimes; `reviews_raw` itself is left unchanged
reviews = reviews_raw.assign(date=lambda frame: pd.to_datetime(frame['date']))

Creazione delle variabili mese e anno

In [36]:
# Derive `year` and `month` from the parsed date, then sort by both in
# descending order. Built as one chain that returns a new frame.
reviews = (
    reviews
    .assign(
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
    )
    .sort_values(['year', 'month'], ascending=False)
)
reviews.shape

(84849, 8)

## Pulizia Calendar

In [38]:
# Parse the `date` column as datetimes; `calendar_raw` itself is left unchanged
calendar = calendar_raw.assign(date=lambda frame: pd.to_datetime(frame['date']))

Preparazione del prezzo

In [39]:
# Strip the currency symbol and the thousands separator, then parse as number.
# - regex=False: '$' is a regex metacharacter, so both patterns are passed as
#   literal strings rather than relying on the default of `regex`.
# - The text is read from `calendar_raw` (same index as `calendar`) so the cell
#   can be re-run after `calendar.price` has already become numeric.
calendar = calendar.assign(
    price=pd.to_numeric(
        calendar_raw['price']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    ),
)

  price          = pd.to_numeric(calendar.price.str.replace('$','').str.replace(',','')),


In [None]:
# Derive `year` and `month` from the parsed date, then sort by both in
# descending order. Uses the `.dt` accessor, as the reviews cleaning does.
calendar = (
    calendar
    .assign(
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
    )
    .sort_values(['year', 'month'], ascending=False)
)
calendar.shape

Mappatura dei valori logici

In [40]:
# Map the 't'/'f' flags to booleans. The flags are read from `calendar_raw`
# (assignment aligns on the shared index) so re-running the cell does not map
# an already-boolean column to all-NaN.
calendar = calendar.assign(
    available=calendar_raw['available'].map({'t': True, 'f': False})
)

# Final Export

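Export finale dei dati: i DataFrame puliti vengono copiati nelle variabili `df_listing`, `df_review` e `df_calendar` (nessun file viene scritto su disco).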

In [44]:
# Publish independent (deep) copies of the cleaned frames under their final names
df_listing = listings.copy(deep=True)
df_review = reviews.copy(deep=True)
df_calendar = calendar.copy(deep=True)

# Completion banner followed by one line per exported variable
print("\n---Cleaning and Parsing of the data Completed---\n")
print(
    "Listing parsed and saved with name: df_listing",
    "Review parsed and saved with name: df_review",
    "Calendar parsed and saved with name: df_calendar",
    sep="\n",
)


---Cleaning and Parsing of the data Completed---

Listing parsed and saved with name: df_listing
Review parsed and saved with name: df_review
Calendar parsed and saved with name: df_calendar
