In [144]:
import pandas as pd
import numpy as np
from datetime import datetime

In [145]:
df_listing = pd.read_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\data\Airbnb\listings.csv')
df_calendar = pd.read_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\data\Airbnb\calendar.csv')


In [146]:
display(df_calendar.head())
display(df_listing.head())

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,6400,2023-12-19,f,$100.00,,4.0,5.0
1,6400,2023-12-20,f,$100.00,,4.0,5.0
2,6400,2023-12-21,f,$100.00,,4.0,5.0
3,6400,2023-12-22,f,$100.00,,4.0,5.0
4,6400,2023-12-23,f,$100.00,,4.0,5.0


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,6400,Rental unit in Milan · ★4.89 · 3 bedrooms · 1 bed · 3.5 baths,13822,Francesca,,TIBALDI,45.44119,9.17813,Private room,100.0,4,10,2019-04-13,0.06,1,358,0,
1,23986,Rental unit in Milan · ★4.64 · 1 bedroom · 1 bed · 1 bath,95941,Jeremy,,NAVIGLI,45.44806,9.17373,Entire home/apt,150.0,1,26,2023-07-29,0.18,1,359,4,
2,24107,Condo in Milan · ★4.50 · 1 bedroom · 6 beds · 1 bath,46951,Valeria,,CITTA' STUDI,45.47179,9.23669,Entire home/apt,100.0,1,4,2013-07-31,0.02,1,365,0,
3,40470,Rental unit in Milan · ★4.67 · 2 bedrooms · 4 beds · 1 bath,174203,Giacinto,,VIALE MONZA,45.52023,9.22747,Entire home/apt,80.0,3,41,2023-09-09,0.26,2,290,3,
4,304050,Rental unit in Milan · ★4.91 · 1 bedroom · 1 bed · 1 private bath,1566887,Elena,,XXII MARZO,45.45709,9.21531,Private room,49.0,2,264,2023-11-21,1.92,2,0,33,


### Cleaning the unwanted clumns from listing

In [147]:
df_listing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24936 entries, 0 to 24935
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              24936 non-null  int64  
 1   name                            24936 non-null  object 
 2   host_id                         24936 non-null  int64  
 3   host_name                       24936 non-null  object 
 4   neighbourhood_group             0 non-null      float64
 5   neighbourhood                   24936 non-null  object 
 6   latitude                        24936 non-null  float64
 7   longitude                       24936 non-null  float64
 8   room_type                       24936 non-null  object 
 9   price                           21023 non-null  float64
 10  minimum_nights                  24936 non-null  int64  
 11  number_of_reviews               24936 non-null  int64  
 12  last_review                     

In [148]:
unwanted = ['license','availability_365','host_name','neighbourhood_group']
df_listing.drop(columns=unwanted, inplace=True)

### Keep only the listings that have at least one review in 2023

In [149]:
df_listing['last_review'] = pd.to_datetime(df_listing['last_review'], format=r'%Y-%m-%d')

In [150]:
year2023 = datetime(2023,1,1)

df_listing = df_listing[df_listing['last_review'] > year2023]

In [151]:
df_listing.head()

Unnamed: 0,id,name,host_id,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,number_of_reviews_ltm
1,23986,Rental unit in Milan · ★4.64 · 1 bedroom · 1 bed · 1 bath,95941,NAVIGLI,45.44806,9.17373,Entire home/apt,150.0,1,26,2023-07-29,0.18,1,4
3,40470,Rental unit in Milan · ★4.67 · 2 bedrooms · 4 beds · 1 bath,174203,VIALE MONZA,45.52023,9.22747,Entire home/apt,80.0,3,41,2023-09-09,0.26,2,3
4,304050,Rental unit in Milan · ★4.91 · 1 bedroom · 1 bed · 1 private bath,1566887,XXII MARZO,45.45709,9.21531,Private room,49.0,2,264,2023-11-21,1.92,2,33
5,46536,Rental unit in Milan · ★4.53 · 2 bedrooms · 3 beds · 1 bath,138683,VIALE MONZA,45.52276,9.22478,Entire home/apt,110.0,3,35,2023-10-30,0.24,1,5
12,333223,Rental unit in Milano · ★5.0 · 1 bedroom · 2 beds · 1 shared bath,1697947,MAGENTA - S. VITTORE,45.46097,9.16431,Private room,150.0,1,3,2023-04-19,0.02,5,1


Some listing in df_listing don't have price. I use the mean of the price for the next 365 days (in df_calendar) as a proxy for 'price.'

This way, I'll drop as few rows as possible.

In [152]:
# Convert price to float32

df_calendar['price'] = df_calendar['price'].str.replace('$', '')
df_calendar['price'] = df_calendar['price'].str.replace(',', '')

df_calendar['price'] = df_calendar['price'].astype('float32')

In [153]:
grouped_price = df_calendar.groupby('listing_id')['price'].mean().to_frame().reset_index()
grouped_price.head(3)

Unnamed: 0,listing_id,price
0,6400,100.0
1,23986,150.0
2,24107,100.0


In [154]:
df_listing = pd.merge(df_listing, grouped_price,
            left_on='id', right_on='listing_id',
            how='left')

df_listing['price'] = df_listing['price_x'].fillna(df_listing['price_y'])
df_listing.drop(columns=['price_x','price_y','listing_id'], inplace=True)
df_listing.head()

Unnamed: 0,id,name,host_id,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,number_of_reviews_ltm,price
0,23986,Rental unit in Milan · ★4.64 · 1 bedroom · 1 bed · 1 bath,95941,NAVIGLI,45.44806,9.17373,Entire home/apt,1,26,2023-07-29,0.18,1,4,150.0
1,40470,Rental unit in Milan · ★4.67 · 2 bedrooms · 4 beds · 1 bath,174203,VIALE MONZA,45.52023,9.22747,Entire home/apt,3,41,2023-09-09,0.26,2,3,80.0
2,304050,Rental unit in Milan · ★4.91 · 1 bedroom · 1 bed · 1 private bath,1566887,XXII MARZO,45.45709,9.21531,Private room,2,264,2023-11-21,1.92,2,33,49.0
3,46536,Rental unit in Milan · ★4.53 · 2 bedrooms · 3 beds · 1 bath,138683,VIALE MONZA,45.52276,9.22478,Entire home/apt,3,35,2023-10-30,0.24,1,5,110.0
4,333223,Rental unit in Milano · ★5.0 · 1 bedroom · 2 beds · 1 shared bath,1697947,MAGENTA - S. VITTORE,45.46097,9.16431,Private room,1,3,2023-04-19,0.02,5,1,150.0


### Create a column with the star number

In [155]:
pattern = r'★([\d\.]+)'

df_listing['stars'] = df_listing['name'].str.findall(pattern).str.get(0)
df_listing.head()

Unnamed: 0,id,name,host_id,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,number_of_reviews_ltm,price,stars
0,23986,Rental unit in Milan · ★4.64 · 1 bedroom · 1 bed · 1 bath,95941,NAVIGLI,45.44806,9.17373,Entire home/apt,1,26,2023-07-29,0.18,1,4,150.0,4.64
1,40470,Rental unit in Milan · ★4.67 · 2 bedrooms · 4 beds · 1 bath,174203,VIALE MONZA,45.52023,9.22747,Entire home/apt,3,41,2023-09-09,0.26,2,3,80.0,4.67
2,304050,Rental unit in Milan · ★4.91 · 1 bedroom · 1 bed · 1 private bath,1566887,XXII MARZO,45.45709,9.21531,Private room,2,264,2023-11-21,1.92,2,33,49.0,4.91
3,46536,Rental unit in Milan · ★4.53 · 2 bedrooms · 3 beds · 1 bath,138683,VIALE MONZA,45.52276,9.22478,Entire home/apt,3,35,2023-10-30,0.24,1,5,110.0,4.53
4,333223,Rental unit in Milano · ★5.0 · 1 bedroom · 2 beds · 1 shared bath,1697947,MAGENTA - S. VITTORE,45.46097,9.16431,Private room,1,3,2023-04-19,0.02,5,1,150.0,5.0


In [156]:
df_listing.iloc[4,1]

'Rental unit in Milano · ★5.0 · 1 bedroom · 2 beds · 1 shared bath'

In [157]:
df_listing.iloc[2,1]

'Rental unit in Milan · ★4.91 · 1 bedroom · 1 bed · 1 private bath'

In [158]:
# Some listing have for example 1.5 beds.
# For now I get the 1.5.

pattern_bedroom = r'([\d\.]*) bedr'

df_listing['bedroom'] = df_listing['name'].str.findall(pattern_bedroom).str.get(0)

In [159]:
pattern_bed = r'([\d\.]*) beds? '

df_listing['bed'] = df_listing['name'].str.findall(pattern_bed).str.get(0)

In [160]:
# After an iterative process I found that
# there are only bath, private bath, and shared bath

pattern_bath = r'([\d\.]*)[\sprivate]*[\sshared]* bath'

df_listing['bath'] = df_listing['name'].str.findall(pattern_bath).str.get(0)

### Adjust the order and the dtypes

In [161]:
new_order = ['id','name','price','room_type','number_of_reviews','stars','bed',
             'bedroom','bath','neighbourhood','latitude', 'longitude','minimum_nights',
             'last_review','reviews_per_month','number_of_reviews_ltm',
             'calculated_host_listings_count', 'host_id',]

df_listing = df_listing[new_order]

In [175]:
df_listing.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15070 entries, 0 to 15258
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              15070 non-null  int64         
 1   name                            15070 non-null  object        
 2   price                           15070 non-null  float32       
 3   room_type                       15070 non-null  object        
 4   number_of_reviews               15070 non-null  float32       
 5   stars                           12977 non-null  float32       
 6   bed                             15070 non-null  float32       
 7   bedroom                         15070 non-null  float32       
 8   bath                            15044 non-null  float32       
 9   neighbourhood                   15070 non-null  object        
 10  latitude                        15070 non-null  float32       
 11  longitu

In [163]:
col_to_convert = ['stars','price','number_of_reviews','bed','bedroom','bath',
                  'minimum_nights','number_of_reviews_ltm','calculated_host_listings_count',
                  'latitude','longitude','reviews_per_month']

df_listing[col_to_convert] = df_listing[col_to_convert].astype('float32')

### Handle the last null columns

In [164]:
pd.set_option('display.max_colwidth', None)

In [165]:
df_listing['id'].duplicated().any()

False

In [166]:
# Some listings don't have stars, for now I keep the null values

df_listing.loc[df_listing['stars'].isna(), 'name'][:10]

68         Vacation home in Milan · 1 bedroom · 1 bed · 1 bath
198          Rental unit in Milan · 1 bedroom · 1 bed · 1 bath
289         Rental unit in Milan · 1 bedroom · 2 beds · 1 bath
326         Rental unit in Milan · 1 bedroom · 2 beds · 1 bath
330                Home in Milano · 1 bedroom · 1 bed · 1 bath
361    Boutique hotel in Milan · 1 bedroom · 1 bed · 1.5 baths
463            Condo in Milano · 3 bedrooms · 3 beds · 2 baths
498        Rental unit in Milano · 1 bedroom · 2 beds · 1 bath
515              Home in Milano · 2 bedrooms · 4 beds · 1 bath
542              Condo in Milano · 1 bedroom · 2 beds · 1 bath
Name: name, dtype: object

In [167]:
df_listing.loc[df_listing['bed'].isna(), 'name'][:10]

8              Rental unit in Milan · ★4.85 · 1 bedroom · 1 bath
119                   Loft in Milan · ★4.56 · 1 bedroom · 1 bath
370             Rental unit in Milan · ★5.0 · 1 bedroom · 1 bath
410                  Loft in Milano · ★4.54 · 1 bedroom · 1 bath
549           Rental unit in Milano · ★4.94 · 1 bedroom · 1 bath
580            Rental unit in Milan · ★4.77 · 1 bedroom · 1 bath
741            Rental unit in Milan · ★4.77 · 1 bedroom · 1 bath
925     Guesthouse in Milano · ★4.87 · 1 bedroom · 1 shared bath
1093          Rental unit in Milano · ★4.98 · 1 bedroom · 1 bath
1148           Guest suite in Milan · ★4.94 · 1 bedroom · 1 bath
Name: name, dtype: object

In [171]:
# It's really strange a home with a bedroom but with no beds.
# Since there are few rows with this problem, I just drop them.

df_listing.dropna(subset=['bed'], inplace=True)

In [173]:
# It makes sense that some homes don't have a bedroom.
# I fill the na values with 0.

df_listing.loc[df_listing['bedroom'].isna(), 'name'][:10]

12            Rental unit in Milan · ★4.54 · Studio · 1 bed · 1 bath
25                 Condo in Milan · ★4.74 · Studio · 2 beds · 1 bath
56     Serviced apartment in Milan · ★5.0 · Studio · 1 bed · 0 baths
122           Rental unit in Milan · ★4.80 · Studio · 1 bed · 1 bath
150          Rental unit in Milan · ★4.77 · Studio · 2 beds · 1 bath
205           Rental unit in Milan · ★4.79 · Studio · 1 bed · 1 bath
219          Rental unit in Milano · ★4.90 · Studio · 1 bed · 1 bath
241          Rental unit in Milano · ★4.94 · Studio · 1 bed · 1 bath
248         Rental unit in Milano · ★4.89 · Studio · 2 beds · 1 bath
259          Rental unit in Milan · ★4.75 · Studio · 2 beds · 1 bath
Name: name, dtype: object

In [174]:
df_listing['bedroom'] = df_listing['bedroom'].fillna(0)

In [182]:
# I handle the strange listings with, for example, 1.5 beds

def is_not_whole(n):

    '''This function will return a boolean,
    depending on whether n is a whole number.'''

    return n % 1 != 0

for i in ['bath','bed','bedroom']:
    print(f'Non whole rows for {i}:', df_listing[i].apply(is_not_whole).sum())

Non whole rows for bath: 547
Non whole rows for bed: 0
Non whole rows for bedroom: 0


In [187]:
# I round the number to the lower whole number

df_listing['bath'] = np.floor(df_listing['bath'])

In [188]:
df_listing.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15070 entries, 0 to 15258
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              15070 non-null  int64         
 1   name                            15070 non-null  object        
 2   price                           15070 non-null  float32       
 3   room_type                       15070 non-null  object        
 4   number_of_reviews               15070 non-null  float32       
 5   stars                           12977 non-null  float32       
 6   bed                             15070 non-null  float32       
 7   bedroom                         15070 non-null  float32       
 8   bath                            15044 non-null  float32       
 9   neighbourhood                   15070 non-null  object        
 10  latitude                        15070 non-null  float32       
 11  longitu

### Convert the cleaned dataset to CSV

In [189]:
df_listing.to_csv(r'C:\Users\matte\OneDrive\Desktop\GitHub\data\Airbnb\listings_cleaned.csv',
                  index=False)