## Data Loading

In [66]:
import pandas as pd

# Open the multi-sheet Excel workbook served by the Google Sheets "export as xlsx" URL.
# pd.ExcelFile (rather than pd.read_excel) gives a handle exposing `sheet_names`
# and `parse`, so individual sheets can be read from it afterwards.
url = 'https://docs.google.com/spreadsheets/d/1ecopK6oyyb4d_7-QLrCr8YlgFrCetHU7-VQfnYej7JY/export?format=xlsx'
airbnb = pd.ExcelFile(url)

In [67]:
# Read every sheet of the workbook, tag its rows with the name of the sheet
# they came from, then stack all sheets into one DataFrame with a fresh index.
sheet_frames = []
for sheet in airbnb.sheet_names:
    sheet_frames.append(airbnb.parse(sheet).assign(sheet_name=sheet))
merged_airbnb = pd.concat(sheet_frames, ignore_index=True)

## 1) Dataset Overview

In [68]:
# Preview the first rows of the merged DataFrame (head() shows 5 rows by default)
merged_airbnb.head()

Unnamed: 0.1,Unnamed: 0,realSum,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,...,bedrooms,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,sheet_name
0,0,194.033698,Private room,False,True,2,False,1,0,10,...,1,5.022964,2.53938,78.690379,4.166708,98.253896,6.846473,4.90569,52.41772,amsterdam_weekdays
1,1,344.245776,Private room,False,True,4,False,0,0,8,...,1,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928,4.90005,52.37432,amsterdam_weekdays
2,2,264.101422,Private room,False,True,2,False,0,1,9,...,1,5.748312,3.651621,75.275877,3.985908,95.386955,6.6467,4.97512,52.36103,amsterdam_weekdays
3,3,433.529398,Private room,False,True,4,False,0,1,9,...,2,0.384862,0.439876,493.272534,26.119108,875.033098,60.973565,4.89417,52.37663,amsterdam_weekdays
4,4,485.552926,Private room,False,True,2,True,0,0,10,...,1,0.544738,0.318693,552.830324,29.272733,815.30574,56.811677,4.90051,52.37508,amsterdam_weekdays


In [69]:
# Per-column summary of the merged DataFrame: column names, non-null counts and dtypes
merged_airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51707 entries, 0 to 51706
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  51707 non-null  int64  
 1   realSum                     51707 non-null  float64
 2   room_type                   51707 non-null  object 
 3   room_shared                 51707 non-null  bool   
 4   room_private                51707 non-null  bool   
 5   person_capacity             51707 non-null  int64  
 6   host_is_superhost           51707 non-null  bool   
 7   multi                       51707 non-null  int64  
 8   biz                         51707 non-null  int64  
 9   cleanliness_rating          51707 non-null  int64  
 10  guest_satisfaction_overall  51707 non-null  int64  
 11  bedrooms                    51707 non-null  int64  
 12  dist                        51707 non-null  float64
 13  metro_dist                  517

In [70]:
# Dimensions of the merged DataFrame as a (rows, columns) tuple
merged_airbnb.shape

(51707, 21)

## 2) Dataset Structure

In [71]:
# Per-column profile of the merged data: dtype, completeness, cardinality and
# the first three distinct non-null values as a quick sample of the content.
column_names = merged_airbnb.columns
Structure = pd.DataFrame({
    'Column name': column_names,
    'Data type': merged_airbnb.dtypes.to_numpy(),
    'Non-null count': merged_airbnb.count().to_numpy(),
    'Unique values': [merged_airbnb[name].nunique() for name in column_names],
    'Example values': [merged_airbnb[name].dropna().unique()[:3] for name in column_names],
})

Structure

Unnamed: 0,Column name,Data type,Non-null count,Unique values,Example values
0,Unnamed: 0,int64,51707,5379,"[0, 1, 2]"
1,realSum,float64,51707,10497,"[194.033698122934, 344.245776017622, 264.10142..."
2,room_type,object,51707,3,"[Private room, Entire home/apt, Shared room]"
3,room_shared,bool,51707,2,"[False, True]"
4,room_private,bool,51707,2,"[True, False]"
5,person_capacity,int64,51707,5,"[2, 4, 3]"
6,host_is_superhost,bool,51707,2,"[False, True]"
7,multi,int64,51707,2,"[1, 0]"
8,biz,int64,51707,2,"[0, 1]"
9,cleanliness_rating,int64,51707,9,"[10, 8, 9]"


## 3) Data Cleaning

In [72]:
# Sheet names look like e.g. "amsterdam_weekdays": the token before the first
# underscore, capitalised, becomes the city label.
sheet_parts = merged_airbnb['sheet_name'].str.split('_')
merged_airbnb['City'] = sheet_parts.str[0].str.capitalize()
merged_airbnb

Unnamed: 0.1,Unnamed: 0,realSum,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,...,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,sheet_name,City
0,0,194.033698,Private room,False,True,2,False,1,0,10,...,5.022964,2.539380,78.690379,4.166708,98.253896,6.846473,4.90569,52.41772,amsterdam_weekdays,Amsterdam
1,1,344.245776,Private room,False,True,4,False,0,0,8,...,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928,4.90005,52.37432,amsterdam_weekdays,Amsterdam
2,2,264.101422,Private room,False,True,2,False,0,1,9,...,5.748312,3.651621,75.275877,3.985908,95.386955,6.646700,4.97512,52.36103,amsterdam_weekdays,Amsterdam
3,3,433.529398,Private room,False,True,4,False,0,1,9,...,0.384862,0.439876,493.272534,26.119108,875.033098,60.973565,4.89417,52.37663,amsterdam_weekdays,Amsterdam
4,4,485.552926,Private room,False,True,2,True,0,0,10,...,0.544738,0.318693,552.830324,29.272733,815.305740,56.811677,4.90051,52.37508,amsterdam_weekdays,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51702,1794,715.938574,Entire home/apt,False,False,6,False,0,1,10,...,0.530181,0.135447,219.402478,15.712158,438.756874,10.604584,16.37940,48.21136,vienna_weekends,Vienna
51703,1795,304.793960,Entire home/apt,False,False,2,False,0,0,8,...,0.810205,0.100839,204.970121,14.678608,342.182813,8.270427,16.38070,48.20296,vienna_weekends,Vienna
51704,1796,637.168969,Entire home/apt,False,False,2,False,0,0,10,...,0.994051,0.202539,169.073402,12.107921,282.296424,6.822996,16.38568,48.20460,vienna_weekends,Vienna
51705,1797,301.054157,Private room,False,True,2,False,0,0,10,...,3.044100,0.287435,109.236574,7.822803,158.563398,3.832416,16.34100,48.19200,vienna_weekends,Vienna


In [73]:
# Lookup table from the capitalised city label to the country it belongs to.
city_to_country = {
    'Amsterdam': 'Netherlands',
    'Athens': 'Greece',
    'Berlin': 'Germany',
    'Barcelona': 'Spain',
    'Budapest': 'Hungary',
    'Lisbon': 'Portugal',
    'London': 'United Kingdom',
    'Paris': 'France',
    'Rome': 'Italy',
    'Vienna': 'Austria'
}
merged_airbnb['Country'] = merged_airbnb['City'].map(city_to_country)

# Series.map leaves NaN for any city that is missing from the lookup, so fail
# loudly here instead of silently carrying country-less rows into the analysis.
unmapped_cities = merged_airbnb.loc[merged_airbnb['Country'].isna(), 'City'].unique()
assert len(unmapped_cities) == 0, f'No country mapping for: {list(unmapped_cities)}'

# Last token of the sheet name, capitalised, with trailing "s" characters
# stripped (e.g. "amsterdam_weekdays" -> "Weekday").
merged_airbnb['Weekday/Weekend'] = merged_airbnb['sheet_name'].str.split('_').str[-1].str.capitalize().str.rstrip('s')
merged_airbnb

Unnamed: 0.1,Unnamed: 0,realSum,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,...,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,sheet_name,City,Country,Weekday/Weekend
0,0,194.033698,Private room,False,True,2,False,1,0,10,...,78.690379,4.166708,98.253896,6.846473,4.90569,52.41772,amsterdam_weekdays,Amsterdam,Netherlands,Weekday
1,1,344.245776,Private room,False,True,4,False,0,0,8,...,631.176378,33.421209,837.280757,58.342928,4.90005,52.37432,amsterdam_weekdays,Amsterdam,Netherlands,Weekday
2,2,264.101422,Private room,False,True,2,False,0,1,9,...,75.275877,3.985908,95.386955,6.646700,4.97512,52.36103,amsterdam_weekdays,Amsterdam,Netherlands,Weekday
3,3,433.529398,Private room,False,True,4,False,0,1,9,...,493.272534,26.119108,875.033098,60.973565,4.89417,52.37663,amsterdam_weekdays,Amsterdam,Netherlands,Weekday
4,4,485.552926,Private room,False,True,2,True,0,0,10,...,552.830324,29.272733,815.305740,56.811677,4.90051,52.37508,amsterdam_weekdays,Amsterdam,Netherlands,Weekday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51702,1794,715.938574,Entire home/apt,False,False,6,False,0,1,10,...,219.402478,15.712158,438.756874,10.604584,16.37940,48.21136,vienna_weekends,Vienna,Austria,Weekend
51703,1795,304.793960,Entire home/apt,False,False,2,False,0,0,8,...,204.970121,14.678608,342.182813,8.270427,16.38070,48.20296,vienna_weekends,Vienna,Austria,Weekend
51704,1796,637.168969,Entire home/apt,False,False,2,False,0,0,10,...,169.073402,12.107921,282.296424,6.822996,16.38568,48.20460,vienna_weekends,Vienna,Austria,Weekend
51705,1797,301.054157,Private room,False,True,2,False,0,0,10,...,109.236574,7.822803,158.563398,3.832416,16.34100,48.19200,vienna_weekends,Vienna,Austria,Weekend


In [74]:
# Rename 'realSum' to 'Price' and drop the 'Unnamed: 0' column (it appears to
# be a per-sheet row counter left over from the source file — confirm).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps the cell safe to re-run: a second execution is a no-op, not a KeyError.
merged_airbnb = (
    merged_airbnb
    .rename(columns={'realSum': 'Price'})
    .drop(columns='Unnamed: 0', errors='ignore')
)
merged_airbnb.head()

Unnamed: 0,Price,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,guest_satisfaction_overall,...,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,sheet_name,City,Country,Weekday/Weekend
0,194.033698,Private room,False,True,2,False,1,0,10,93,...,78.690379,4.166708,98.253896,6.846473,4.90569,52.41772,amsterdam_weekdays,Amsterdam,Netherlands,Weekday
1,344.245776,Private room,False,True,4,False,0,0,8,85,...,631.176378,33.421209,837.280757,58.342928,4.90005,52.37432,amsterdam_weekdays,Amsterdam,Netherlands,Weekday
2,264.101422,Private room,False,True,2,False,0,1,9,87,...,75.275877,3.985908,95.386955,6.6467,4.97512,52.36103,amsterdam_weekdays,Amsterdam,Netherlands,Weekday
3,433.529398,Private room,False,True,4,False,0,1,9,90,...,493.272534,26.119108,875.033098,60.973565,4.89417,52.37663,amsterdam_weekdays,Amsterdam,Netherlands,Weekday
4,485.552926,Private room,False,True,2,True,0,0,10,98,...,552.830324,29.272733,815.30574,56.811677,4.90051,52.37508,amsterdam_weekdays,Amsterdam,Netherlands,Weekday


In [75]:
def host_type(row):
    """Return the host-listings label for one row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'multi' and 'biz'; both are evaluated for
        truthiness.

    Returns
    -------
    str
        'Two to Four' when row['multi'] is truthy (checked first, so it wins
        if both flags are set), 'More than Four' when only row['biz'] is
        truthy, and 'Only One' otherwise.
    """
    if row['multi']:
        return 'Two to Four'
    return 'More than Four' if row['biz'] else 'Only One'
# Label every row with host_type, replace the two source flag columns with
# that single label column, and place it at column position 5.
host_list = merged_airbnb.apply(host_type, axis=1).rename('Host_listings')
merged_airbnb = merged_airbnb.drop(columns=['multi', 'biz'])
merged_airbnb.insert(5, 'Host_listings', host_list)
merged_airbnb.head()

Unnamed: 0,Price,room_type,room_shared,room_private,person_capacity,Host_listings,host_is_superhost,cleanliness_rating,guest_satisfaction_overall,bedrooms,...,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,sheet_name,City,Country,Weekday/Weekend
0,194.033698,Private room,False,True,2,Two to Four,False,10,93,1,...,78.690379,4.166708,98.253896,6.846473,4.90569,52.41772,amsterdam_weekdays,Amsterdam,Netherlands,Weekday
1,344.245776,Private room,False,True,4,Only One,False,8,85,1,...,631.176378,33.421209,837.280757,58.342928,4.90005,52.37432,amsterdam_weekdays,Amsterdam,Netherlands,Weekday
2,264.101422,Private room,False,True,2,More than Four,False,9,87,1,...,75.275877,3.985908,95.386955,6.6467,4.97512,52.36103,amsterdam_weekdays,Amsterdam,Netherlands,Weekday
3,433.529398,Private room,False,True,4,More than Four,False,9,90,2,...,493.272534,26.119108,875.033098,60.973565,4.89417,52.37663,amsterdam_weekdays,Amsterdam,Netherlands,Weekday
4,485.552926,Private room,False,True,2,Only One,True,10,98,1,...,552.830324,29.272733,815.30574,56.811677,4.90051,52.37508,amsterdam_weekdays,Amsterdam,Netherlands,Weekday


In [76]:
# Move the location / day-type descriptors to the very front of the frame
# (positions 0, 1 and 2), keeping every other column in its current order.
cols_to_move = ['Country', 'City', 'Weekday/Weekend']
cols = cols_to_move + [name for name in merged_airbnb.columns if name not in cols_to_move]

# Reorder, then drop the helper column the descriptors were derived from.
# errors='ignore' keeps the cell re-runnable once 'sheet_name' is already gone.
merged_airbnb = merged_airbnb[cols].drop(columns='sheet_name', errors='ignore')

merged_airbnb.head()


Unnamed: 0,Country,City,Weekday/Weekend,Price,room_type,room_shared,room_private,person_capacity,Host_listings,host_is_superhost,...,guest_satisfaction_overall,bedrooms,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat
0,Netherlands,Amsterdam,Weekday,194.033698,Private room,False,True,2,Two to Four,False,...,93,1,5.022964,2.53938,78.690379,4.166708,98.253896,6.846473,4.90569,52.41772
1,Netherlands,Amsterdam,Weekday,344.245776,Private room,False,True,4,Only One,False,...,85,1,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928,4.90005,52.37432
2,Netherlands,Amsterdam,Weekday,264.101422,Private room,False,True,2,More than Four,False,...,87,1,5.748312,3.651621,75.275877,3.985908,95.386955,6.6467,4.97512,52.36103
3,Netherlands,Amsterdam,Weekday,433.529398,Private room,False,True,4,More than Four,False,...,90,2,0.384862,0.439876,493.272534,26.119108,875.033098,60.973565,4.89417,52.37663
4,Netherlands,Amsterdam,Weekday,485.552926,Private room,False,True,2,Only One,True,...,98,1,0.544738,0.318693,552.830324,29.272733,815.30574,56.811677,4.90051,52.37508


In [77]:
# Round the numeric columns: floats to 2 decimal places, integer columns to 0.
# NOTE(review): lng/lat are float columns, so they are rounded to 2 decimals
# as well — confirm that precision is sufficient for any later geographic use.
numeric_cols = merged_airbnb.select_dtypes(include=['float', 'int']).columns
for col in numeric_cols:
    decimals = 2 if merged_airbnb[col].dtype == 'float' else 0
    merged_airbnb[col] = merged_airbnb[col].round(decimals)

merged_airbnb.head()

Unnamed: 0,Country,City,Weekday/Weekend,Price,room_type,room_shared,room_private,person_capacity,Host_listings,host_is_superhost,...,guest_satisfaction_overall,bedrooms,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat
0,Netherlands,Amsterdam,Weekday,194.03,Private room,False,True,2,Two to Four,False,...,93,1,5.02,2.54,78.69,4.17,98.25,6.85,4.91,52.42
1,Netherlands,Amsterdam,Weekday,344.25,Private room,False,True,4,Only One,False,...,85,1,0.49,0.24,631.18,33.42,837.28,58.34,4.9,52.37
2,Netherlands,Amsterdam,Weekday,264.1,Private room,False,True,2,More than Four,False,...,87,1,5.75,3.65,75.28,3.99,95.39,6.65,4.98,52.36
3,Netherlands,Amsterdam,Weekday,433.53,Private room,False,True,4,More than Four,False,...,90,2,0.38,0.44,493.27,26.12,875.03,60.97,4.89,52.38
4,Netherlands,Amsterdam,Weekday,485.55,Private room,False,True,2,Only One,True,...,98,1,0.54,0.32,552.83,29.27,815.31,56.81,4.9,52.38


## 4) Descriptive Statistics

In [78]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the DataFrame, rounded to 2 decimals
merged_airbnb.describe().round(2)

Unnamed: 0,Price,person_capacity,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat
count,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0,51707.0
mean,279.88,3.16,9.39,92.63,1.16,3.19,0.68,294.2,13.42,626.86,22.79,7.43,45.67
std,327.95,1.3,0.95,8.95,0.63,2.39,0.86,224.75,9.81,497.92,17.8,9.8,5.25
min,34.78,2.0,2.0,20.0,0.0,0.02,0.0,15.15,0.93,19.58,0.59,-9.23,37.95
25%,148.75,2.0,9.0,90.0,1.0,1.45,0.25,136.8,6.38,250.85,8.75,-0.07,41.4
50%,211.34,3.0,10.0,95.0,1.0,2.61,0.41,234.33,11.47,522.05,17.54,4.87,47.51
75%,319.69,4.0,10.0,99.0,1.0,4.26,0.74,385.76,17.42,832.63,32.97,13.52,51.47
max,18545.45,6.0,10.0,100.0,10.0,25.28,14.27,4513.56,100.0,6696.16,100.0,23.79,52.64


In [79]:
#Category Columns
category_columns = merged_airbnb.select_dtypes(include=['object']).columns
category_columns

Index(['Country', 'City', 'Weekday/Weekend', 'room_type', 'Host_listings'], dtype='object')

In [82]:

# For each categorical column collect: the non-null count, the number of
# distinct values (NaN counted as a value), and the most / least frequent
# value together with how often each occurs.
summary_rows = [
    'Count',
    'Number of unique values',
    'Most frequent value',
    'Most frequent value (frequency)',
    'Least frequent value',
    'Least frequent value (frequency)'
]

cat_summary = {}
for col in category_columns:
    values = merged_airbnb[col]
    vc = values.value_counts(dropna=False)
    cat_summary[col] = [
        values.count(), values.nunique(dropna=False),
        vc.idxmax(), vc.max(),
        vc.idxmin(), vc.min(),
    ]

# One column per categorical variable, one row per statistic.
cat_summary_table = pd.DataFrame(cat_summary, index=summary_rows)

cat_summary_table


Unnamed: 0,Country,City,Weekday/Weekend,room_type,Host_listings
Count,51707,51707,51707,51707,51707
Number of unique values,10,10,2,3,3
Most frequent value,United Kingdom,London,Weekend,Entire home/apt,Only One
Most frequent value (frequency),9993,9993,26207,32648,18534
Least frequent value,Netherlands,Amsterdam,Weekday,Shared room,Two to Four
Least frequent value (frequency),2080,2080,25500,366,15065
