In [115]:
import pandas as pd
import numpy as np

In [116]:
# Read the workbook straight from its Google Drive export link (xlsx format),
# so no local copy of the file is needed.
url = "https://docs.google.com/spreadsheets/d/1ecopK6oyyb4d_7-QLrCr8YlgFrCetHU7-VQfnYej7JY/export?format=xlsx"
airbnb = pd.ExcelFile(url)

# Show which sheets the workbook contains.
print(airbnb.sheet_names)

['amsterdam_weekdays', 'amsterdam_weekends', 'athens_weekdays', 'athens_weekends', 'berlin_weekends', 'berlin_weekdays', 'barcelona_weekdays', 'barcelona_weekends', 'budapest_weekdays', 'budapest_weekends', 'lisbon_weekdays', 'lisbon_weekends', 'london_weekdays', 'london_weekends', 'paris_weekdays', 'paris_weekends', 'rome_weekdays', 'rome_weekends', 'vienna_weekdays', 'vienna_weekends']


In [117]:
# Sanity check of the import: read a single sheet from the already opened
# workbook, then show its first rows and its (rows, columns) shape.
airbnbdf = pd.read_excel(
    airbnb,
    sheet_name="amsterdam_weekdays",
)
print(airbnbdf.head())
print(airbnbdf.shape)

   Unnamed: 0     realSum     room_type  room_shared  room_private  \
0           0  194.033698  Private room        False          True   
1           1  344.245776  Private room        False          True   
2           2  264.101422  Private room        False          True   
3           3  433.529398  Private room        False          True   
4           4  485.552926  Private room        False          True   

   person_capacity  host_is_superhost  multi  biz  cleanliness_rating  \
0                2              False      1    0                  10   
1                4              False      0    0                   8   
2                2              False      0    1                   9   
3                4              False      0    1                   9   
4                2               True      0    0                  10   

   guest_satisfaction_overall  bedrooms      dist  metro_dist  attr_index  \
0                          93         1  5.022964    2.539380  

In [118]:
# Merge all sheets into a single DataFrame with an additional column for sheet names.
# Each sheet is parsed from the open workbook and tagged with its own name in
# `sheet_name`, so the origin of every row stays recoverable after stacking.
# `ignore_index=True` already gives the result a fresh 0..n-1 index, so no
# separate reset_index call is needed afterwards.
merged_airbnbdf = pd.concat(
    [airbnb.parse(sheet).assign(sheet_name=sheet) for sheet in airbnb.sheet_names],
    ignore_index=True,
)

print(merged_airbnbdf.shape)
print(merged_airbnbdf.head())

(51707, 21)
   Unnamed: 0     realSum     room_type  room_shared  room_private  \
0           0  194.033698  Private room        False          True   
1           1  344.245776  Private room        False          True   
2           2  264.101422  Private room        False          True   
3           3  433.529398  Private room        False          True   
4           4  485.552926  Private room        False          True   

   person_capacity  host_is_superhost  multi  biz  cleanliness_rating  ...  \
0                2              False      1    0                  10  ...   
1                4              False      0    0                   8  ...   
2                2              False      0    1                   9  ...   
3                4              False      0    1                   9  ...   
4                2               True      0    0                  10  ...   

   bedrooms      dist  metro_dist  attr_index  attr_index_norm  rest_index  \
0         1  5.02296

Change the name of realSum to Price.
Remove the first column.
	- Advanced cleaning
Instead of having two columns of room_shared and room_private, create one with the respective categories.
Instead of the dummy variables (multi and biz), create one column that shows whether the host of the listing has only one, two to four, or more than four listings.


# Basic cleaning
(code only for changed / optimized snippets)


In [119]:
# OPTIMIZED
# Create a column for city, country, and weekday/weekend -- this cell derives the city.
#
# Sheet names follow the pattern '<city>_<weekdays|weekends>'. Splitting once on
# the last underscore and keeping the left part avoids relying on the suffix
# having a fixed number of characters.
merged_airbnbdf['city'] = (
    merged_airbnbdf['sheet_name']
    .str.rsplit('_', n=1)
    .str[0]
    .str.capitalize()
)

In [121]:
# OPTIMIZED
# Derive the weekday/weekend label from the sheet name.
#
# Sheet names follow the pattern '<city>_<weekdays|weekends>'. Splitting once on
# the last underscore and keeping the right part avoids relying on the suffix
# having a fixed number of characters.
merged_airbnbdf['day_type'] = (
    merged_airbnbdf['sheet_name']
    .str.rsplit('_', n=1)
    .str[-1]
    .str.capitalize()
)

# check
merged_airbnbdf[['sheet_name', 'day_type']].head()

Unnamed: 0,sheet_name,day_type
0,amsterdam_weekdays,Weekdays
1,amsterdam_weekdays,Weekdays
2,amsterdam_weekdays,Weekdays
3,amsterdam_weekdays,Weekdays
4,amsterdam_weekdays,Weekdays


In [122]:
# ALTERNATIVE as it might be easier to comprehend this way; however, your proposal can be kept, @Assad

# Collapse the two boolean flags into one label column in a single step.
# The conditions are evaluated in order and the first match wins, so a row
# flagged as shared is labelled 'Shared' even if it is also flagged private;
# rows with neither flag set fall through to 'Other'.
merged_airbnbdf['room_category'] = np.select(
    [
        merged_airbnbdf['room_shared'] == True,
        merged_airbnbdf['room_private'] == True,
    ],
    ['Shared', 'Private'],
    default='Other',
)


# the rest can remain as it is:
# Drop the 3rd and 4th columns (index 2 and 3)
#...

# check
merged_airbnbdf['room_category'].describe()

count     51707
unique        3
top       Other
freq      32648
Name: room_category, dtype: object

In [123]:
# a similar alternative approach can be thought of for the task "Create 'listings by host' based on 'multi' and 'biz' columns"