In [None]:
import pandas as pd
import numpy as np

# Google Sheets export link; the '?format=xlsx' query makes it serve the
# workbook as an .xlsx file that pandas can read directly.
url = 'https://docs.google.com/spreadsheets/d/1ecopK6oyyb4d_7-QLrCr8YlgFrCetHU7-VQfnYej7JY/export?format=xlsx'
# Open the workbook once as an ExcelFile so the cells below can read
# individual sheets from the same object.
airbnb = pd.ExcelFile(url)

# List all sheets (one sheet per '<city>_<weekdays|weekends>' combination)
print(airbnb.sheet_names)

['amsterdam_weekdays', 'amsterdam_weekends', 'athens_weekdays', 'athens_weekends', 'berlin_weekends', 'berlin_weekdays', 'barcelona_weekdays', 'barcelona_weekends', 'budapest_weekdays', 'budapest_weekends', 'lisbon_weekdays', 'lisbon_weekends', 'london_weekdays', 'london_weekends', 'paris_weekdays', 'paris_weekends', 'rome_weekdays', 'rome_weekends', 'vienna_weekdays', 'vienna_weekends']


In [None]:
# Peek at a single sheet of the already-opened workbook to see what the
# columns look like before merging everything.
airbnbdf = pd.read_excel(io=airbnb, sheet_name='amsterdam_weekdays')

# First rows, then (rows, columns) -- each on its own line
print(airbnbdf.head(), airbnbdf.shape, sep='\n')

   Unnamed: 0     realSum     room_type  room_shared  room_private  \
0           0  194.033698  Private room        False          True   
1           1  344.245776  Private room        False          True   
2           2  264.101422  Private room        False          True   
3           3  433.529398  Private room        False          True   
4           4  485.552926  Private room        False          True   

   person_capacity  host_is_superhost  multi  biz  cleanliness_rating  \
0                2              False      1    0                  10   
1                4              False      0    0                   8   
2                2              False      0    1                   9   
3                4              False      0    1                   9   
4                2               True      0    0                  10   

   guest_satisfaction_overall  bedrooms      dist  metro_dist  attr_index  \
0                          93         1  5.022964    2.539380  

In [None]:
# Merge all sheets into a single DataFrame. Each sheet is parsed and tagged
# with a 'sheet_name' column holding the name of the sheet it came from.
# ignore_index=True already gives the combined frame a fresh 0..n-1 index,
# so no separate reset_index() call is needed afterwards.
merged_airbnbdf = pd.concat(
    [airbnb.parse(sheet).assign(sheet_name=sheet) for sheet in airbnb.sheet_names],
    ignore_index=True
)

print(merged_airbnbdf.shape)
print(merged_airbnbdf.head())

(51707, 21)
   Unnamed: 0     realSum     room_type  room_shared  room_private  \
0           0  194.033698  Private room        False          True   
1           1  344.245776  Private room        False          True   
2           2  264.101422  Private room        False          True   
3           3  433.529398  Private room        False          True   
4           4  485.552926  Private room        False          True   

   person_capacity  host_is_superhost  multi  biz  cleanliness_rating  ...  \
0                2              False      1    0                  10  ...   
1                4              False      0    0                   8  ...   
2                2              False      0    1                   9  ...   
3                4              False      0    1                   9  ...   
4                2               True      0    0                  10  ...   

   bedrooms      dist  metro_dist  attr_index  attr_index_norm  rest_index  \
0         1  5.02296

In [64]:
# Sheet names look like '<city>_<daytype>' (e.g. 'amsterdam_weekdays').
# Split them once and reuse the pieces for both derived columns.
sheet_name_parts = merged_airbnbdf['sheet_name'].str.split('_')

# First piece -> capitalised city name
merged_airbnbdf['city'] = sheet_name_parts.str[0].str.capitalize()

# Define a mapping of city to country
city_to_country = {
    'Amsterdam': 'Netherlands',
    'Athens': 'Greece',
    'Berlin': 'Germany',
    'Barcelona': 'Spain',
    'Budapest': 'Hungary',
    'Lisbon': 'Portugal',
    'London': 'United Kingdom',
    'Paris': 'France',
    'Rome': 'Italy',
    'Vienna': 'Austria'
}

# Map city to country
merged_airbnbdf['country'] = merged_airbnbdf['city'].map(city_to_country)

# Series.map leaves NaN for any city that is missing from the dictionary.
# Fail here, with the offending names, instead of carrying silent NaNs forward.
unmapped_cities = sorted(
    merged_airbnbdf.loc[merged_airbnbdf['country'].isna(), 'city'].dropna().unique()
)
assert not unmapped_cities, f"No country mapping for cities: {unmapped_cities}"

# Second piece -> capitalised day type ('Weekdays' / 'Weekends')
merged_airbnbdf['day_type'] = sheet_name_parts.str[1].str.capitalize()

# Print the updated DataFrame
print(merged_airbnbdf.head())
print(merged_airbnbdf.shape)



   Unnamed: 0     realSum     room_type  room_shared  room_private  \
0           0  194.033698  Private room        False          True   
1           1  344.245776  Private room        False          True   
2           2  264.101422  Private room        False          True   
3           3  433.529398  Private room        False          True   
4           4  485.552926  Private room        False          True   

   person_capacity  host_is_superhost  multi  biz  cleanliness_rating  ...  \
0                2              False      1    0                  10  ...   
1                4              False      0    0                   8  ...   
2                2              False      0    1                   9  ...   
3                4              False      0    1                   9  ...   
4                2               True      0    0                  10  ...   

   attr_index  attr_index_norm  rest_index  rest_index_norm      lng  \
0   78.690379         4.166708   98.25

In [65]:
# Rename the 'realSum' column to 'Price' (in place), then list the column
# labels to confirm the change.
merged_airbnbdf.rename({'realSum': 'Price'}, axis='columns', inplace=True)
print(merged_airbnbdf.columns)

Index(['Unnamed: 0', 'Price', 'room_type', 'room_shared', 'room_private',
       'person_capacity', 'host_is_superhost', 'multi', 'biz',
       'cleanliness_rating', 'guest_satisfaction_overall', 'bedrooms', 'dist',
       'metro_dist', 'attr_index', 'attr_index_norm', 'rest_index',
       'rest_index_norm', 'lng', 'lat', 'sheet_name', 'city', 'country',
       'day_type'],
      dtype='object')


In [66]:
# Remove the 'Unnamed: 0' column in place (it appears to be a leftover row
# counter from the source sheets), then list the remaining columns.
merged_airbnbdf.drop(columns=['Unnamed: 0'], inplace=True)
print(merged_airbnbdf.columns)

Index(['Price', 'room_type', 'room_shared', 'room_private', 'person_capacity',
       'host_is_superhost', 'multi', 'biz', 'cleanliness_rating',
       'guest_satisfaction_overall', 'bedrooms', 'dist', 'metro_dist',
       'attr_index', 'attr_index_norm', 'rest_index', 'rest_index_norm', 'lng',
       'lat', 'sheet_name', 'city', 'country', 'day_type'],
      dtype='object')


In [67]:
# Collapse the two room flags into a single label.
# np.select returns the choice for the FIRST matching condition, so a row with
# room_shared set is labelled 'Shared' before room_private is considered; rows
# matching neither condition get 'Other'.
room_category = np.select(
    [
        merged_airbnbdf['room_shared'] == True,
        merged_airbnbdf['room_private'] == True
    ],
    ['Shared', 'Private'],
    default='Other'
)

# Drop the two source flags by NAME rather than by position: positional
# indices silently remove the wrong columns if the column order differs
# (for example when an earlier drop cell was not run).
merged_airbnbdf = merged_airbnbdf.drop(columns=['room_shared', 'room_private'])

# Place 'room_category' as the 3rd column (index 2). Inserting directly avoids
# re-selecting the frame with df[cols], which can later cause
# SettingWithCopyWarning when new columns are assigned.
merged_airbnbdf.insert(2, 'room_category', room_category)

# Print the updated DataFrame
print(merged_airbnbdf.head())
print(merged_airbnbdf.shape)
print(merged_airbnbdf.columns)

        Price     room_type room_category  person_capacity  host_is_superhost  \
0  194.033698  Private room       Private                2              False   
1  344.245776  Private room       Private                4              False   
2  264.101422  Private room       Private                2              False   
3  433.529398  Private room       Private                4              False   
4  485.552926  Private room       Private                2               True   

   multi  biz  cleanliness_rating  guest_satisfaction_overall  bedrooms  ...  \
0      1    0                  10                          93         1  ...   
1      0    0                   8                          85         1  ...   
2      0    1                   9                          87         1  ...   
3      0    1                   9                          90         2  ...   
4      0    0                  10                          98         1  ...   

   attr_index  attr_index_norm  

In [None]:
# Derive a 'listings by host' label from the 'multi' and 'biz' columns.
# np.select returns the choice for the FIRST matching condition:
#   multi == 0 and biz == 0 -> '1'
#   multi == 1              -> '2-4' (regardless of biz)
#   biz == 1                -> '4+'
# Any other combination of values falls through to 'Unknown'.
# NOTE(review): the labels '2-4' and '4+' overlap at 4 -- confirm the intended
# bucket boundaries.
listings_by_host = np.select(
    [
        (merged_airbnbdf['multi'] == 0) & (merged_airbnbdf['biz'] == 0),
        merged_airbnbdf['multi'] == 1,
        merged_airbnbdf['biz'] == 1
    ],
    [
        '1',
        '2-4',
        '4+'
    ],
    default='Unknown'
)

# Drop the two source columns by NAME rather than by position: positional
# indices silently remove the wrong columns if the column order differs.
merged_airbnbdf = merged_airbnbdf.drop(columns=['multi', 'biz'])

# Place 'listings by host' at column index 5. The labels are computed first and
# inserted into the freshly returned frame, so nothing is assigned onto a
# column subset produced by an earlier cell (avoids SettingWithCopyWarning).
merged_airbnbdf.insert(5, 'listings by host', listings_by_host)


# Check result
print(merged_airbnbdf.head())
print(merged_airbnbdf.shape)
print(merged_airbnbdf.columns)

        Price     room_type room_category  person_capacity  host_is_superhost  \
0  194.033698  Private room       Private                2              False   
1  344.245776  Private room       Private                4              False   
2  264.101422  Private room       Private                2              False   
3  433.529398  Private room       Private                4              False   
4  485.552926  Private room       Private                2               True   

  listings by host  cleanliness_rating  guest_satisfaction_overall  bedrooms  \
0              2-4                  10                          93         1   
1                1                   8                          85         1   
2               4+                   9                          87         1   
3               4+                   9                          90         2   
4                1                  10                          98         1   

       dist  ...  attr_index  at

In [69]:
# Bring the identifying columns to the front, in this order, keeping every
# other column in its current relative position.
leading_columns = ['sheet_name', 'country', 'city', 'day_type']
cols = list(merged_airbnbdf.columns)
for target_position, column_name in enumerate(leading_columns):
    cols.insert(target_position, cols.pop(cols.index(column_name)))

merged_airbnbdf = merged_airbnbdf[cols]

# 'dist' becomes 'citycenter_dist'
merged_airbnbdf.rename({'dist': 'citycenter_dist'}, axis='columns', inplace=True)

# Normalise every column label: lowercase, with spaces turned into underscores
normalized_labels = merged_airbnbdf.columns.str.lower().str.replace(' ', '_')
merged_airbnbdf.columns = normalized_labels

# Check result: first rows, shape, then the final column labels
for summary in (merged_airbnbdf.head(), merged_airbnbdf.shape, merged_airbnbdf.columns):
    print(summary)



           sheet_name      country       city  day_type       price  \
0  amsterdam_weekdays  Netherlands  Amsterdam  Weekdays  194.033698   
1  amsterdam_weekdays  Netherlands  Amsterdam  Weekdays  344.245776   
2  amsterdam_weekdays  Netherlands  Amsterdam  Weekdays  264.101422   
3  amsterdam_weekdays  Netherlands  Amsterdam  Weekdays  433.529398   
4  amsterdam_weekdays  Netherlands  Amsterdam  Weekdays  485.552926   

      room_type room_category  person_capacity  host_is_superhost  \
0  Private room       Private                2              False   
1  Private room       Private                4              False   
2  Private room       Private                2              False   
3  Private room       Private                4              False   
4  Private room       Private                2               True   

  listings_by_host  ...  guest_satisfaction_overall  bedrooms  \
0              2-4  ...                          93         1   
1                1  ...     

In [70]:
# Check for duplicate rows: duplicated() marks each row that repeats an
# earlier row across all columns, and any() is True if at least one row is
# marked. As the last expression of the cell, the result is displayed.
merged_airbnbdf.duplicated().any()


False

In [71]:
# Final check after data cleaning: info() prints the index range, each
# column's non-null count and dtype, so missing values or unexpected types
# are easy to spot.
merged_airbnbdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51707 entries, 0 to 51706
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   sheet_name                  51707 non-null  object 
 1   country                     51707 non-null  object 
 2   city                        51707 non-null  object 
 3   day_type                    51707 non-null  object 
 4   price                       51707 non-null  float64
 5   room_type                   51707 non-null  object 
 6   room_category               51707 non-null  object 
 7   person_capacity             51707 non-null  int64  
 8   host_is_superhost           51707 non-null  bool   
 9   listings_by_host            51707 non-null  object 
 10  cleanliness_rating          51707 non-null  int64  
 11  guest_satisfaction_overall  51707 non-null  int64  
 12  bedrooms                    51707 non-null  int64  
 13  citycenter_dist             517