### Data Loading & Merging

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import plotly
import warnings
# Notebook-wide settings: hide all warnings and let pandas display every column.
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None


# Open the Google Sheets workbook (exported as .xlsx) once; individual sheets are parsed later.
url = 'https://docs.google.com/spreadsheets/d/1ecopK6oyyb4d_7-QLrCr8YlgFrCetHU7-VQfnYej7JY/export?format=xlsx'
airbnb = pd.ExcelFile(url)

# Show which sheets the workbook contains
print(airbnb.sheet_names)

In [None]:
# Preview one sheet ('amsterdam_weekdays') before merging the whole workbook.
airbnbdf = airbnb.parse(sheet_name='amsterdam_weekdays')
print(airbnbdf.head())
print(airbnbdf.shape)

In [None]:
# Read every sheet, tag its rows with the sheet they came from, then stack all sheets.
sheet_frames = []
for sheet in airbnb.sheet_names:
    sheet_frame = airbnb.parse(sheet)
    sheet_frame['sheet_name'] = sheet
    sheet_frames.append(sheet_frame)

merged_airbnbdf = pd.concat(sheet_frames, ignore_index=True)

# Make sure the merged frame carries a clean 0..n-1 index
merged_airbnbdf = merged_airbnbdf.reset_index(drop=True)

print(merged_airbnbdf.shape)
print(merged_airbnbdf.head())

### Basic Data Cleansing

In [None]:
# Sheet names are split on '_' (e.g. 'amsterdam_weekdays'): part 0 is the city, part 1 the day type.
name_parts = merged_airbnbdf['sheet_name'].str.split('_')

# City name, capitalized
merged_airbnbdf['city'] = name_parts.str[0].str.capitalize()

# Lookup table from city to country
city_to_country = {
    'Amsterdam': 'Netherlands',
    'Athens': 'Greece',
    'Berlin': 'Germany',
    'Barcelona': 'Spain',
    'Budapest': 'Hungary',
    'Lisbon': 'Portugal',
    'London': 'United Kingdom',
    'Paris': 'France',
    'Rome': 'Italy',
    'Vienna': 'Austria',
}

# Cities missing from the lookup end up as NaN in 'country'
merged_airbnbdf['country'] = merged_airbnbdf['city'].map(city_to_country)

# Day type, capitalized
merged_airbnbdf['day_type'] = name_parts.str[1].str.capitalize()


# Show the result
print(merged_airbnbdf.head())
print(merged_airbnbdf.shape)



In [None]:
# Rename 'realSum' to 'Price' and list the resulting columns
merged_airbnbdf = merged_airbnbdf.rename(columns={'realSum': 'Price'})
print(merged_airbnbdf.columns)

In [None]:
# Remove the 'Unnamed: 0' column and list what is left
merged_airbnbdf = merged_airbnbdf.drop(columns=['Unnamed: 0'])
print(merged_airbnbdf.columns)

### Advanced Cleaning

In [None]:
# Create 'room_category' based on 'room_shared' and 'room_private'

# Start from the fallback label, then overwrite where a flag is set.
# 'Shared' is assigned last, so it wins if both flags are True.
merged_airbnbdf['room_category'] = 'Other'
merged_airbnbdf.loc[merged_airbnbdf['room_private'] == True, 'room_category'] = 'Private'
merged_airbnbdf.loc[merged_airbnbdf['room_shared'] == True, 'room_category'] = 'Shared'

# Drop the two source flags by NAME rather than by position, so a change in
# column order can never remove unrelated columns.
merged_airbnbdf = merged_airbnbdf.drop(columns=['room_shared', 'room_private'])

# Move 'room_category' to 3rd position column
cols = list(merged_airbnbdf.columns)
cols.insert(2, cols.pop(cols.index('room_category')))
merged_airbnbdf = merged_airbnbdf[cols]

# Print the updated DataFrame
print(merged_airbnbdf.head())
print(merged_airbnbdf.shape)
print(merged_airbnbdf.columns)


In [None]:
# Create 'listings by host' based on 'multi' and 'biz' columns.
# np.select takes the first matching condition, so 'multi' wins over 'biz'
# when both are 1; rows matching no condition get 'Unknown'.
merged_airbnbdf['listings by host'] = np.select(
    [
        (merged_airbnbdf['multi'] == 0) & (merged_airbnbdf['biz'] == 0),
        merged_airbnbdf['multi'] == 1,
        merged_airbnbdf['biz'] == 1
    ],
    [
        '1',
        '2-4',
        '4+'
    ],
    default='Unknown'
)

# Drop the two source flags by NAME rather than by position, so a change in
# column order can never remove unrelated columns.
merged_airbnbdf = merged_airbnbdf.drop(columns=['multi', 'biz'])

# Move 'listings by host' to index 5 of the column list
cols = list(merged_airbnbdf.columns)
cols.insert(5, cols.pop(cols.index('listings by host')))
merged_airbnbdf = merged_airbnbdf[cols]


# Check result
print(merged_airbnbdf.head())
print(merged_airbnbdf.shape)
print(merged_airbnbdf.columns)

In [None]:
# Bring the identifying columns to the front; all other columns keep their current order
leading_cols = ['sheet_name', 'country', 'city', 'day_type']
cols = leading_cols + [name for name in merged_airbnbdf.columns if name not in leading_cols]

merged_airbnbdf = merged_airbnbdf[cols]

# 'dist' becomes 'citycenter_dist'
merged_airbnbdf = merged_airbnbdf.rename(columns={'dist': 'citycenter_dist'})

# Normalize column names: lowercase first, then spaces -> underscores
lowercased = merged_airbnbdf.columns.str.lower()
merged_airbnbdf.columns = lowercased.str.replace(' ', '_')

# Check result
print(merged_airbnbdf.head())
print(merged_airbnbdf.shape)
print(merged_airbnbdf.columns)



In [None]:
# Check for duplicate rows: True when at least one row repeats an earlier one
has_duplicates = merged_airbnbdf.duplicated().sum() > 0
has_duplicates


### Basic Initial Inspection

In [None]:
# First row of the cleaned DataFrame (positional slice, same as head(1))
merged_airbnbdf.iloc[:1]

In [None]:
# Last row of the cleaned DataFrame (positional slice, same as tail(1))
merged_airbnbdf.iloc[-1:]

In [None]:
# Reproducible random sample of 7 rows (fixed seed) from the cleaned DataFrame
random_sample = merged_airbnbdf.sample(n=7, random_state=42)
random_sample

In [None]:
# Check the data types and non-null counts of every column
merged_airbnbdf.info()

In [None]:
# Descriptive statistics of the DataFrame, rounded to two decimals for display
numeric_summary = merged_airbnbdf.describe()
numeric_summary.round(2)

In [None]:
# Names of the columns stored with object dtype (treated as categorical below)
object_frame = merged_airbnbdf.select_dtypes(include='object')
category_columns = object_frame.columns
category_columns

In [None]:

# Row labels of the summary table, in the order the values are produced below
summary_labels = [
    'Count',
    'Number of unique values',
    'Most frequent value',
    'Most frequent value (frequency)',
    'Least frequent value',
    'Least frequent value (frequency)',
]


def _summarize_category(values):
    """Return the six summary values for one column, ordered like summary_labels.

    Frequencies and the unique count treat NaN as a value of its own;
    'Count' is the number of non-null entries.
    """
    frequencies = values.value_counts(dropna=False)
    return [
        values.count(),
        values.nunique(dropna=False),
        frequencies.idxmax(),
        frequencies.max(),
        frequencies.idxmin(),
        frequencies.min(),
    ]


# One list of summary values per categorical column
cat_summary = {
    col: _summarize_category(merged_airbnbdf[col])
    for col in category_columns
}

cat_summary_table = pd.DataFrame(cat_summary, index=summary_labels)

cat_summary_table

### Exploratory Questions

In [None]:
#Q1. How many listings are in each city in total and also per type of day?

# Number of rows (listings) per city
listing_per_city = merged_airbnbdf.groupby('city').size()
listing_per_city.rename('Total listings per city').to_frame()


In [None]:
#Q1. How many listings are in each city per type of day?

# Number of rows (listings) per (city, day type) combination
grouping_keys = ['city', 'day_type']
listing_per_city_daytype = merged_airbnbdf.groupby(grouping_keys).size()
listing_per_city_daytype.rename('Total listings per city and day type').to_frame()



In [None]:
#Q2. Which city has the biggest proportion of superhosts?

# A raw superhost count favours cities that simply have more listings, so the
# question is answered with the SHARE of superhost listings per city.
# Casting the flag to 0/1 makes sum() the count and mean() the proportion.
is_superhost = merged_airbnbdf["host_is_superhost"].astype(int)

superhost = (
    is_superhost
    .groupby(merged_airbnbdf['city'])
    .agg(Superhosts_per_city='sum', Superhost_proportion='mean')
    .sort_values(by='Superhost_proportion', ascending=False)
)

print("The city which has the biggest proportion of superhosts is:", superhost['Superhost_proportion'].idxmax())
superhost

In [None]:
#Q3. Which cities have listings with more than four rooms?

# "More than four" is a strict comparison; listings with exactly 4 bedrooms are excluded.
has_more_than_four = merged_airbnbdf["bedrooms"] > 4

# Print cities with bedrooms > 4
print(
    "Cities with listings having more than 4 bedrooms:",
    ', '.join(merged_airbnbdf.loc[has_more_than_four, "city"].unique())
)
# Count of listings with more than 4 bedrooms per city
merged_airbnbdf.loc[has_more_than_four, ['city', 'bedrooms']].groupby('city').size()


In [None]:
#Q4. Which city has the most entire home/apt type listings?

# Keep only 'Entire home/apt' rows, count them per city, and sort by that count (largest first)
is_entire_home = merged_airbnbdf["room_type"] == "Entire home/apt"
entire_home_listings = merged_airbnbdf[is_entire_home]
entire_home_counts = entire_home_listings.groupby("city").size().to_frame(name="Entire_home_count")
entire_home_counts = entire_home_counts.sort_values(by="Entire_home_count", ascending=False)

print(entire_home_counts)

# After the descending sort, the first index entry is the city with the highest count
top_city = entire_home_counts.index[0]
print(f"\nThe city with the most entire home/apt listings is: {top_city}")


In [None]:
#Q5. Are ratings typically high across listings, or is there a wide variation? 
#a. Plot the distribution of guest_satisfaction_overall ratings.

# Histogram of guest_satisfaction_overall.
plt.hist(merged_airbnbdf['guest_satisfaction_overall'], bins=10, color='skyblue', edgecolor='black')
plt.title('Distribution of Guest Satisfaction Ratings')
plt.xlabel('Guest Satisfaction Rating')
plt.ylabel('Number of Listings')
plt.show()

# Density plot of guest_satisfaction_overall.
# `fill=True` replaces the deprecated `shade=True` and matches the KDE plots in Q8.
sns.kdeplot(merged_airbnbdf['guest_satisfaction_overall'], fill=True, color='skyblue')
plt.title('Density of Guest Satisfaction Ratings')
plt.xlabel('Guest Satisfaction Rating')
plt.ylabel('Density')
plt.show()

# Quick statistics for guest_satisfaction_overall.
# Printed explicitly: a bare expression in the middle of a cell is never displayed.
print(merged_airbnbdf['guest_satisfaction_overall'].describe())

# Print Analysis
print("""
Analysis: Most Airbnb listings have high guest satisfaction ratings, with the bulk of listings 
clustered between 90 and 100. The distribution is skewed toward the top, with very few low-rated listings. 
This shows that guests are generally very satisfied across listings.
""")


In [None]:
#b. Examine the distribution of cleanliness_rating.

# Histogram of cleanliness_rating.
plt.hist(merged_airbnbdf['cleanliness_rating'], bins=20, color='lightgreen', edgecolor='black')
plt.title('Distribution of Cleanliness Ratings')
plt.xlabel('Cleanliness Rating')
plt.ylabel('Number of Listings')
plt.show()

# Density plot of cleanliness_rating.
# `fill=True` replaces the deprecated `shade=True` and matches the KDE plots in Q8.
sns.kdeplot(merged_airbnbdf['cleanliness_rating'], fill=True, color='lightgreen', bw_adjust=0.5)
plt.title('Density of Cleanliness Ratings')
plt.xlabel('Cleanliness Rating')
plt.ylabel('Density')
plt.show()

# Quick statistics for cleanliness_rating.
# Printed explicitly: a bare expression in the middle of a cell is never displayed.
print(merged_airbnbdf['cleanliness_rating'].describe())

# Print Analysis:
print("""
    Analysis: The cleanliness ratings are extremely high and consistent across listings. 
    Most listings score 9 or 10, showing that hosts maintain a very high standard of cleanliness. 
    Also, there is no major variation, indicating that cleanliness is uniformly excellent across 
    the dataset.  
      """
      )




In [None]:
#Q6. How does person_capacity vary across listings? What is the most common capacity of listings?

# Count plot of person_capacity.
plt.figure(figsize=(8,5))
sns.countplot(x='person_capacity', data=merged_airbnbdf, color='yellowgreen')
plt.title('Distribution of Person Capacity Across Listings')
plt.xlabel('Person Capacity')
plt.ylabel('Number of Listings')
plt.show()


# Most common capacity: mode() returns every tied value in ascending order, [0] takes the smallest.
most_common_capacity = merged_airbnbdf['person_capacity'].mode()[0]
# Report the computed value so the written analysis below can be checked against the data.
print("Most common person capacity:", most_common_capacity)
print("""
The majority of AirBnb listings in the dataset are designed for 2 to 4 guests, with 2-person listings
being the most common. Listings for larger groups of 5 or 6 people are relatively rare. This suggests 
that the market is skewed toward small to medium-sized groups, likely reflecting typical traveler demand.
""")



In [None]:
# Q7. Plot the distribution of price (former realSum) for both weekday and weekend offers 
# a. check normality, skewness, modality

from scipy.stats import skew, kurtosis, shapiro

# Upper bound on the number of listings passed to the Shapiro-Wilk test.
SHAPIRO_SAMPLE_SIZE = 5000

# Separate prices for weekdays and weekends
weekday_prices = merged_airbnbdf.loc[merged_airbnbdf['day_type'] == 'Weekdays', 'price']
weekend_prices = merged_airbnbdf.loc[merged_airbnbdf['day_type'] == 'Weekends', 'price']

# Random sample for the normality test. The size is capped at the number of
# available rows, so the cell does not raise when a day type has fewer than
# SHAPIRO_SAMPLE_SIZE listings.
weekday_sample = weekday_prices.sample(min(SHAPIRO_SAMPLE_SIZE, len(weekday_prices)), random_state=1)
weekend_sample = weekend_prices.sample(min(SHAPIRO_SAMPLE_SIZE, len(weekend_prices)), random_state=1)

# Plot for weekdays (prices of 5000 and above are left out of the histogram only)
sns.histplot(weekday_prices[weekday_prices < 5000], bins=30, kde=True, color='skyblue')
plt.title("Distribution of Weekday Prices")
plt.xlabel("Price")
plt.ylabel("Number of Listings")
plt.show()

# Skewness and kurtosis use ALL weekday prices, including those hidden from the plot
print("Weekday Prices Skewness:", skew(weekday_prices))
print("Weekday Prices Kurtosis:", kurtosis(weekday_prices))
print("Weekday Prices Modality: Unimodal (The histrogram shows a single peak)")

# Shapiro-Wilk test for Weekday prices
stat_w, p_w = shapiro(weekday_sample)
print("Weekday Prices Shapiro-Wilk Test p-value:", p_w)
if p_w > 0.05:
    print("Weekday prices: Data looks normal")
else:
    print("Weekday prices: Data is not normal")


#  Plot for weekends (same histogram cut-off as for weekdays)
sns.histplot(weekend_prices[weekend_prices < 5000], bins=30, kde=True, color='salmon')
plt.title("Distribution of Weekend Prices")
plt.xlabel("Price")
plt.ylabel("Number of Listings")
plt.show()

print("Weekend Prices Skewness:", skew(weekend_prices))
print("Weekend Prices Kurtosis:", kurtosis(weekend_prices))
print("Weekend Prices Modality: Unimodal (The histrogram shows a single peak)")

# Shapiro-Wilk test for Weekend prices
stat_we, p_we = shapiro(weekend_sample)
print("Weekend Prices Shapiro-Wilk Test p-value:", p_we)
if p_we > 0.05:
    print("Weekend prices: Data looks normal")
else:
    print("Weekend prices: Data is not normal")

# Print Analysis
print("""
Analysis: The distribution of Airbnb prices for both weekdays and weekends is highly right-skewed, 
with skewness values of 24.1 for weekdays and 16.6 for weekends. Both distributions have 
extremely high kurtosis (939 for weekdays, 566 for weekends), indicating heavy tails and 
the presence of extreme outliers. The histograms show a unimodal distribution, with a single 
peak at lower price ranges. The Shapiro-Wilk test on a sample of 5,000 listings confirms that 
the price distributions are not normal (p-values << 0.05). Overall, most listings are clustered
at lower prices, but a few extremely high-priced listings create a long right tail.
      """)

In [None]:
#b. If skewed, consider using transformations (e.g. log); can you normalize it?

# Upper bound on the number of listings passed to the Shapiro-Wilk test.
LOG_SHAPIRO_SAMPLE_SIZE = 5000

# Log transformation (natural log).
# NOTE(review): assumes every price is strictly positive; a zero or negative
# price would produce -inf/NaN here -- confirm against the data.
weekday_prices_log = np.log(weekday_prices)
weekend_prices_log = np.log(weekend_prices)

# Random sample from the log-transformed prices. The size is capped at the number
# of available rows, so the cell does not raise when a day type has fewer
# listings than the cap.
weekday_sample_log = weekday_prices_log.sample(
    min(LOG_SHAPIRO_SAMPLE_SIZE, len(weekday_prices_log)), random_state=1
)
weekend_sample_log = weekend_prices_log.sample(
    min(LOG_SHAPIRO_SAMPLE_SIZE, len(weekend_prices_log)), random_state=1
)


# Plot transformed distributions
sns.histplot(weekday_prices_log, bins=30, kde=True, color='skyblue')
plt.title("Log-Transformed Distribution of Weekday Prices")
plt.xlabel("Log(Price)")
plt.ylabel("Number of Listings")
plt.show()

print("Weekday Prices Skewness:", skew(weekday_prices_log))
print("Weekday Prices Kurtosis:", kurtosis(weekday_prices_log))
print("Weekday Prices Modality: Unimodal (The histrogram shows a single peak)")

# Shapiro-Wilk test
stat, p = shapiro(weekday_sample_log)
print("Weekday Prices (log) Shapiro-Wilk Test p-value:", p)
if p > 0.05:
    print("Weekday prices (log): Data looks approximately normal")
else:
    print("Weekday prices (log): Data is not normal")


sns.histplot(weekend_prices_log, bins=30, kde=True, color='salmon')
plt.title("Log-Transformed Distribution of Weekend Prices")
plt.xlabel("Log(Price)")
plt.ylabel("Number of Listings")
plt.show()

print("Weekend Prices Skewness:", skew(weekend_prices_log))
print("Weekend Prices Kurtosis:", kurtosis(weekend_prices_log))
print("Weekend Prices Modality: Unimodal (The histrogram shows a single peak)")
# Shapiro-Wilk test
stat, p = shapiro(weekend_sample_log)
print("Weekend Prices (log) Shapiro-Wilk Test p-value:", p)
if p > 0.05:
    print("Weekend prices (log): Data looks approximately normal")
else:
    print("Weekend prices (log): Data is not normal")


# Print Analysis
print("""
Analysis: The log-transformed distribution of Airbnb prices for both weekdays and weekends 
is much closer to a symmetric shape, with skewness reduced to around 0.76 for weekdays and 0.66 for 
weekends, and lower kurtosis values (1.52 for weekdays, 1.10 for weekends). Both distributions 
remain unimodal, with a single peak visible in the histograms. While the Shapiro-Wilk test still 
indicates non-normality (p-values << 0.05), the log transformation effectively reduces the influence 
of extreme high-priced listings.
        """)



In [None]:
#Q.8 Is there a difference in price per night between two cities? Choose at least three pairs of cities during weekdays.
from scipy.stats import ttest_ind

# Filter weekday listings
weekday_df = merged_airbnbdf[merged_airbnbdf['day_type'] == 'Weekdays']

# City pairs
city_pairs = [('Amsterdam', 'Berlin'), ('Paris', 'London'), ('Rome', 'Barcelona')]

# Loop through each pair
for city1, city2 in city_pairs:
    prices1 = weekday_df[weekday_df['city'] == city1]['price']
    prices2 = weekday_df[weekday_df['city'] == city2]['price']

    # Plot KDE for the pair
    sns.kdeplot(prices1, label=city1, fill=True)
    sns.kdeplot(prices2, label=city2, fill=True)
    plt.title(f"Weekday Prices: {city1} vs {city2}")
    plt.xlabel("Price")
    plt.ylabel("Density")
    plt.legend()
    plt.show()

    # Perform Welch's t-test (equal_var=False: no equal-variance assumption)
    stat, p = ttest_ind(prices1, prices2, equal_var=False)
    # Three significant digits instead of two decimals: '.2f' prints every small
    # p-value as 0.00 and cannot separate values just below/above 0.05.
    print(f"{city1} vs {city2}: t-test p-value = {p:.3g}")

# Print Analysis
print("""
Analysis: The t-test results indicate that there are statistically significant differences in weekday 
prices between all three pairs of cities (Amsterdam vs Berlin, Paris vs London, Rome vs Barcelona), 
with p-values effectively 0. This suggests that the average nightly price differs between these cities.
""")

In [None]:
#Q.9 Does the price per night tend to be higher on weekends?

# Calculate mean prices
weekday_mean = merged_airbnbdf[merged_airbnbdf['day_type'] == 'Weekdays']['price'].mean()
weekend_mean = merged_airbnbdf[merged_airbnbdf['day_type'] == 'Weekends']['price'].mean()

print(f"Average weekday price: {weekday_mean:.2f}")
print(f"Average weekend price: {weekend_mean:.2f}")

# Boxplot to visualize distributions
sns.boxplot(x='day_type', y='price', data=merged_airbnbdf)
plt.title("Price per Night: Weekdays vs Weekends")
plt.ylabel("Price")
plt.xlabel("Day Type")
# Cap the y-axis at 3000 so the boxes stay readable; higher prices fall outside the view
plt.ylim(0, 3000)
# Render the figure here so it appears before the written analysis, as in the other cells
plt.show()


# Print Analysis
print(""" 
Analysis: As per analysis, the average price per night is certainly higher on weekends compared to
weekdays, indicating that Airbnb hosts probably charge a premium during weekends, likely due to higher demand. 
The boxplot shows that weekday prices have more variation and outliers, meaning there are more very low
and very high prices on weekdays. Weekend prices, while higher on average, are more consistently clustered.
""")


In [None]:
#Q.10 Are listings of superhosts more expensive than those of normal hosts? 

# Calculate mean prices by Superhost status
superhost_mean = merged_airbnbdf[merged_airbnbdf['host_is_superhost'] == True]['price'].mean()
normalhost_mean = merged_airbnbdf[merged_airbnbdf['host_is_superhost'] == False]['price'].mean()

print(f"Average price for Superhost listings: {superhost_mean:.2f} €")
print(f"Average price for Normal host listings: {normalhost_mean:.2f} €")

# Boxplot to visualize price distributions
plt.figure(figsize=(8,6))
sns.boxplot(x='host_is_superhost', y='price', data=merged_airbnbdf)

# Adjust axis labels and title
plt.title("Price per Night: Superhost vs Normal Host")
plt.ylabel("Price (€)")
plt.xlabel("Superhost Status")
# NOTE(review): relabels tick 0 as 'Normal Host' and tick 1 as 'Superhost';
# assumes seaborn draws False before True -- confirm on the rendered plot.
plt.xticks([0,1], ['Normal Host', 'Superhost'])
# Cap the y-axis at 1000 so the boxes stay readable; higher prices fall outside the view
plt.ylim(0, 1000)
# Render the figure here so it appears before the written analysis, as in the other cells
plt.show()



# Analysis
print(f"""
Analysis: The average superhosts listing price is {superhost_mean:.2f} €, while normal host 
listings average {normalhost_mean:.2f} €. This indicates that, on average, Normal Host listings 
are slightly more expensive than Superhost listings. Superhosts do not charge systematically 
higher prices, though Normal Hosts exhibit more extreme high-price outliers.
""")



In [None]:
#Q.11 Are superhosts closer to the city center and metro station than normal hosts?  

# Row masks for the two host groups (explicit comparisons, so missing flags match neither group)
superhost_rows = merged_airbnbdf['host_is_superhost'] == True
normalhost_rows = merged_airbnbdf['host_is_superhost'] == False

# Mean distance to the city center per host group
superhost_citycenter = merged_airbnbdf.loc[superhost_rows, 'citycenter_dist'].mean()
normal_citycenter = merged_airbnbdf.loc[normalhost_rows, 'citycenter_dist'].mean()

# Mean distance to the metro per host group
superhost_metro = merged_airbnbdf.loc[superhost_rows, 'metro_dist'].mean()
normal_metro = merged_airbnbdf.loc[normalhost_rows, 'metro_dist'].mean()

print(f"Average distance to city center - Superhosts: {superhost_citycenter:.2f} km")
print(f"Average distance to city center - Normal hosts: {normal_citycenter:.2f} km")
print(f"Average distance to metro - Superhosts: {superhost_metro:.2f} km")
print(f"Average distance to metro - Normal hosts: {normal_metro:.2f} km")

# Long-format table for the grouped bar chart: one row per (location type, host type)
data = {
    'Category': ['City Center', 'City Center', 'Metro', 'Metro'],
    'Host Type': ['Superhost', 'Normal Host', 'Superhost', 'Normal Host'],
    'Average Distance (km)': [
        superhost_citycenter,
        normal_citycenter,
        superhost_metro,
        normal_metro,
    ],
}

df_plot = pd.DataFrame(data)

# Grouped bars: location type on the x-axis, one bar per host type
plt.figure(figsize=(8,6))
sns.barplot(data=df_plot, x='Category', y='Average Distance (km)', hue='Host Type')
plt.title("Average Distance (km): Superhosts vs Normal Hosts")
plt.ylabel("Average Distance (km)")
plt.xlabel("Location Type")
plt.legend(title="Host Type")
plt.show()

In [None]:
#Q.12 Are superhosts cleaner than normal hosts?

# Mean cleanliness rating per superhost flag, rounded to two decimals
cleanliness_by_host = merged_airbnbdf.groupby('host_is_superhost')['cleanliness_rating']
avg_clean = cleanliness_by_host.mean().round(2)

# Replace the boolean index values with readable labels
host_labels = {True: 'Superhost', False: 'Normal Host'}
avg_clean = avg_clean.rename(index=host_labels)

print(avg_clean)

# Bar chart of the two averages
avg_clean.plot(kind='bar', color=['skyblue', 'lightgreen'])
plt.title("Average Cleanliness Rating: Superhost vs Normal Host")
plt.ylabel("Cleanliness Rating (1–10)")
plt.xlabel("Host Type")
plt.show()

In [None]:
#Q.13 Is there a pattern between room types and superhost status?
# taken inspiration from task 12

# Contingency table in percent; normalize='columns' makes each host group sum to 100
room_superhost_table = pd.crosstab(merged_airbnbdf['room_type'],
                                  merged_airbnbdf['host_is_superhost'],
                                  normalize='columns') * 100

# Calculate raw counts
room_counts = pd.crosstab(merged_airbnbdf['room_type'],
                         merged_airbnbdf['host_is_superhost'])

# Visualize the distribution.
# DataFrame.plot opens its own figure, so the size is passed to it directly;
# a preceding plt.figure(...) would only leave an empty figure in the output.
room_superhost_table.plot(kind='bar', stacked=False, figsize=(10, 6))
plt.title('Room Type Distribution: Superhost vs Normal Host')
plt.xlabel('Room Type')
plt.ylabel('Percentage')
# Labels follow the crosstab column order (False, then True)
plt.legend(title='Superhost Status', labels=['Normal Host', 'Superhost'])
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Print percentages
print("\nPercentage Distribution:")
print(room_superhost_table.round(2))

# Print raw counts
print("\nRaw Counts:")
print(room_counts)

print("""
Analysis: There appears to be a pattern between room types and superhost status:
1. Entire homes/apartments have a higher representation among superhosts
2. Private rooms show similar proportions between superhosts and normal hosts, with, relatively speaking, slightly more normal hosts
3. Shared rooms are less common among superhosts
This suggests that superhosts are more likely to rent out entire properties rather than shared spaces.
""")


In [None]:
#Q.14 Is renting an entire home/apt more expensive than a private room? Does that depend on the city?

# Mean price per (city, room type), limited to entire homes and private rooms,
# pivoted so that each room type becomes a column

compared_room_types = ['Entire home/apt', 'Private room']
compared_listings = merged_airbnbdf[merged_airbnbdf['room_type'].isin(compared_room_types)]
price_by_city_room = (
    compared_listings
    .groupby(['city', 'room_type'])['price']
    .mean()
    .unstack()
)

# Grouped bar chart: one pair of bars per city
ax = price_by_city_room.plot(kind='bar', width=0.8, figsize=(12, 6))
plt.title('Average Price by City and Room Type')
plt.xlabel('City')
plt.ylabel('Average Price')
plt.legend(title='Room Type')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Premium of entire homes over private rooms, as a percentage of the private-room price
entire_home_avg = price_by_city_room['Entire home/apt']
private_room_avg = price_by_city_room['Private room']
price_diff_pct = ((entire_home_avg - private_room_avg) /
                  private_room_avg * 100).round(2)
print("\nPrice premium for entire homes vs private rooms (%):")
print(price_diff_pct)

print("""
Analysis:
1. Entire homes/apartments are consistently more expensive than private rooms across all cities
2. The price premium varies significantly by city
3. Some cities show a larger gap between room types than others
4. London and Paris show the highest absolute price differences
""")