# Airbnb data for 250,000+ listings in 10 major cities, including information about hosts, pricing, location, and room type, along with over 5 million historical reviews.
NOTE: Prices are in local currency.
# Recommended Analysis
* Can you spot any major differences in the Airbnb market between cities?
* Which attributes have the biggest influence on price?
* Are you able to identify any trends or seasonality in the review data?
* Which city offers a better value for travel?


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt

In [None]:
# Load the listings table from the Kaggle input directory.
# low_memory=False has pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns.
df = pd.read_csv(
    "/kaggle/input/airbnb-listings-reviews/Airbnb Data/Listings.csv",
    encoding='latin',
    low_memory=False,
)
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Summarise the freshly loaded frame: column dtypes, non-null counts and
# pandas' memory-usage estimate.
df.info()

In [None]:
# Reduce the frame's memory footprint by storing these object (string/flag)
# columns as pandas categoricals. They are expected to hold a limited set of
# repeated values, which is where the category dtype pays off.
cat_cols = [
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'neighbourhood',
    'district',
    'city',
    'property_type',
    'room_type',
    'instant_bookable',
]
# Convert one column at a time, writing each result back into df.
for col in cat_cols:
    df[col] = df[col].astype('category')

In [None]:
# Re-run the summary to compare dtypes and memory usage after the
# category conversion.
df.info()

In [None]:
# List every column name available in the listings frame.
df.columns

In [None]:
# Collect rows that exactly repeat an earlier row (all columns equal).
# keep='first' is the pandas default: the first occurrence is not flagged,
# only the later copies are.
duplicate_rows = df.loc[df.duplicated(keep='first')]
duplicate_rows


In [None]:
# Columns to audit for missing values.
columns_to_check = [
    'listing_id', 'name', 'host_id', 'host_since', 'host_location',
    'host_response_time', 'host_response_rate', 'host_acceptance_rate',
    'host_is_superhost', 'host_total_listings_count',
    'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
    'district', 'city', 'latitude', 'longitude', 'property_type',
    'room_type', 'accommodates', 'bedrooms', 'amenities', 'price',
    'minimum_nights', 'maximum_nights', 'review_scores_rating',
    'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication',
    'review_scores_location', 'review_scores_value', 'instant_bookable',
]

# Select every row that is missing a value in AT LEAST ONE audited column.
# .any(axis=1) is used rather than .all(axis=1): a row is incomplete as soon
# as a single field is null, whereas requiring every field (including
# listing_id) to be null would match essentially nothing and hide real gaps.
null_values_rows = df[df[columns_to_check].isnull().any(axis=1)]
null_values_rows




In [None]:
# Property-type mix per city: one row per city, one column per property
# type, each cell holding the number of listings (0 where a combination
# has no listings).
property_type_distribution = (
    df.groupby(['city', 'property_type'])
    .size()
    .unstack(fill_value=0)
)
property_type_distribution

# 1. Can you spot any major differences in the Airbnb market between cities?

In [None]:
# Average prices by city
# Mean listing price for each city. Prices are recorded in each city's local
# currency, so these means are not directly comparable across cities.
average_prices = df['price'].groupby(df['city']).mean()
average_prices

In [None]:
# Bar chart of the mean listing price per city.
# reset_index() turns the grouped Series into a two-column frame
# ('city', 'price') so seaborn can address both by column name.
avg_prices = df.groupby('city')['price'].mean().reset_index()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=avg_prices, x='city', y='price', ax=ax)
ax.set_title('Average Prices of Listings by City')
ax.set_xlabel('City')
ax.set_ylabel('Average Price')
# Tilt the city names so long labels do not overlap.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Count listings for every (city, instant_bookable) pair: rows are cities,
# columns are the instant_bookable values.
city_instant_bookable_counts = (
    df.groupby(['city', 'instant_bookable']).size().unstack(fill_value=0)
)

# Total listings per city, largest first; used to order the bars.
sorted_city_instant_bookable_counts = (
    city_instant_bookable_counts.sum(axis=1).sort_values(ascending=False)
)

# Keep the per-status breakdown (not just the totals) so the chart shows
# what its title promises: listings split by instant-bookable status.
status_counts_by_city = city_instant_bookable_counts.loc[
    sorted_city_instant_bookable_counts.index
]

# DataFrame.plot opens a new figure unless given an axes, so create the
# sized figure first and draw onto it explicitly.
fig, ax = plt.subplots(figsize=(10, 6))
status_counts_by_city.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Number of Listings by Instant Bookable Status for Each City')
ax.set_xlabel('City')
ax.set_ylabel('Number of Listings')
ax.legend(title='Instant Bookable')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


# 2. Which attributes have the biggest influence on price?

In [None]:
# Numeric listing attributes whose relationship with price is examined.
numerical_attributes = [
    'accommodates',
    'bedrooms',
    'minimum_nights',
    'maximum_nights',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]
# Working frame: the attributes above plus the price column itself.
numerical_df = df[[*numerical_attributes, 'price']]

# Pairwise correlations between all selected columns (pandas' default
# method, Pearson).
correlation_matrix = numerical_df.corr()

# Correlation of every column with price, largest value first. 'price'
# itself appears at the top with a correlation of 1.
price_correlation = correlation_matrix['price'].sort_values(ascending=False)
print(price_correlation)


In [None]:
# Heatmap of the full correlation matrix; each cell is annotated with its
# coefficient rounded to two decimals.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Numerical Attributes')
plt.show()


# 3. Which city offers a better value for travel?

In [None]:
# Mean listing price per city, ordered from cheapest to most expensive.
# Prices are in each city's local currency, so the ranking mixes currencies.
average_price_by_city = (
    df.groupby('city')['price']
    .mean()
    .sort_values(ascending=True)
)

# Show the ordered averages.
print(average_price_by_city)


In [None]:
# Horizontal bar chart of the mean listing price per city, cheapest first.
plt.figure(figsize=(10, 6))

# The city index is categorical, and seaborn lays categorical values out in
# category order, which would discard the price sort. Passing the sorted
# labels through order= keeps the bars in ascending-price order.
sns.barplot(
    x=average_price_by_city.values,
    y=average_price_by_city.index,
    order=list(average_price_by_city.index),
    palette='viridis',
)

# Add labels and title
plt.xlabel('Average Price')
plt.ylabel('City')
plt.title('Average Accommodation Prices by City')

# Show plot
plt.show()


# 4. Are you able to identify any trends or seasonality in the review data?

In [None]:
# NOTE(review): this cell charts host sign-up dates (host_since), not review
# dates; confirm whether a reviews table should be analysed for this
# section's question.

# Parse the host join date so month-level periods can be derived from it.
df['host_since'] = pd.to_datetime(df['host_since'])

# Calendar month (year-month period) in which each listing's host joined.
df['year_month'] = df['host_since'].dt.to_period('M')

# Count each host once. df has one row per listing, so a host with several
# listings would otherwise be counted several times in their join month.
# Rows with a missing join date are left out by value_counts().
hosts_per_month = (
    df.drop_duplicates(subset='host_id')['year_month']
    .value_counts()
    .sort_index()
)

# Line chart of new hosts per month, in chronological order.
plt.figure(figsize=(10, 6))
hosts_per_month.plot(kind='line', marker='o', color='b')
plt.title('Number of Hosts Joined Airbnb Over Time')
plt.xlabel('Year-Month')
plt.ylabel('Number of Hosts Joined')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Number of listing rows per city, as a one-column frame labelled
# "Bookable Count".
# NOTE(review): every listing in a city is counted; the instant_bookable
# column is not consulted here. Confirm whether a filter was intended.
city_bookable_count = df.groupby("city")["city"].count().to_frame("Bookable Count")

# Draw the bars onto an explicitly sized axes (DataFrame.plot would
# otherwise open its own figure).
fig, ax = plt.subplots(figsize=(15, 8))
city_bookable_count.plot(kind='bar', ax=ax)
ax.set_xlabel('City')
ax.set_ylabel('Bookable Count')
ax.set_title('Bookable count as per cities')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
