# Airbnb Data Analysis
Cleaned & Structured Jupyter Notebook

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


## 2. Load Dataset

In [None]:
# Remote location of the raw Airbnb listings CSV.
file_path = "https://gitlab.crio.do/me_notebook/me_jupyter_airbnbanalysis/-/raw/master/Airbnb_data.csv"

# pandas reads the CSV directly from the URL; the trailing expression
# previews the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

## 3. Data Cleaning

In [None]:
# Inspect column dtypes and per-column missing-value counts before cleaning.
# The counts are printed explicitly: a bare expression that is not the last
# statement of a cell is evaluated and then discarded.
df.info()
print(df.isnull().sum())

# Fill missing values by assigning the result back to the column.
# Calling fillna(inplace=True) on df[col] mutates an intermediate object:
# pandas flags this as chained assignment, and with Copy-on-Write enabled it
# leaves df itself unchanged.
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)
df['name'] = df['name'].fillna('unknown')
df['host_name'] = df['host_name'].fillna('unknown')

# last_review is removed outright rather than imputed.
df.drop(columns=['last_review'], inplace=True)

# Missing-value counts after cleaning (shown as the cell output).
df.isnull().sum()

## 4. Outlier Capping

In [None]:
# Cap outliers with Tukey's IQR fences: values below Q1 - 1.5*IQR or above
# Q3 + 1.5*IQR are pulled back to the nearest fence. The same procedure is
# applied to number_of_reviews first and price second, so Q1/Q3/IQR/lower/
# upper end up holding the price statistics once the loop finishes.
for column in ("number_of_reviews", "price"):
    Q1, Q3 = df[column].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower=lower, upper=upper)

## 5. Room Type Popularity Across Neighbourhood Groups

In [None]:
# Grouped bar chart: number of listings per neighbourhood group, with one
# bar per room type inside each group.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='neighbourhood_group', hue='room_type', ax=ax)
ax.set_title('Most Popular Room Type Across Neighbourhood Groups')

# Tilt the group names so they do not collide.
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)

fig.tight_layout()
plt.show()

## 6. Price Range Preferences

In [None]:
# Bucket every listing's price into a labelled range.
# pd.cut builds right-closed, left-open intervals by default, so the first
# bin would be (0, 100] and a price of exactly 0 would become NaN even
# though the label reads '$0-100'. include_lowest=True closes the first
# interval on the left so those listings are counted.
bins = [0,100,200,300,500,1000,np.inf]
labels = ['$0-100','$101-200','$201-300','$301-500','$501-1000','$1000+']
df['price_range'] = pd.cut(df['price'], bins=bins, labels=labels,
                           include_lowest=True)

# Grouped bar chart: listings per neighbourhood group, split by price range.
plt.figure(figsize=(12,6))
sns.countplot(data=df, x='neighbourhood_group', hue='price_range')
plt.title('Price Range Preferences by Neighbourhood Group')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 7. Top 15 Neighbourhoods by Reviews

In [None]:
# Total review count per neighbourhood, largest first, keeping the top 15.
reviews_per_neighbourhood = df.groupby('neighbourhood')['number_of_reviews'].sum()
top_reviews = reviews_per_neighbourhood.sort_values(ascending=False).head(15)

# Horizontal bars: neighbourhood names on the y axis, totals on the x axis.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_reviews.values, y=top_reviews.index, ax=ax)
ax.set_title('Top 15 Locations with Most Reviews')
ax.set_xlabel('Total Reviews')
fig.tight_layout()
plt.show()

## 8. Pricing Strategy Analysis

In [None]:
# Mean price per neighbourhood group and per room type.
avg_price_ng = df.groupby('neighbourhood_group')['price'].mean()
avg_price_rt = df.groupby('room_type')['price'].mean()

# Chart 1: average price by neighbourhood group (tick labels tilted).
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=avg_price_ng.index, y=avg_price_ng.values, ax=ax)
ax.set_title('Average Price by Neighbourhood Group')
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
fig.tight_layout()
plt.show()

# Chart 2: average price by room type.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=avg_price_rt.index, y=avg_price_rt.values, ax=ax)
ax.set_title('Average Price by Room Type')
fig.tight_layout()
plt.show()

## 9. Availability vs Price

In [None]:
# Scatter of yearly availability against price; alpha=0.5 keeps densely
# overlapping points readable.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='availability_365', y='price', alpha=0.5, ax=ax)
ax.set_title('Relationship Between Availability and Price')
fig.tight_layout()
plt.show()

## 10. Market Saturation

In [None]:
# Number of listings in each neighbourhood group, used as a saturation proxy.
listings = df['neighbourhood_group'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=listings.index, y=listings.values, ax=ax)
ax.set_title('Listings per Neighbourhood Group')
fig.tight_layout()
plt.show()

## 11. Price Category Distribution

In [None]:
# Classify each listing as Budget (<= 100), Mid-range (<= 300) or Luxury.
# Two fixes over using [0, 100, 300, df['price'].max()] with default edges:
#   * the top edge is np.inf, because a maximum price of 300 or less would
#     make the bin edges non-increasing and pd.cut would raise ValueError;
#   * include_lowest=True closes the first interval on the left, so a price
#     of exactly 0 is labelled 'Budget' instead of becoming NaN.
df['category'] = pd.cut(df['price'], bins=[0, 100, 300, np.inf],
                        labels=['Budget', 'Mid-range', 'Luxury'],
                        include_lowest=True)

# Listings per category, most frequent first.
category_counts = df['category'].value_counts()

plt.figure(figsize=(8,6))
sns.barplot(x=category_counts.index, y=category_counts.values)
plt.title('Distribution of Listings by Price Category')
plt.tight_layout()
plt.show()

## 12. Revenue Potential Analysis

In [None]:
# Revenue proxy used by this notebook: price multiplied by minimum_nights.
df['total_revenue'] = df['price'].mul(df['minimum_nights'])

# Mean proxy revenue per neighbourhood, highest first, top 15 only.
mean_revenue = df.groupby('neighbourhood')['total_revenue'].mean()
top_rev = mean_revenue.sort_values(ascending=False).head(15)

# Horizontal bars: neighbourhood names on the y axis.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_rev.values, y=top_rev.index, ax=ax)
ax.set_title('Top 15 Neighbourhoods by Revenue Potential')
fig.tight_layout()
plt.show()