In [None]:
# Colab-only step: requests a file upload through google.colab's `files` helper
# and keeps whatever it returns in `uploaded`.
# NOTE(review): presumably this is how AB_NYC_2019.csv (read in the next cell)
# reaches the runtime; outside Colab this import is expected to fail — confirm.
from google.colab import files
uploaded = files.upload()


In [None]:
import pandas as pd

# Load the raw NYC Airbnb listings and preview the first rows.
listings_csv = "AB_NYC_2019.csv"
df = pd.read_csv(listings_csv)
df.head()


In [None]:
# Structural overview of the raw listings.
# A notebook cell only renders its *last* expression, so every intermediate
# summary is passed to display() (available as a built-in in Jupyter/Colab);
# otherwise head/shape/describe would be computed and silently discarded.
display(df.head())
display(df.shape)
df.info()  # prints its report directly and returns None
display(df.describe())
df.isnull().sum()  # missing-value count per column (rendered as the cell output)






In [None]:
# --- Missing values ---------------------------------------------------------
# Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
# the chained in-place form operates on a temporary Series, emits a
# FutureWarning on pandas 2.x and does not modify `df` under Copy-on-Write.
df['name'] = df['name'].fillna('Unknown')
df['host_name'] = df['host_name'].fillna('Unknown')

# Missing reviews_per_month is replaced with 0.
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

# Parse review dates; missing dates become NaT and are handled downstream.
df['last_review'] = pd.to_datetime(df['last_review'])

# --- Invalid / extreme rows -------------------------------------------------
# Keep prices in (0, 10000] and minimum stays of at most one year.
# .copy() makes the filtered frame independent of the original object.
price_in_range = (df['price'] > 0) & (df['price'] <= 10000)
min_nights_in_range = df['minimum_nights'] <= 365
df = df[price_in_range & min_nights_in_range].copy()

# Identifier columns are not used by the analysis cells that read df_cleaned.
df_cleaned = df.drop(['id', 'host_id'], axis=1)

print("After cleaning:")
# info() prints its own report and returns None, so it must not be wrapped in
# print() (that would add a stray "None" line to the output).
df_cleaned.info()
print("\nMissing values after cleaning:")
print(df_cleaned.isnull().sum())

df_cleaned.head()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Exploratory plots, one figure each, drawn on explicitly created axes.

# Price histogram with a KDE overlay.
fig, ax = plt.subplots(figsize=(10,5))
sns.histplot(df_cleaned['price'], bins=50, kde=True, ax=ax)
ax.set_title("Distribution of Airbnb Prices in NYC")
ax.set_xlabel("Price ($)")
ax.set_ylabel("Number of Listings")
plt.show()

# Price boxplot.
fig, ax = plt.subplots(figsize=(8,5))
sns.boxplot(x=df_cleaned['price'], ax=ax)
ax.set_title("Boxplot of Airbnb Prices")
ax.set_xlabel("Price ($)")
plt.show()

# Listing counts per borough.
fig, ax = plt.subplots(figsize=(8,5))
sns.countplot(x='neighbourhood_group', data=df_cleaned, ax=ax)
ax.set_title("Number of Listings by Borough")
ax.set_xlabel("Borough")
ax.set_ylabel("Number of Listings")
plt.show()

# Price per room type (seaborn's barplot aggregates with its default estimator).
fig, ax = plt.subplots(figsize=(8,5))
sns.barplot(x='room_type', y='price', data=df_cleaned, ax=ax)
ax.set_title("Average Price per Room Type")
ax.set_xlabel("Room Type")
ax.set_ylabel("Average Price ($)")
plt.show()

# Review count against price.
fig, ax = plt.subplots(figsize=(10,5))
sns.scatterplot(x='number_of_reviews', y='price', data=df_cleaned, ax=ax)
ax.set_title("Number of Reviews vs Price")
ax.set_xlabel("Number of Reviews")
ax.set_ylabel("Price ($)")
plt.show()

# Pairwise correlations across the int64/float64 columns.
numeric_cols = df_cleaned.select_dtypes(include=['int64', 'float64'])
correlations = numeric_cols.corr()
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(correlations, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap of Numeric Features")
plt.show()



In [None]:
!pip install folium


In [None]:
# folium is installed by the previous cell but was never imported, so this
# cell raised NameError on a fresh kernel.
import folium

# Marker colour per room type; anything unexpected falls back to gray instead
# of raising KeyError part-way through building the map.
room_colors = {
    'Entire home/apt': 'red',
    'Private room': 'green',
    'Shared room': 'blue'
}
DEFAULT_MARKER_COLOR = 'gray'

# Optional cap on the number of plotted listings. None plots every listing
# (the original behaviour); set an integer such as 5000 if the browser
# struggles to render tens of thousands of markers.
MAX_MAP_MARKERS = None

map_rows = df_cleaned[['latitude', 'longitude', 'room_type', 'price', 'neighbourhood']]
if MAX_MAP_MARKERS is not None and len(map_rows) > MAX_MAP_MARKERS:
    # Fixed seed so re-running the notebook draws the same sample.
    map_rows = map_rows.sample(n=MAX_MAP_MARKERS, random_state=42)

nyc_map2 = folium.Map(location=[40.7128, -74.0060], zoom_start=11)

# itertuples avoids building a Series per row, which iterrows does.
for listing in map_rows.itertuples(index=False):
    marker_color = room_colors.get(listing.room_type, DEFAULT_MARKER_COLOR)
    folium.CircleMarker(
        location=[listing.latitude, listing.longitude],
        radius=3,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.5,
        # Popups are rendered as HTML, so lines are separated with <br>
        # ("\n" collapses into a single line).
        popup=(
            f"Price: ${listing.price}<br>"
            f"Room Type: {listing.room_type}<br>"
            f"Neighborhood: {listing.neighbourhood}"
        ),
    ).add_to(nyc_map2)

nyc_map2


In [None]:
# Ten highest-priced listings in each borough, ordered by borough and then by
# descending price.
# Implemented as sort + groupby().head() rather than groupby().apply(nlargest):
# apply() operating on the grouping column is deprecated in recent pandas, and
# once the grouping column is excluded from the applied frame the
# 'neighbourhood_group' selection below would fail.
# NOTE(review): listings tied on price at the 10th position may be picked in a
# different order than nlargest would pick them.
top_expensive_per_borough = (
    df_cleaned
    .sort_values(['neighbourhood_group', 'price'], ascending=[True, False])
    .groupby('neighbourhood_group')
    .head(10)
    .reset_index(drop=True)
)

top_expensive_per_borough[['name', 'neighbourhood_group', 'neighbourhood', 'room_type', 'price']]


In [None]:
# Top 10 hosts by number of listings (counted after cleaning).
# Two fixes over summing `calculated_host_listings_count` per host_name:
#   * that column already holds a per-host total repeated on every listing row,
#     so summing it inflates each host's figure (roughly count squared);
#   * different hosts can share a first name, so hosts are identified by
#     host_id. `df` is used because df_cleaned no longer carries host_id;
#     both frames contain the same rows.
top_hosts = (
    df.groupby(['host_id', 'host_name'])
    .size()
    .sort_values(ascending=False)
    .head(10)
    .rename('listing_count')
)
top_hosts


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Ten neighbourhoods with the highest mean listing price.
top_neighborhoods = (
    df_cleaned.groupby('neighbourhood')['price']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)
# A bare expression in the middle of a cell is not rendered, so show the table
# explicitly (display is a built-in in Jupyter/Colab).
display(top_neighborhoods)

fig, ax = plt.subplots(figsize=(10,6))
# `palette` without `hue` is deprecated in seaborn; map hue to the same
# categories as y and keep one full-width bar per category with dodge=False.
sns.barplot(
    x=top_neighborhoods.values,
    y=top_neighborhoods.index,
    hue=top_neighborhoods.index,
    palette="viridis",
    dodge=False,
    ax=ax,
)
# The hue mapping is redundant with the y axis; drop the legend if one was drawn.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()
ax.set_title("Top 10 Most Expensive Neighborhoods by Average Price")
ax.set_xlabel("Average Price ($)")
ax.set_ylabel("Neighborhood")
plt.show()



In [None]:
# Listings that have at least one review date, tagged with the month of their
# most recent review. assign() builds a new frame, which avoids the
# SettingWithCopyWarning raised by adding a column to the dropna() result.
df_time = (
    df_cleaned
    .dropna(subset=['last_review'])
    .assign(month_year=lambda listings: listings['last_review'].dt.to_period('M'))
)

# Sum of reviews_per_month across listings, grouped by last-review month.
reviews_over_time = df_time.groupby('month_year')['reviews_per_month'].sum()

fig, ax = plt.subplots(figsize=(12,6))
reviews_over_time.plot(ax=ax)
ax.set_title("Total Reviews per Month Over Time")
ax.set_xlabel("Month-Year")
ax.set_ylabel("Total Reviews per Month")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Mean reviews_per_month for each (last-review month, borough) pair, pivoted so
# that every borough becomes its own column / line.
avg_reviews_borough = (
    df_time.groupby(['month_year', 'neighbourhood_group'])['reviews_per_month']
    .mean()
    .unstack()
)

# DataFrame.plot() opens a new figure unless it is handed an Axes, so a
# preceding plt.figure(figsize=...) is left empty and its size is ignored.
# Create the axes explicitly and pass them in.
fig, ax = plt.subplots(figsize=(12,6))
avg_reviews_borough.plot(ax=ax)
ax.set_title("Average Reviews per Month by Borough Over Time")
ax.set_xlabel("Month-Year")
ax.set_ylabel("Average Reviews per Month")
plt.xticks(rotation=45)
ax.legend(title="Borough")
plt.show()


## **NYC Airbnb Analysis**

## Project Overview
This project analyzes **Airbnb listings in New York City (~48,870 listings)** to explore pricing patterns, neighborhood trends, room types, host behavior, and review activity. The goal is to provide actionable insights using data analysis and visualizations.

## Data Cleaning & Preprocessing
- Filled missing values in `name` and `host_name` with 'Unknown'.  
- Filled missing `reviews_per_month` with 0.  
- Converted `last_review` to datetime.  
- Removed invalid and extreme values: rows with `price <= 0`, `price > 10,000`, or `minimum_nights > 365`.  
- Dropped unnecessary columns: `id`, `host_id`.  

## Exploratory Data Analysis
- **Price Distribution:** Most listings are moderately priced (\$50–\$500) with some high-value outliers.  
- **Listings by Borough:** Manhattan & Brooklyn dominate in number of listings.  
- **Room Type Analysis:** Entire homes/apartments are generally more expensive than private/shared rooms.  
- **Reviews vs Price:** The most expensive listings tend to have fewer reviews.  
- **Correlation:** `reviews_per_month` strongly correlates with `number_of_reviews`.  

## Optional Enhancements
- **Geographical Visualization:** Interactive NYC map showing listing locations and room types.


- **Top-N Analysis:**  
  - Top 10 most expensive listings per borough.  
  - Top hosts by number of listings.  
  - Top 10 most expensive neighborhoods by average price.

- **Time-Based Trends:**  
  - Total reviews per month over time.  
  - Average reviews per month by borough.  

## Key Insights
- Airbnb NYC listings have a **skewed price distribution** with luxury outliers.  
- Entire homes/apartments are more expensive than private/shared rooms.  
- Manhattan & Brooklyn dominate in listings and reviews.  
- Reviews per month correlate strongly with number of reviews.  
- Premium listings and neighborhoods are clearly identifiable.  
- Seasonal trends are visible from review patterns.  

## Recommendations
- Adjust pricing seasonally based on review activity.  
- Explore premium neighborhoods for investment opportunities.  
- Build predictive pricing models using room type, location, and availability.  


## Project Deliverables
- Raw and cleaned dataset.  
- Python notebook with full analysis.  
- Visualizations and optional enhancements.  
- Professional markdown report (this document).  

**End of Report**
