# 02 — Exploratory Data Analysis

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Load Cleaned Data

In [None]:
# Read the cleaned Otodom listings CSV, report its dimensions and preview the first rows.
cleaned_csv_path = '../data/processed/otodom_cleaned.csv'
df = pd.read_csv(cleaned_csv_path)
print(f'Shape: {df.shape}')
df.head()

## Price Distribution

In [None]:
# Make sure the figure output folder exists before the first save: savefig
# raises FileNotFoundError when the target directory is missing, which would
# break a Restart & Run All on a fresh checkout. The later cells save into the
# same folder, so creating it here covers them as well.
Path('../results/figures').mkdir(parents=True, exist_ok=True)

# Two side-by-side histograms (50 bins, KDE overlay): total price on the left,
# price per square metre on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

sns.histplot(df['price'], bins=50, kde=True, ax=axes[0])
axes[0].set_title('Price Distribution')
axes[0].set_xlabel('Price (PLN)')

sns.histplot(df['price_per_m2'], bins=50, kde=True, ax=axes[1])
axes[1].set_title('Price per m² Distribution')
axes[1].set_xlabel('Price per m² (PLN)')

plt.tight_layout()
plt.savefig('../results/figures/price_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## Price by Voivodeship

In [None]:
# Box plot of price per m² for each voivodeship, with the boxes arranged from
# the highest to the lowest median price per m².
median_price_per_m2 = df.groupby('voivodeship')['price_per_m2'].median()
order = median_price_per_m2.sort_values(ascending=False).index

fig, ax = plt.subplots(figsize=(14, 6))
sns.boxplot(data=df, x='voivodeship', y='price_per_m2', order=order, ax=ax)
# Tilt the category labels so long voivodeship names do not overlap.
plt.xticks(rotation=45, ha='right')
ax.set_title('Price per m² by Voivodeship')
ax.set_xlabel('Voivodeship')
ax.set_ylabel('Price per m² (PLN)')
fig.tight_layout()
fig.savefig('../results/figures/price_by_voivodeship.png', dpi=150, bbox_inches='tight')
plt.show()

## Price by Number of Rooms

In [None]:
# Box plot of total price grouped by the number of rooms.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='no_of_rooms', y='price', ax=ax)
ax.set(
    title='Price by Number of Rooms',
    xlabel='Number of Rooms',
    ylabel='Price (PLN)',
)
fig.tight_layout()
fig.savefig('../results/figures/price_by_rooms.png', dpi=150, bbox_inches='tight')
plt.show()

## Surface vs Price

In [None]:
# Scatter plot of surface against total price; the low alpha keeps dense
# regions readable where many points overlap.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='surface', y='price', alpha=0.3, ax=ax)
ax.set(
    title='Surface vs Price',
    xlabel='Surface (m²)',
    ylabel='Price (PLN)',
)
fig.tight_layout()
fig.savefig('../results/figures/surface_vs_price.png', dpi=150, bbox_inches='tight')
plt.show()

## Market Type Distribution

In [None]:
# Two pie charts showing the share of listings per market type (left) and per
# advertiser type (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

pie_panels = [
    ('market', 'Market Type'),
    ('advertiser_type', 'Advertiser Type'),
]
for panel_ax, (column, panel_title) in zip(axes, pie_panels):
    category_counts = df[column].value_counts()
    category_counts.plot.pie(autopct='%1.1f%%', ax=panel_ax)
    panel_ax.set_title(panel_title)
    # Blank out the y-axis label that the pandas pie plot sets.
    panel_ax.set_ylabel('')

fig.tight_layout()
fig.savefig('../results/figures/market_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## Correlation Heatmap

In [None]:
# Annotated heatmap of the pairwise correlations between all numeric columns;
# the colour scale is centred on zero.
numeric_cols = df.select_dtypes(include=[np.number])
correlations = numeric_cols.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0, fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
fig.tight_layout()
fig.savefig('../results/figures/correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

## Top 10 Cities by Listings

In [None]:
# Horizontal bar chart of the ten cities with the most listings.
top_cities = df['city'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_cities.values, y=top_cities.index, ax=ax)
ax.set(
    title='Top 10 Cities by Number of Listings',
    xlabel='Number of Listings',
    ylabel='City',
)
fig.tight_layout()
fig.savefig('../results/figures/top_cities.png', dpi=150, bbox_inches='tight')
plt.show()

## Summary Statistics by Market Type

In [None]:
# Mean, median and non-null count of price, surface and price per m² for each
# market type; the resulting table is displayed as the cell output.
summary_columns = ['price', 'surface', 'price_per_m2']
summary_stats = ['mean', 'median', 'count']
df.groupby('market')[summary_columns].agg(summary_stats)