In [1]:
### EDA for combined_data.geojson

import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [None]:

# Load the processed GeoJSON into a GeoDataFrame.
# The path is relative, so it resolves against the notebook's working directory.
gdf = gpd.read_file('../../data/processed/combined_data.geojson')

# Basic info
print("\n--- Basic Information ---")
# info() writes its report directly to stdout and returns None, so it is
# called on its own; wrapping it in print() would emit a stray "None" line.
gdf.info()

# Preview data (first rows only, to keep the output short)
print("\n--- Preview Data ---")
print(gdf.head())



In [None]:
# Overall dimensions of the GeoDataFrame (rows x columns)
row_count, column_count = gdf.shape
print(f"\nDataset contains {row_count} rows and {column_count} columns.")

# Names of every column, shown as a plain Python list
print("\n--- Column Names ---")
column_names = gdf.columns.tolist()
print(column_names)

# Count of missing entries per column (isna is the same check as isnull)
print("\n--- Missing Values ---")
missing_per_column = gdf.isna().sum()
print(missing_per_column)

# dtype of every column
print("\n--- Data Types ---")
column_dtypes = gdf.dtypes
print(column_dtypes)



In [None]:
# Descriptive statistics produced by describe() with its default arguments
print("\n--- Summary Statistics (Numerical Columns) ---")
numeric_summary = gdf.describe()
print(numeric_summary)



In [None]:
# Summary statistics (categorical)
# Object-dtype columns are treated as the categorical features; the resulting
# Index is reused by the later "unique values" cell.
categorical_cols = gdf.select_dtypes(include=['object']).columns
print("\n--- Summary Statistics (Categorical Columns) ---")
if len(categorical_cols) > 0:
    print(gdf[categorical_cols].describe())
else:
    # describe() raises ValueError on a frame with no columns, so report the
    # absence of categorical features instead of crashing the notebook run.
    print("No categorical (object-dtype) columns found.")



In [None]:
# Distribution plots: one histogram (with KDE overlay) per numeric column.
# numerical_cols is also consumed by the boxplot, heatmap and map cells below.
numerical_cols = gdf.select_dtypes(include=[np.number]).columns
print("\n--- Plotting Histograms ---")
for feature in numerical_cols:
    # One dedicated figure per feature, drawn on an explicit Axes
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(gdf[feature], kde=True, bins=30, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()



In [None]:
# One horizontal boxplot per numeric column
print("\n--- Plotting Boxplots ---")
for feature in numerical_cols:
    # One dedicated figure per feature, drawn on an explicit Axes
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=gdf[feature], ax=ax)
    ax.set_title(f'Boxplot of {feature}')
    fig.tight_layout()
    plt.show()



In [None]:
# Correlation heatmap
print("\n--- Correlation Heatmap ---")
if len(numerical_cols) > 0:
    # Pairwise correlation between the numeric columns, annotated to 2 decimals
    corr = gdf[numerical_cols].corr()
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Matrix')
    plt.show()
else:
    # An empty correlation matrix cannot be drawn (heatmap raises on zero-size
    # data); skip the plot, mirroring the empty-column guard in the map cell.
    print("No numerical columns available; skipping correlation heatmap.")



In [None]:
# Cardinality check for every categorical column, plus a sample of its values
print("\n--- Unique Values in Categorical Columns ---")
for name in categorical_cols:
    series = gdf[name]
    print(f"{name}: {series.nunique()} unique values")
    # Only the first 10 distinct values are shown to keep the output short
    distinct_values = series.unique()
    print(distinct_values[:10])



In [None]:
# Map of the geometries, coloured by the first numeric column when one exists
print("\n--- Plotting Geospatial Data ---")
fig, ax = plt.subplots(figsize=(10, 10))
# Fall back to an uncoloured plot (column=None) if there is no numeric column
if len(numerical_cols) > 0:
    color_column = numerical_cols[0]
else:
    color_column = None
gdf.plot(
    ax=ax,
    edgecolor='black',
    column=color_column,
    cmap='viridis',
    legend=True,
)
ax.set_title('Geospatial Plot of Data')
plt.show()



In [None]:
# Basic spatial metadata of the GeoDataFrame

# Coordinate reference system attached to the data
print("\n--- CRS (Coordinate Reference System) ---")
crs_info = gdf.crs
print(crs_info)

# Bounding box of each individual geometry (one row per feature)
print("\n--- Bounds ---")
per_feature_bounds = gdf.bounds
print(per_feature_bounds)

# How many features there are of each geometry type
print("\n--- Geometry Types ---")
geometry_type_counts = gdf.geom_type.value_counts()
print(geometry_type_counts)

### Done!
