In [1]:
# Chunk 0 — Setup (run once)
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# === Paths ===
# The project root defaults to the original Windows location. Set the
# LIGHT_POLLUTION_ROOT environment variable to point the notebook at another
# checkout without editing this cell (an empty value falls back to the default).
ROOT = Path(os.environ.get("LIGHT_POLLUTION_ROOT") or "D:/LightPollutionProject")
DATA = ROOT / "data"       # project data root
PROC = DATA / "processed"  # derived CSVs and text summaries are written here
PLOTS = ROOT / "plots"     # separate plots folder

# ensure output dirs exist
PROC.mkdir(parents=True, exist_ok=True)
PLOTS.mkdir(parents=True, exist_ok=True)

# simple save-figure helper
def _savefig(fig, path, dpi=150):
    """Write ``fig`` to ``path``, close the figure, and report where it went.

    Args:
        fig: Figure-like object; its ``savefig`` method is called.
        path: Destination file (str or Path). Missing parent folders are created.
        dpi: Resolution forwarded to ``fig.savefig``.
    """
    path = Path(path)
    out_dir = path.parent
    out_dir.mkdir(parents=True, exist_ok=True)
    save_kwargs = {"dpi": dpi, "bbox_inches": "tight"}
    fig.savefig(path, **save_kwargs)
    plt.close(fig)  # the figure is closed once it has been written
    print(f"Saved plot: {path}")


In [2]:
# Chunk 1 — Load data
# Candidate inputs, in order of preference: processed snapshot, then raw sample.
processed_snap = PROC / "asia_nightlights_processed.csv"
sample_raw = DATA / "raw" / "asia_nightlight_sample.csv"
yearly_path = PROC / "yearly.csv"
asia_year_path = PROC / "asia_year.csv"

# Pick the first source that exists; fail loudly when neither is available.
if processed_snap.exists():
    source_path, loaded_msg = processed_snap, "Loaded processed snapshot:"
elif sample_raw.exists():
    source_path, loaded_msg = sample_raw, "Loaded sample raw:"
else:
    raise FileNotFoundError("No processed snapshot or sample found. Put sample in data/raw/ or run preprocessing.")

df = pd.read_csv(source_path)
print(loaded_msg, source_path)

# quick checks
print("df shape:", df.shape)
print("columns:", df.columns.tolist())
display(df.head())

# Optional: the aggregated yearly CSV is loaded only when it is present.
if not asia_year_path.exists():
    print("asia_year.csv not found at:", asia_year_path)
else:
    asia_year = pd.read_csv(asia_year_path)
    print("Loaded asia_year.csv:", asia_year_path)
    display(asia_year.head())


Loaded processed snapshot: D:\LightPollutionProject\data\processed\asia_nightlights_processed.csv
df shape: (200, 12)
columns: ['iso', 'id_1', 'name_1', 'id_2', 'name_2', 'year', 'month', 'nlsum', 'area', 'date', 'nl_density', 'nl_density_clipped']


Unnamed: 0,iso,id_1,name_1,id_2,name_2,year,month,nlsum,area,date,nl_density,nl_density_clipped
0,AFG,1,Badakhshan,1,Baharak,2013,1,4403.26,3003.881,2013-01-01,1.465857,1.465857
1,AFG,1,Badakhshan,1,Baharak,2013,2,2961.64,3003.881,2013-02-01,0.985938,0.985938
2,AFG,1,Badakhshan,1,Baharak,2013,3,5295.34,3003.881,2013-03-01,1.762833,1.762833
3,AFG,1,Badakhshan,1,Baharak,2013,4,3081.37,3003.881,2013-04-01,1.025796,1.025796
4,AFG,1,Badakhshan,1,Baharak,2013,5,2535.59,3003.881,2013-05-01,0.844105,0.844105


Loaded asia_year.csv: D:\LightPollutionProject\data\processed\asia_year.csv


Unnamed: 0,year,asia_avg_density
0,2013,1.065663
1,2014,1.15452
2,2015,0.956228
3,2016,0.791901
4,2017,1.894944


In [3]:
# Chunk 2 — Country-level trends
# Column to aggregate: the clipped density if present, then the plain density,
# otherwise fall back to nlsum.
value_col = next(
    (name for name in ("nl_density_clipped", "nl_density") if name in df.columns),
    "nlsum",
)

# Mean of value_col for every (iso, year) pair
yearly_means = df.groupby(["iso", "year"], as_index=False)[value_col].mean()
country_year = yearly_means.rename(columns={value_col: "avg_density"})

# Save CSV
country_year_csv = PROC / "country_year.csv"
country_year.to_csv(country_year_csv, index=False)
print("Saved:", country_year_csv)

# Rank countries by the mean of their yearly averages, highest first
country_means = country_year.groupby("iso", as_index=False)["avg_density"].mean()
country_rank = country_means.sort_values("avg_density", ascending=False)
top_countries = country_rank["iso"].head(6).tolist()
print("Top countries by avg density:", top_countries)

# One line per top country, drawn in ranking order
fig, ax = plt.subplots(figsize=(10, 6))
for iso in top_countries:
    trend = country_year.loc[country_year["iso"] == iso].sort_values("year")
    ax.plot(trend["year"], trend["avg_density"], marker="o", label=iso)
ax.set(
    xlabel="Year",
    ylabel="Avg nightlight density",
    title="Top countries — yearly average nightlight density",
)
ax.legend()
_savefig(fig, PLOTS / "country_top_trends.png")


Saved: D:\LightPollutionProject\data\processed\country_year.csv
Top countries by avg density: ['AFG']
Saved plot: D:\LightPollutionProject\plots\country_top_trends.png


In [4]:
# Chunk 3 — Top polluted regions (districts)
if "name_2" not in df.columns:
    print("Column 'name_2' not present in df — cannot compute district ranks.")
else:
    # Mean of value_col per (iso, name_2), sorted highest first.
    # NOTE(review): rows are keyed on (iso, name_2) only, so two districts that
    # share a name within one country would be averaged together — confirm that
    # name_2 is unique per iso, or key on id_2 as well.
    district_means = df.groupby(["iso", "name_2"], as_index=False)[value_col].mean()
    district_rank = district_means.rename(columns={value_col: "avg_density"}).sort_values(
        "avg_density", ascending=False
    )
    district_csv = PROC / "district_rank_full.csv"
    district_rank.to_csv(district_csv, index=False)
    print("Saved:", district_csv)

    top_n = 15
    top_d = district_rank.iloc[:top_n].iloc[::-1]  # reversed for plotting
    labels = top_d["name_2"].astype(str) + " (" + top_d["iso"] + ")"

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(labels, top_d["avg_density"])
    ax.set(xlabel="Avg nightlight density", title=f"Top {top_n} districts by average density")
    _savefig(fig, PLOTS / "top_districts.png")


Saved: D:\LightPollutionProject\data\processed\district_rank_full.csv
Saved plot: D:\LightPollutionProject\plots\top_districts.png


In [5]:
# Chunk 4 — Correlation heatmap
# Candidate features that exist in df. Non-numeric ones are filtered out up
# front: corr(numeric_only=True) would otherwise drop them from the matrix
# while the tick labels were still built from the unfiltered list, leaving the
# labels misaligned with the matrix rows/columns.
candidates = [c for c in ["nlsum", "area", "nl_density", "nl_density_clipped"] if c in df.columns]
cols = [c for c in candidates if pd.api.types.is_numeric_dtype(df[c])]
if len(cols) >= 2:
    corr = df[cols].corr(numeric_only=True)
    corr.to_csv(PROC / "correlation_matrix.csv")
    print("Saved correlation matrix:", PROC / "correlation_matrix.csv")

    # Label the axes from the matrix itself so labels and cells cannot diverge.
    tick_labels = list(corr.columns)
    tick_pos = range(len(tick_labels))

    fig, ax = plt.subplots(figsize=(6,5))
    # Correlations lie in [-1, 1]; pin the colour scale to that range so the
    # colours mean the same thing regardless of which values happen to occur.
    im = ax.imshow(corr.values, aspect="auto", vmin=-1, vmax=1)
    ax.set_xticks(tick_pos); ax.set_xticklabels(tick_labels, rotation=45, ha="right")
    ax.set_yticks(tick_pos); ax.set_yticklabels(tick_labels)
    fig.colorbar(im, ax=ax)
    ax.set_title("Correlation heatmap (numeric features)")
    _savefig(fig, PLOTS / "correlation_heatmap.png")
else:
    print("Not enough numeric columns for correlation heatmap. Found:", cols)


Saved correlation matrix: D:\LightPollutionProject\data\processed\correlation_matrix.csv
Saved plot: D:\LightPollutionProject\plots\correlation_heatmap.png


In [6]:
# Chunk 5 — Results summary text (Top 5 countries + districts)
summary_file = PROC / "results_summary.txt"

lines = []

# Top 5 countries by the mean of their yearly averages
top5_countries = (
    country_year.groupby("iso", as_index=False)["avg_density"]
    .mean()
    .sort_values("avg_density", ascending=False)
    .head(5)
)
lines.append("Top 5 countries by avg nightlight density:\n")
# Rank numbers come from the position in the sorted frame. The frame's index
# labels are carried through the sort unchanged, so they do not say where a
# row ended up and must not be used for numbering.
lines.extend(
    f"{rank}. {iso}: {avg:.2f}"
    for rank, (iso, avg) in enumerate(
        zip(top5_countries["iso"], top5_countries["avg_density"]), start=1
    )
)
lines.append("\n")

# Top 5 districts (only when the district name column is available)
if "name_2" in df.columns:
    top5_districts = (
        df.groupby(["iso", "name_2"], as_index=False)[value_col]
        .mean()
        .rename(columns={value_col: "avg_density"})
        .sort_values("avg_density", ascending=False)
        .head(5)
    )
    lines.append("Top 5 districts by avg nightlight density:\n")
    lines.extend(
        f"{rank}. {district} ({iso}): {avg:.2f}"
        for rank, (district, iso, avg) in enumerate(
            zip(top5_districts["name_2"], top5_districts["iso"], top5_districts["avg_density"]),
            start=1,
        )
    )

# Write to file. The encoding is explicit so that names containing non-ASCII
# characters are written the same way on every platform instead of depending
# on the locale's default encoding.
with open(summary_file, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print(f"Saved summary: {summary_file}")


Saved summary: D:\LightPollutionProject\data\processed\results_summary.txt


In [7]:
# Chunk 6 — India-specific trends
india = df.loc[df["iso"].eq("IND")].copy()
if india.empty:
    print("No rows for iso=IND in df")
else:
    # Mean of value_col per year for the IND rows
    india_means = india.groupby("year", as_index=False)[value_col].mean()
    india_year = india_means.rename(columns={value_col: "avg_density"})
    india_csv = PROC / "india_year.csv"
    india_year.to_csv(india_csv, index=False)
    print("Saved:", india_csv)

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(india_year["year"], india_year["avg_density"], marker="o", color="darkgreen")
    ax.set(
        title="India — yearly average nightlight density",
        xlabel="Year",
        ylabel="Avg density",
    )
    _savefig(fig, PLOTS / "india_trend.png")


No rows for iso=IND in df
