In [1]:
# read data
import pandas as pd
from glob import glob

base_dir = "./data/poverty/processed"
features = []
feature_files = glob(base_dir + "/features/features*")
coordinates = pd.read_csv(base_dir + "/features/coordinates.csv")
base_data = pd.read_csv(base_dir + "/base_data.csv")

In [2]:
# clean data
filtered_data = base_data[pd.notnull(base_data.D)]

In [3]:
filtered_data.head(2)

Unnamed: 0,shrid2,state_name,district_name,subdistrict_name,clusters,wave,D,Ycons_raw,urban,tot_p,tot_f,Ylowinc_raw,Ymidinc_raw,Ylowinc,Ymidinc,Ycons
380,11-28-532-04316-569366,andhra pradesh,adilabad,kouthala,KOUTHALA ADILABAD,Control,0.0,20533.59375,0,746.0,0.491957,0.0,0.0,1,1,0
381,11-28-532-04316-569367,andhra pradesh,adilabad,kouthala,KOUTHALA ADILABAD,Control,0.0,17000.207031,0,1209.0,0.479735,0.174468,0.0,0,1,1


#### data interpretation
- shrid2: id
- D: treatment
- Ylowinc, Ymidinc, Ycons: outcome, prefer Ycons
- urban, tot_p, tot_f: covariates, urban is all 0

In [4]:
features = pd.concat([pd.read_csv(f) for f in feature_files])
all_data = pd.merge(filtered_data, features, how="left", on="shrid2")

In [10]:
coords = all_data[["centroid_lat", "centroid_lon"]].to_numpy()

In [5]:
to_save_covariates = all_data[["urban", "tot_p", "tot_f"]]
to_save_covariates["treatment"] = all_data["D"]
to_save_covariates["outcome"] = all_data["Ycons"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_save_covariates["treatment"] = all_data["D"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_save_covariates["outcome"] = all_data["Ycons"]


In [6]:
with open("full_dataset.csv", 'w') as f:
    f.write(to_save_covariates.to_csv())

In [49]:
import os
import numpy as np
from tqdm import tqdm
import torch

if not os.path.exists("features"):
    os.mkdir("features")

for i in tqdm(range(len(all_data))):
    feat = []
    row = all_data.iloc[i]
    for j in range(1, 4001):
        feat.append(row[f"feature_{j}"])
    feat = np.stack(feat)
    feat = torch.from_numpy(feat)
    torch.save(feat, f"features/{i}.pt")

100%|██████████| 6055/6055 [00:40<00:00, 149.71it/s]


make and save graph

In [17]:
import math
from typing import List, Tuple, Dict, Set

Point = Tuple[float, float]  # (lat, lon) in degrees

def haversine_km(a: Point, b: Point) -> float:
    """Great-circle distance between two lat/lon points (km)."""
    lat1, lon1, lat2, lon2 = map(math.radians, [a[0], a[1], b[0], b[1]])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    h = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    return 2 * 6371.0088 * math.asin(math.sqrt(h))  # Earth radius (km)

def build_threshold_graph(points: List[Point], max_km: float) -> Dict[int, Set[int]]:
    """Adjacency list where nodes i,j connect if distance <= max_km."""
    n = len(points)
    adj: Dict[int, Set[int]] = {i: set() for i in range(n)}
    for i in range(n):
        for j in range(i + 1, n):
            if haversine_km(points[i], points[j]) <= max_km:
                adj[i].add(j)
                adj[j].add(i)
    return adj

# Example
graph = build_threshold_graph(coords, max_km=5)
print(graph)  # adjacency for each node index

{0: {1, 2, 11}, 1: {0, 11, 2, 3}, 2: {0, 1, 3, 5, 11}, 3: {1, 2, 4, 5, 10, 11, 13}, 4: {10, 3, 5, 6}, 5: {2, 3, 4, 6, 10, 13}, 6: {10, 4, 5, 7}, 7: {8, 9, 6}, 8: {7, 9, 16, 25, 26}, 9: {7, 8, 10, 13, 14, 15, 16}, 10: {3, 4, 5, 6, 9, 13}, 11: {0, 1, 2, 3, 12}, 12: {19, 11, 13}, 13: {3, 5, 9, 10, 12, 14, 15, 19}, 14: {9, 13, 15, 16, 17, 18, 19, 21, 24}, 15: {9, 13, 14, 16, 17, 18, 24}, 16: {8, 9, 14, 15, 17, 24, 25}, 17: {14, 15, 16, 18, 21, 23, 24, 25}, 18: {14, 15, 17, 19, 20, 21, 22, 24}, 19: {12, 13, 14, 18, 20}, 20: {18, 19, 21}, 21: {14, 17, 18, 20, 22, 23, 24}, 22: {18, 21, 23, 24, 28, 29}, 23: {17, 21, 22, 24, 25, 29, 30, 31}, 24: {14, 15, 16, 17, 18, 21, 22, 23, 25}, 25: {8, 16, 17, 23, 24, 26, 27, 31}, 26: {8, 25, 27}, 27: {25, 26}, 28: {22}, 29: {30, 22, 23}, 30: {42, 43, 23, 29, 31}, 31: {32, 42, 23, 25, 30}, 32: {31}, 33: {34}, 34: {33, 35}, 35: {40, 34, 36, 37}, 36: {40, 35, 37, 38}, 37: {35, 36, 38, 39}, 38: {36, 37, 62, 39}, 39: {37, 38, 46, 47, 61, 62}, 40: {41, 35, 36},

In [18]:
with open("graph_adj.txt", 'w') as f:
    for k, vs in graph.items():
        f.write("{} {}\n".format(k, " ".join([str(x) for x in vs])))