In [None]:
import pandas as pd
import numpy as np
import os
import geoio

In [None]:
# Data locations. Each base directory can be overridden through an environment
# variable so the notebook can run on another machine; when the variable is
# unset the original absolute paths are used unchanged.
BASE_DIR1 = os.environ.get(
    'MALI_DATA_DIR',
    '/media/sandesh/DATA/ENGINEERING/5th SEM/Minor Project/Mali',
)  # directory holding the Mali survey CSVs and the processed output
BASE_DIR = os.environ.get(
    'POVERTY_REPO_DIR',
    '/home/sandesh/Documents/Minor Project/predicting-poverty-replication',
)  # added to sys.path below and used as the root for the nightlights raster
# List of nightlights GeoTIFF files to open (currently a single tile).
NIGHTLIGHTS_DIRS = [os.path.join(BASE_DIR, 'data/nightlights', 'viirs_2015_75N060W.tif')]

# COUNTRIES_DIR = os.path.join(BASE_DIR1, 'nigeria_2015')

In [None]:
# Put BASE_DIR on the module search path before importing `utils`.
# NOTE(review): `utils` presumably lives in the replication repository at
# BASE_DIR - confirm. The append must stay ahead of the import below.
import sys
sys.path.append(BASE_DIR)
# `create_space` is used by add_nightlights to get a lat/lon bounding box
# around each cluster coordinate.
from utils import create_space

In [None]:
def process_mali(data_dir=None, ppp=214.013):
    """Aggregate the Mali household survey into per-cluster consumption.

    Reads the household consumption CSV and the geovariables CSV from
    ``data_dir``, joins them on the ``grappe`` column, and sums household
    consumption and household size over each unique (latitude, longitude)
    pair. Rows with any missing value after the join are dropped.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``eaci2015_agregatconso.csv`` and
        ``eaci_geovariables_2015.csv``. Defaults to the module-level
        ``BASE_DIR1``.
    ppp : float, optional
        Purchasing-power-parity divisor applied to household consumption.
        NOTE(review): the original comment cited the World Bank series for
        Nigeria (https://data.worldbank.org/indicator/PA.NUS.PRVT.PP?locations=NG)
        although this function processes Mali data - confirm that 214.013 is
        the intended 2015 value for Mali.

    Returns
    -------
    pandas.DataFrame
        One row per cluster coordinate with the columns ``country`` (always
        ``'mli'``), ``cluster_lat``, ``cluster_lon`` and ``cons_pc`` (total
        cluster consumption divided by total cluster household members).
    """
    if data_dir is None:
        data_dir = BASE_DIR1

    consumption_file = 'eaci2015_agregatconso.csv'
    consumption_pc_col = 'pcexp'  # per capita
    hhsize_col = 'hhsize'  # people in household

    geovariables_file = 'eaci_geovariables_2015.csv'
    lat_col = 'lat_dd_mod'
    lon_col = 'lon_dd_mod'

    df = pd.read_csv(os.path.join(data_dir, consumption_file))
    # Household total = per-capita consumption * household size, converted with
    # the PPP factor and divided by 365 to get a per-day figure
    # (assumes `pcexp` is an annual amount - TODO confirm).
    households = pd.DataFrame({
        'grappe': df['grappe'],
        'cons_ph': df[consumption_pc_col] * df[hhsize_col] / ppp / 365,
        'pph': df[hhsize_col],
    })

    df_geo = pd.read_csv(os.path.join(data_dir, geovariables_file))
    # rename() without inplace returns a new frame, so nothing is written to a
    # slice of df_geo (the original in-place rename raised
    # SettingWithCopyWarning).
    df_cords = df_geo[['grappe', lat_col, lon_col]].rename(
        columns={lat_col: 'cluster_lat', lon_col: 'cluster_lon'}
    )

    df_combined = (
        pd.merge(households, df_cords, on='grappe')
        .drop(columns=['grappe'])
        .dropna()  # can't use na values
    )

    df_clusters = df_combined.groupby(['cluster_lat', 'cluster_lon']).sum().reset_index()
    # Per-capita value: total cluster consumption divided by total cluster members.
    df_clusters['cons_pc'] = df_clusters['cons_ph'] / df_clusters['pph']
    df_clusters['country'] = 'mli'
    return df_clusters[['country', 'cluster_lat', 'cluster_lon', 'cons_pc']]

In [None]:
# Build the Mali cluster table: one row per unique (cluster_lat, cluster_lon).
df_mli = process_mali()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [None]:
# Sanity check: (number of clusters, number of columns) in the processed table.
df_mli.shape

(824, 4)

In [None]:
# Open each nightlights raster path as a geoio image and show its metadata.
tifs = list(map(geoio.GeoImage, NIGHTLIGHTS_DIRS))
print(tifs)

[Class Name        : GeoImage
Driver Name       : GTiff
Data Type         : Float32
File Name         : /home/sandesh/Documents/Minor Project/predicting-poverty-
                    replication/data/nightlights/viirs_2015_75N060W.tif
File List         : ['/home/sandesh/Documents/Minor Project/predicting-poverty-
                    replication/data/nightlights/viirs_2015_75N060W.tif']
Dimensions        : (1, 28800, 18000) (nlayers, nrows, ncols)
Resolution        : (0.0041666667, 0.0041666667) (x,y)
Extent            : (-60.00208333335, 75.00208333335, 59.99791762665001,
                    0.0020827333499937595) (ul_x, ul_y, lr_x, lr_y)
Projection String : GEOGCS["WGS 84",
                     DATUM["WGS_1984",
                         SPHEROID["WGS 84",6378137,298.257223563,
                             AUTHORITY["EPSG","7030"]],
                         AUTHORITY["EPSG","6326"]],
                     PRIMEM["Greenwich",0],
                     UNIT["degree",0.0174532925199433,
     

In [None]:
# Load the first raster into memory and drop its length-1 layer axis.
tif_array = np.squeeze(tifs[0].get_data())
# Size of axis 1, which add_nightlights uses as the upper bound for x pixels.
tif_array.shape[1]

28800

In [None]:
def add_nightlights(df, tif, tif_array):
    """Add a ``nightlights`` column holding the mean raster value per cluster.

    For every row, ``create_space`` supplies a bounding box
    ``(min_lat, min_lon, max_lat, max_lon)`` around the cluster coordinate.
    The box corners are converted to pixel coordinates with
    ``tif.proj_to_raster(lon, lat)`` and the mean of ``tif_array`` over that
    pixel window is recorded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cluster_lat`` and ``cluster_lon`` columns. It is
        modified in place: a ``nightlights`` column is added or overwritten.
    tif : object
        Raster handle exposing ``proj_to_raster(x, y)`` which returns an
        ``(x_pixel, y_pixel)`` pair.
    tif_array : numpy.ndarray
        2-D raster values indexed as ``[y_pixel, x_pixel]``.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If the converted pixel bounds are not strictly ordered.
    ValueError
        If the pixel window falls outside ``tif_array``; ``df`` is left
        unchanged in that case because the column is only assigned after
        every row has been processed.
    """
    n_rows, n_cols = tif_array.shape[0], tif_array.shape[1]
    cluster_nightlights = []
    # Only the two coordinate columns are needed, so iterate over them directly
    # instead of materialising a Series per row with iterrows().
    for cluster_lat, cluster_lon in zip(df['cluster_lat'], df['cluster_lon']):
        min_lat, min_lon, max_lat, max_lon = create_space(cluster_lat, cluster_lon)

        # The minimum latitude is paired with the maximum y pixel (and vice
        # versa): the asserts below require y to shrink as latitude grows.
        xminPixel, ymaxPixel = tif.proj_to_raster(min_lon, min_lat)
        xmaxPixel, yminPixel = tif.proj_to_raster(max_lon, max_lat)
        assert xminPixel < xmaxPixel, (
            f"x pixel bounds not increasing for {cluster_lat}, {cluster_lon}"
        )
        assert yminPixel < ymaxPixel, (
            f"y pixel bounds not increasing for {cluster_lat}, {cluster_lon}"
        )

        x_outside = xminPixel < 0 or xmaxPixel >= n_cols
        y_outside = yminPixel < 0 or ymaxPixel >= n_rows
        if x_outside or y_outside:
            raise ValueError(f"no match for {cluster_lat}, {cluster_lon}")

        # Truncate to whole pixel indices for slicing. NOTE(review): if both
        # bounds truncate to the same index the window is empty and mean()
        # yields NaN - confirm create_space always spans at least one pixel.
        xminPixel, yminPixel = int(xminPixel), int(yminPixel)
        xmaxPixel, ymaxPixel = int(xmaxPixel), int(ymaxPixel)
        cluster_nightlights.append(
            tif_array[yminPixel:ymaxPixel, xminPixel:xmaxPixel].mean()
        )

    df['nightlights'] = cluster_nightlights

In [None]:
# Adds a 'nightlights' column to df_mli in place (the call returns nothing).
add_nightlights(df_mli, tifs[0], tif_array)

In [None]:
# Preview the first ten clusters together with their nightlights value.
df_mli.head(10)

Unnamed: 0,country,cluster_lat,cluster_lon,cons_pc,nightlights
0,mli,10.315847,-6.105348,2.159553,0.0
1,mli,10.40465,-5.891725,0.778903,0.0
2,mli,10.466931,-7.938471,2.292946,0.0
3,mli,10.513494,-5.978286,3.250503,0.0
4,mli,10.535882,-8.152684,3.772908,0.0
5,mli,10.550003,-5.757343,5.465028,0.130248
6,mli,10.551897,-6.905822,7.613439,0.0
7,mli,10.55698,-5.757505,4.761089,0.13645
8,mli,10.604713,-6.00856,2.648353,0.0
9,mli,10.605035,-5.768494,3.407347,0.035758


In [None]:
# Write the cluster table to <BASE_DIR1>/processed/clusters.csv.
# to_csv does not create missing directories, so make sure the folder exists.
processed_dir = os.path.join(BASE_DIR1, 'processed')
os.makedirs(processed_dir, exist_ok=True)
df_mli.to_csv(os.path.join(processed_dir, 'clusters.csv'), index=False)