First I load the relevant Python libraries, and load the DHS and CDR data

In [49]:
import pandas as pd
import numpy as np
from scipy.stats.stats import pearsonr
import matplotlib.pyplot as plt

# Root directory holding the CDR-derived CSV files.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# configurable so the notebook can be re-run elsewhere.
path = "/Users/JackShipway/Desktop/UCLProject"

# DHS data (read relative to the current working directory, not `path`).
# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper
# is needed.
dhs = pd.read_csv('DHSData.csv')

# CDR data - Activity (one file per administrative level)
activity_adm_1 = pd.read_csv(path+"/CorrelationAnalysis/activity_1.csv")
activity_adm_2 = pd.read_csv(path+"/CorrelationAnalysis/activity_2.csv")
activity_adm_3 = pd.read_csv(path+"/CorrelationAnalysis/activity_3.csv")
activity_adm_4 = pd.read_csv(path+"/CorrelationAnalysis/activity_4.csv")

# CDR data - Entropy (one file per administrative level)
entropy_adm_1 = pd.read_csv(path+"/Project1-Health/entropy_adm_1.csv")
entropy_adm_2 = pd.read_csv(path+"/Project1-Health/entropy_adm_2.csv")
entropy_adm_3 = pd.read_csv(path+"/Project1-Health/entropy_adm_3.csv")
entropy_adm_4 = pd.read_csv(path+"/Project1-Health/entropy_adm_4.csv")

Now I extract the relevant Malaria metrics from the DHS data, and take the mean over each administrative region

In [50]:
def _mean_malaria_by(level_column):
    """Return a frame with one row per value of `level_column` in `dhs`,
    holding that column and the mean of 'MalariaPerPop' for the group."""
    grouped_mean = dhs.groupby(level_column)['MalariaPerPop'].mean()
    return grouped_mean.reset_index()


# Mean MalariaPerPop per region, for each administrative level.
malaria_adm_1 = _mean_malaria_by('Adm_1')
malaria_adm_2 = _mean_malaria_by('Adm_2')
malaria_adm_3 = _mean_malaria_by('Adm_3')
malaria_adm_4 = _mean_malaria_by('Adm_4')

For levels 3 and 4, some regions in the CDR data have no corresponding DHS values, so we remove the CDR data for those regions. At level 4, we also remove DHS regions that have no corresponding CDR data.

In [51]:
# Region identifiers for which the DHS data yields a grouped malaria value.
# Recomputed from `dhs` (rather than read from malaria_adm_3/4) so the result
# does not depend on any filtering already applied to those frames.
dhs_regions_3 = dhs.groupby('Adm_3')['MalariaPerPop'].mean().reset_index()['Adm_3']
dhs_regions_4 = dhs.groupby('Adm_4')['MalariaPerPop'].mean().reset_index()['Adm_4']

# Level 3: regions present in the activity data but absent from DHS are removed
# from both CDR frames. A single vectorised isin() mask per frame replaces the
# per-region loop that re-filtered the frames once for every missing region.
# (Unlike the `!=` comparison, isin() also matches NaN identifiers.)
cdr_only_3 = np.setdiff1d(activity_adm_3['Adm_3'], dhs_regions_3)
activity_adm_3 = activity_adm_3[~activity_adm_3['Adm_3'].isin(cdr_only_3)]
entropy_adm_3 = entropy_adm_3[~entropy_adm_3['Adm_3'].isin(cdr_only_3)]

# Level 4: same treatment for the CDR frames.
cdr_only_4 = np.setdiff1d(activity_adm_4['Adm_4'], dhs_regions_4)
activity_adm_4 = activity_adm_4[~activity_adm_4['Adm_4'].isin(cdr_only_4)]
entropy_adm_4 = entropy_adm_4[~entropy_adm_4['Adm_4'].isin(cdr_only_4)]

# Level 4 only: DHS regions that do not appear in the (already filtered)
# activity data are removed from the DHS averages.
# NOTE(review): no equivalent step exists for level 3 - presumably every DHS
# Adm_3 region has CDR data; confirm, since later cells pair rows by position.
dhs_only_4 = np.setdiff1d(dhs_regions_4, activity_adm_4['Adm_4'])
malaria_adm_4 = malaria_adm_4[~malaria_adm_4['Adm_4'].isin(dhs_only_4)]

For each administrative level, first take a look at the scatter plot to determine outliers

In [84]:
# DHS averages per administrative level (index 0 -> Adm_1, ..., 3 -> Adm_4).
malaria_by_level = [malaria_adm_1, malaria_adm_2, malaria_adm_3, malaria_adm_4]

# Each CDR metric column paired with its per-level frames, in plotting order:
# all activity ('Vol') plots first, then all entropy plots.
cdr_by_metric = [
    ('Vol', [activity_adm_1, activity_adm_2, activity_adm_3, activity_adm_4]),
    ('Entropy', [entropy_adm_1, entropy_adm_2, entropy_adm_3, entropy_adm_4]),
]

# One scatter plot per (metric, level). Rows of the CDR frame and the DHS frame
# are paired by position, so both frames must list the same regions in the
# same order.
for metric, cdr_frames in cdr_by_metric:
    for level, (cdr, malaria) in enumerate(zip(cdr_frames, malaria_by_level), 1):
        plt.scatter(np.array(cdr[metric]), np.array(malaria['MalariaPerPop']))
        # Label each figure so it can be read on its own.
        plt.xlabel(metric)
        plt.ylabel('MalariaPerPop')
        plt.title('%s vs MalariaPerPop - Adm_%d' % (metric, level))
        plt.show()

Remove outliers, and re-plot

In [91]:
# NOTE: despite their names, entropy_outliers_1 / entropy_outliers_2 hold the
# positional indices of the points that are KEPT (entropy strictly between the
# two thresholds); everything outside that band is treated as an outlier.

# Entropy - Adm_1
entropy_1 = np.array(entropy_adm_1['Entropy'])
malaria_1 = np.array(malaria_adm_1['MalariaPerPop'])
entropy_outliers_1 = np.flatnonzero((entropy_1 > 150000) & (entropy_1 < 1500000))
# All points in red, retained points over-plotted in blue.
plt.scatter(entropy_1, malaria_1, c='r')
plt.scatter(entropy_1[entropy_outliers_1], malaria_1[entropy_outliers_1], c='b')
plt.show()
# Retained points only.
plt.scatter(entropy_1[entropy_outliers_1], malaria_1[entropy_outliers_1])
plt.show()

# Entropy - Adm_2
entropy_2 = np.array(entropy_adm_2['Entropy'])
malaria_2 = np.array(malaria_adm_2['MalariaPerPop'])
entropy_outliers_2 = np.flatnonzero((entropy_2 > 40000) & (entropy_2 < 400000))
# Same colour scheme as Adm_1 (the overlay colour was previously unspecified).
plt.scatter(entropy_2, malaria_2, c='r')
plt.scatter(entropy_2[entropy_outliers_2], malaria_2[entropy_outliers_2], c='b')
plt.show()
# Retained points only.
plt.scatter(entropy_2[entropy_outliers_2], malaria_2[entropy_outliers_2])
plt.show()

Compute the log of the entropy, and re-plot

In [97]:
# Log Entropy - Adm_1
log_entropy_1 = np.log(np.array(entropy_adm_1['Entropy']))
malaria_1 = np.array(malaria_adm_1['MalariaPerPop'])
# Positional indices of the points kept: log-entropy strictly between 12 and 14.5.
# Both bounds are applied to the LOG values; the upper bound was previously
# compared against the raw entropy, which made the selection effectively empty.
log_entropy_outliers_1 = np.flatnonzero((log_entropy_1 > 12) & (log_entropy_1 < 14.5))
# All points in red, retained points over-plotted in blue. The overlay uses the
# log-scale selection computed above (it previously reused the raw-scale
# entropy_outliers_1 from the preceding cell, leaving this selection unused).
plt.scatter(log_entropy_1, malaria_1, c='r')
plt.scatter(log_entropy_1[log_entropy_outliers_1], malaria_1[log_entropy_outliers_1], c='b')
plt.show()

# Log Entropy - Adm_2
entropy_2 = np.array(entropy_adm_2['Entropy'])
malaria_2 = np.array(malaria_adm_2['MalariaPerPop'])
plt.scatter(np.log(entropy_2), malaria_2)
plt.show()
# Here the cut-off is applied to the raw entropy (< 1500000), not its log.
log_entropy_outliers_2 = np.where(entropy_adm_2['Entropy'] < 1500000)
plt.scatter(np.log(entropy_2[log_entropy_outliers_2]), malaria_2[log_entropy_outliers_2])
plt.show()

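Compute the Pearson product-moment correlation coefficient (PMCC) between the log of the entropy and MalariaPerPop, using only the points retained after outlier removal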

In [83]:
# Pearson correlation between log-entropy and mean MalariaPerPop for Adm_1 and
# Adm_2, restricted to the indices retained by the outlier step
# (entropy_outliers_1 / entropy_outliers_2). pearsonr reports the correlation
# coefficient together with its p-value.
# print(...) with a single argument gives the same output on Python 2 and also
# parses on Python 3, unlike the bare `print x` statement.
for entropy_frame, malaria_frame, kept in [
        (entropy_adm_1, malaria_adm_1, entropy_outliers_1),
        (entropy_adm_2, malaria_adm_2, entropy_outliers_2)]:
    log_entropy_kept = np.log(np.array(entropy_frame['Entropy'])[kept])
    malaria_kept = np.array(malaria_frame['MalariaPerPop'])[kept]
    print(pearsonr(log_entropy_kept, malaria_kept))

(-0.82190221906172267, 0.0019089460123217345)
(-0.85359046310972087, 0.00082958250817005837)
(-0.44736927319657804, 0.010250183181427001)
(-0.49222544091474013, 0.0042142222527266045)
