# JSON / World Bank Exercise
## 1) Find the 10 countries with most projects

In [2]:
#Import packages necessary for entire exercise.
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize

In [3]:
#For the first question, read the JSON file directly into a pandas DataFrame: df
#(the file is addressed by a relative path).
df = pd.read_json('data/world_bank_projects.json')

In [4]:
#Examine the structure of the DataFrame: number of entries, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Data columns (total 50 columns):
_id                         500 non-null object
approvalfy                  500 non-null int64
board_approval_month        500 non-null object
boardapprovaldate           500 non-null object
borrower                    485 non-null object
closingdate                 370 non-null object
country_namecode            500 non-null object
countrycode                 500 non-null object
countryname                 500 non-null object
countryshortname            500 non-null object
docty                       446 non-null object
envassesmentcategorycode    430 non-null object
grantamt                    500 non-null int64
ibrdcommamt                 500 non-null int64
id                          500 non-null object
idacommamt                  500 non-null int64
impagency                   472 non-null object
lendinginstr                495 non-null object
lendinginstrtype            495 non

In [5]:
#The frame has 50 columns, so raise the display limit to 50 to keep every column visible when rows are shown.
pd.options.display.max_columns = 50

In [6]:
#The 'countryshortname' column looks suitable for counting how often each country appears across projects.
#Before counting, confirm that no project name occurs more than once.
df['project_name'].is_unique

True

In [7]:
#Tally the rows per country and show the ten most frequent.
(
    df['countryshortname']
    .value_counts()
    .head(10)
)

China                 19
Indonesia             19
Vietnam               17
India                 16
Yemen, Republic of    13
Bangladesh            12
Nepal                 12
Morocco               12
Africa                11
Mozambique            11
Name: countryshortname, dtype: int64

In [8]:
#'Africa' is listed as a country! A closer look at the data shows that, strictly speaking, Tanzania
#should be in the top 10, as it also appears twice under the umbrella 'Africa' entries.

## 2) Find the top 10 major project themes (using column 'mjtheme_namecode')

In [9]:
#Column 'mjtheme_namecode' holds, for each project, a list of dictionaries with 'code' and 'name' entries.
#To flatten those lists into one row per theme entry, the raw JSON is first parsed into Python objects
#and json_normalize then expands the 'mjtheme_namecode' records into a DataFrame.
#The file is opened in a 'with' block so the handle is closed as soon as parsing finishes.
with open('data/world_bank_projects.json') as json_file:
    data = json.load(json_file)
theme = json_normalize(data, record_path='mjtheme_namecode')
#NOTE(review): from pandas 1.0 onwards this function is exposed as pd.json_normalize — confirm which pandas version the notebook targets.

In [10]:
#theme.info() together with theme.code.nunique() / theme.name.nunique() reveals a discrepancy: some names are missing.
#To see how far this extends across the 'code' values, combine code and name into a helper column: 'themecon'
theme['themecon'] = theme['code'] + ' : ' + theme['name']
#and count the rows for each combination, largest first.
theme.groupby('themecon').size().sort_values(ascending=False)

themecon
11 : Environment and natural resources management    223
10 : Rural development                               202
8 : Human development                                197
2 : Public sector governance                         184
6 : Social protection and risk management            158
4 : Financial and private sector development         130
7 : Social dev/gender/inclusion                      119
5 : Trade and integration                             72
9 : Urban development                                 47
1 : Economic management                               33
11 :                                                  27
4 :                                                   16
2 :                                                   15
10 :                                                  14
8 :                                                   13
3 : Rule of law                                       12
7 :                                                   11
6 :                   

In [11]:
#Omissions of 'name' values are widespread.
#To answer the question, only entries that carry a name are counted. The blank names are filtered out
#explicitly here: without the filter, the blank-name groups would take part in the ranking and could
#displace a named theme from the top 10.
#NOTE(review): counting by 'code' alone would also include the unnamed entries — confirm which ranking is wanted.
(
    theme[theme['name'].notna() & theme['name'].ne('')]
    .groupby(['code', 'name'])
    .size()
    .sort_values(ascending=False)
    .head(10)
)

code  name                                        
11    Environment and natural resources management    223
10    Rural development                               202
8     Human development                               197
2     Public sector governance                        184
6     Social protection and risk management           158
4     Financial and private sector development        130
7     Social dev/gender/inclusion                     119
5     Trade and integration                            72
9     Urban development                                47
1     Economic management                              33
dtype: int64

## 3) In 2. above you will notice that some entries have only the code and the name is missing. Create a dataframe with the missing names filled in.

In [12]:
#For easier manipulation, turn the blank strings in the 'name' column into NaN so pandas treats them as missing.
#np.nan is used rather than the np.NaN alias, which NumPy 2.0 removed.
theme = theme.replace('', np.nan)

In [13]:
#Sort by code, then name, with NaN names placed first inside each code, so the blanks sit directly
#above that code's named rows and it is easy to see that the fill has worked.
#The back-fill is done per code group, so a missing name can only be copied from a row with the same code
#(a frame-wide back-fill would borrow the next code's name if a code had no named rows at all).
#.bfill() is used instead of fillna(method='bfill'), which newer pandas releases deprecate.
theme = theme.sort_values(['code', 'name'], na_position='first')
theme['name'] = theme.groupby('code')['name'].bfill()

In [14]:
#Quick look at the first rows of 'theme' to check that the missing names have been filled in for the matching code.
theme.head()

Unnamed: 0,code,name,themecon
212,1,Economic management,1 :
363,1,Economic management,1 :
1024,1,Economic management,1 :
1114,1,Economic management,1 :
1437,1,Economic management,1 :


In [15]:
#Final tidy-up: the helper column 'themecon' created earlier is still present, so drop it.
theme = theme.drop('themecon', axis=1)