In [1]:
import geopandas as gpd
from geopandas import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Read in the Data Frame from Stata files and Clean the Dataframes

In [2]:
# Root folder of the IFLS project. Later cells append file names directly to it
# (e.g. "MasterTrack2.dta", "Temp Files/..."), so it must keep its trailing "/".
# Every other path is derived from it so the location is edited in one place.
Path_IFLS = (
    "/Users/idiosyncrasy58/Dropbox/Documents/College/"
    "Universitat Autonoma de Barcelona/IDEA - Economics/"
    "Doctoral Thesis Ideas/Migration/IFLS/Project Files/"
)

# IPUMS census wage data (Stata file)
Path_IPUMS = Path_IFLS + "IPUMS/Project Files/Census Wage Data.dta"

# Master path: folder with the shape files keyed by Indonesian BPS codes
Master_path = Path_IFLS + (
    "Maps/"
    "Python Visualization of Indonesian Statistics/Visualizations/"
    "Creation of Shape Files and Visualizations/"
    "Shape Files with Indonesian BPS Codes"
)

### IPUMS

In [3]:
# Read the IPUMS census extract from Stata and clean it

cols_keep = ["year", "r_wage_hr", "kabmov", "provmov", "MaxSchYrs", "perwt", "age"]

# The running row number is kept as the "serial" column
Census_Data = ( pd.read_stata(Path_IPUMS, convert_categoricals=False)[cols_keep]
                  .rename(columns={'provmov':'Prov Code','kabmov':'Kab Code'})
                  .reset_index()
                  .rename(columns={'index':'serial'}) )

# Recategorize MaxSchYrs to 13 if it is greater than 12 years (at least some college)
# (.loc replaces .ix, which was removed in pandas 1.0)
Census_Data.loc[Census_Data["MaxSchYrs"] > 12.0, "MaxSchYrs"] = 13.0

# Strip the province number that precedes the Kab Code by keeping the last two
# digits. The previous string/regex approach (code/100 -> text after the ".")
# dropped trailing zeros: 1110 -> "11.1" -> 1 instead of 10.
Census_Data["Kab Code"] = (Census_Data["Kab Code"] % 100).astype('int8')

# Change age to int; rows without a finite age are dropped first.
# .copy() makes the filtered frame independent so the assignment below does
# not write into a slice (SettingWithCopyWarning).
Census_Data = Census_Data[np.isfinite(Census_Data["age"])].copy()
Census_Data["age"] = Census_Data["age"].astype('int8')

# Separate the children (age < 15), leaving out province code 54
Census_Data_Child = Census_Data[(Census_Data.age < 15) & (Census_Data["Prov Code"] != 54)]

# Adults: drop the 1976 round and anyone younger than 15, then drop the age column.
# NOTE(review): the earlier comment said "drop if year is not 1995", but the
# filter only excludes 1976 -- confirm which one is intended.
Census_Data = ( Census_Data.loc[(Census_Data.year != 1976) & (Census_Data.age >= 15)]
                           .drop(columns='age') )

### IFLS

#### Tracker File of individuals

In [4]:
cols_keep = ["wave", "pwt", "pidlink2", "provmov", "kabmov", "age", "MaxSchYrs", "flag_LastWave", "flag_OutSch", "flag_NotInSch"]
col_rename = {'wave':'year','pwt':'perwt','pidlink2':'serial','provmov':'Prov Code','kabmov':'Kab Code'}

Tracker = ( pd.read_stata(Path_IFLS+"MasterTrack2.dta", convert_categoricals=False)[cols_keep]
              .rename(columns=col_rename) )

# Built as one chain so every step returns a new frame: the old in-place drop
# and column assignment wrote into a filtered slice (SettingWithCopyWarning).
Tracker = (
    # Keep rows flagged as last wave whose person is flagged as out of school
    # or as never having been in school
    Tracker.loc[(Tracker["flag_LastWave"]==1) &
                ((Tracker["flag_OutSch"]==1) | (Tracker["flag_NotInSch"]==1))]
           # The flags are only needed for the filter above
           .drop(columns=["flag_LastWave", "flag_OutSch", "flag_NotInSch"])
           # Store identifiers and age as ints rather than doubles
           .astype({"serial": int, "Prov Code": int, "Kab Code": int, "age": int})
)

#### Wages of Individuals: Adults

In [5]:
cols_keep = ["year", "pidlink2", "r_wage_hr","job","age","provmov"]
col_rename = {'pidlink2':'serial',"provmov":"Prov Code"}

Wages = ( pd.read_stata(Path_IFLS+"Temp Files/Wage Database1.dta", convert_categoricals=False)[cols_keep]
            .rename(columns=col_rename) )

# Keep only rows where age, province, serial and wage are all finite, so the
# integer casts below cannot fail on missing values
Wages = Wages[np.isfinite(Wages[["age", "Prov Code", "serial", "r_wage_hr"]]).all(axis=1)]

# Cast, filter and drop in one chain: each step returns a new frame, so nothing
# is assigned into a filtered slice (the old column assignment raised
# SettingWithCopyWarning).
Wages = ( Wages.astype({"age": int, "Prov Code": int, "serial": int})
               .loc[lambda frame: frame["job"] != 2]   # drop if job==2
               .drop(columns="job") )

# Children: everyone below the age of 15. Taken as an independent copy because
# a later cell adds a column to this frame.
Wages_Child = Wages[Wages.age < 15].copy()

# Adults: remove the children, then drop the age and province columns
Wages = Wages.loc[Wages.age >= 15].drop(columns=["age", "Prov Code"])

##### Clean Adult Wages

First, make sure that we have only one yearly observation per person (so get the mean across year-persons)

In [6]:
# Collapse to a single wage per person-year by averaging repeated observations
Wages = (
    Wages.groupby(["serial", "year"], as_index=False)["r_wage_hr"]
         .mean()
)

Now find those people whose last observation doesn't have a wage in it. We will fill it in with a collapse by the last observed wage and place it in the last observed year. Those who already have a wage in their last observed year keep that wage (the following code is a way to create dummy variables based on logical conditions).

In [7]:
# 1 on the last row of each run of equal serials (the next row has a different
# serial, or there is no next row); relies on rows being ordered by serial.
Wages['flag_LastObs'] = Wages.serial.ne(Wages.serial.shift(-1)).astype(int)

# 1 when that last row has a missing wage.
# NOTE(review): rows with a non-finite r_wage_hr are filtered out when Wages is
# loaded, so this flag looks like it can never be 1 -- confirm whether that
# upstream filter is intended.
Wages['flag_miss_LastWage'] = (
    Wages.flag_LastObs.eq(1) & Wages['r_wage_hr'].isna()
).astype(int)

Now we will keep in a new dataframe all those people who had a missing value in the position of their last observation, and use the merge (an m:1 merge) as a trick to identify all their rows. We will then collapse after a groupby so that their last observed year is filled in with a wage if they had a wage in the preceding year(s).

In [8]:
# People whose last observation has a missing wage (one row per flagged person).
# .loc replaces .ix, which was removed in pandas 1.0.
Wages_Missing = Wages.loc[Wages["flag_miss_LastWage"]==1, ["serial","flag_miss_LastWage"]]

# Drop flags from main dataframe
Wages = Wages.drop(columns=["flag_miss_LastWage", "flag_LastObs"])

# The inner merge on serial pulls in every row of the flagged people; after the
# groupby, .last() keeps the last non-null value of each column per person.
Wages_Missing = ( Wages.merge(Wages_Missing, how='inner', on="serial")
                       .groupby("serial", as_index=False)
                       .last()
                       .drop(columns="flag_miss_LastWage") )

In [9]:
# Replace the years of last observations to get only the wave years
# (1993, 1997, 2000, 2007). .loc replaces .ix, which was removed in pandas 1.0.
Wages_Missing.loc[Wages_Missing["year"] == 1992, "year"] = 1993
Wages_Missing.loc[Wages_Missing["year"].isin([1995, 1996]), "year"] = 1997
Wages_Missing.loc[Wages_Missing["year"] == 1999, "year"] = 2000
Wages_Missing.loc[Wages_Missing["year"].isin([2006, 2008]), "year"] = 2007

# Drop rows with any NaN value (no need to keep these people)
Wages_Missing = Wages_Missing.dropna()

Now we will update the Wages dataframe with the nonmissing values from the Wages_Missing dataframe. First we must set the year and the serial number as the index; otherwise the update is done on the default index values (which, as we know, are just the integers 0 to len(dataframe)-1), and some people's data would update others' data based not on serial number but on row position.

In [10]:
# Key both frames by (serial, year) so the update in the next cell aligns on
# person-year rather than on the positional row index
Wages = Wages.set_index(["serial", "year"])
Wages_Missing = Wages_Missing.set_index(["serial", "year"])

In [11]:
# Overwrite cells of Wages in place with the non-NaN values of Wages_Missing
# wherever index and column labels match, then release the helper frame
Wages.update(other=Wages_Missing)
del Wages_Missing

#### Merge Wages into Tracker File

In [12]:
# Left-join the wages onto the tracker rows, matching on the (serial, year)
# index of both frames; tracker rows without a wage are kept
Wages = pd.merge(
    Tracker.set_index(["serial", "year"]),
    Wages,
    how='left',
    left_index=True,
    right_index=True,
    copy=False,
)

Read in the IFLS East wage data set to update values in Tracker data

In [13]:
# Columns are spelled out instead of sliced from the previous cols_keep, so the
# cell no longer depends on which earlier cell last assigned that list
cols_keep = ["year", "pidlink2", "r_wage_hr"]

# Read the file, rename the person id, store the keys as ints and index by
# (serial, year) so the frame lines up with Wages for the update below
Wages_2012 = ( pd.read_stata(Path_IFLS+"Temp Files/2012 Wage Current.dta")[cols_keep]
                 .rename(columns={'pidlink2': 'serial'})
                 .astype({"serial": int, "year": int})
                 .set_index(["serial", "year"]) )

Update the Tracker data

In [14]:
# Overwrite cells of Wages in place with the non-NaN values of Wages_2012
# wherever index and column labels match, then release the helper frame
Wages.update(other=Wages_2012)
del Wages_2012

#### Append IPUMS File

Append the IPUMS file and then generate the log hourly wages (erasing the real wages)

In [15]:
# Stack the census rows under the IFLS rows with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
Wages = pd.concat([Wages.reset_index(), Census_Data], ignore_index=True)
del Census_Data

In [16]:
# Swap the real hourly wage for its natural log
Wages = (
    Wages.assign(ln_wage_hr=np.log(Wages["r_wage_hr"]))
         .drop(columns="r_wage_hr")
)

  if __name__ == '__main__':


Correct the provinces that split after 1993

In [17]:
# Recode the split-off province codes to the code of the province they came from.
# .loc replaces .ix, which was removed in pandas 1.0.
Wages.loc[Wages["Prov Code"] == 94, "Prov Code"] = 91
Wages.loc[Wages["Prov Code"] == 82, "Prov Code"] = 81
Wages.loc[Wages["Prov Code"] == 76, "Prov Code"] = 73
Wages.loc[Wages["Prov Code"] == 36, "Prov Code"] = 32

# Leave out province code 54
Wages = Wages.loc[Wages["Prov Code"] != 54]

# Generate the Provincial Median Wages and the Average Schooling Years

### Provincial Median Wages and Average Schooling

In [18]:
# Statistic per column: plain (unweighted) mean of schooling years and median
# of log wages. A perwt-weighted schooling mean was sketched here earlier but
# is not applied.
function = dict(MaxSchYrs='mean', ln_wage_hr='median')

# One row per province
Avg_Prov = (
    Wages.groupby("Prov Code", as_index=False)[["MaxSchYrs", "ln_wage_hr"]]
         .agg(function)
)

In [19]:
# Convert the median log wage back to levels and drop the log column
Avg_Prov = (
    Avg_Prov.assign(r_wage_hr=np.exp(Avg_Prov["ln_wage_hr"]))
            .drop(columns="ln_wage_hr")
)

# Import Shape Files

In [20]:
# Load the province shape file. gpd.read_file is the reader that
# GeoDataFrame.from_file wraps; calling it through the explicit `gpd` import
# avoids relying on `from geopandas import *` for the GeoDataFrame name.
Ind_Prov = gpd.read_file(Master_path+"/Prov_merge/Prov_merge.shp")

### Merge in the statistical information into the Geopandas Dataframes

In [21]:
# Attach the adult statistics to every province shape; shapes without a match
# keep NaN statistics
Ind_Prov_Adult = Ind_Prov.merge(Avg_Prov, how='left', on="Prov Code")

In [22]:
# Fix missing values based on the regrouping of provinces to mimic the 1993
# IFLS Indonesia (these provinces had data but were folded into the parent
# province from which they came).
#
# Rows are addressed by the integer row labels of the merged frame.
# .loc replaces .ix, which was removed in pandas 1.0.

# Hard-coded schooling value for row 19; its provenance is not recorded here
Ind_Prov_Adult.loc[19, "MaxSchYrs"] = 5.550605

# (row to fill, row to copy from)
for target_row, source_row in [(15, 11), (28, 25), (30, 29), (32, 31)]:
    for stat in ("r_wage_hr", "MaxSchYrs"):
        Ind_Prov_Adult.loc[target_row, stat] = Ind_Prov_Adult.loc[source_row, stat]

# Drop every row that still has a missing value in any column
Ind_Prov_Adult = Ind_Prov_Adult.dropna()

In [23]:
# Display the merged adult frame (province shapes with schooling and wage columns)
Ind_Prov_Adult

Unnamed: 0,Prov Code,Province,geometry,MaxSchYrs,r_wage_hr
0,11,Aceh,"(POLYGON ((95.1129443594893 5.566434697270201,...",9.51521,2.16414
1,12,Sumatera Utara,(POLYGON ((98.56304499441775 1.610842788660926...,8.292707,1.747396
2,13,Sumatera Barat,(POLYGON ((100.3340171959681 -3.28001314087794...,8.507731,1.990209
3,14,Riau,(POLYGON ((102.4940984916833 1.129889926323819...,9.32177,2.561491
4,15,Jambi,(POLYGON ((104.2475432565595 -1.03299575925923...,8.672121,2.107741
5,16,Sumatera Selatan,(POLYGON ((104.567944798962 -1.751620662897045...,7.491643,1.592167
6,17,Bengkulu,(POLYGON ((101.3849622989467 -2.34334692501442...,9.682692,2.195564
7,18,Lampung,(POLYGON ((105.5067143072264 -5.92385168879810...,7.193323,1.324909
8,19,Kep. Bangka Belitung,(POLYGON ((107.5251739146149 -3.19027266510658...,7.78169,2.388251
9,21,Kep. Riau,(POLYGON ((103.4574064112914 0.640523894383146...,9.308511,2.29272


# Plot the Choropleths

### Wages

In [24]:
%matplotlib osx

( Ind_Prov_Adult.plot(column="r_wage_hr", cmap='Blues', scheme="quantiles", linewidth=0.1, legend=True)
         .axis('off') 
)
plt.title('Median Hourly Wages of Adults (INT$, Age ≥ 15 yrs)')

<matplotlib.text.Text at 0x10d563fd0>

### Schooling

In [25]:
%matplotlib osx

( Ind_Prov_Adult.plot(column="MaxSchYrs", cmap='Reds', scheme="quantiles", linewidth=0.1, legend=True)
         .axis('off') 
)
plt.title('Average Schooling of Adults (Years)')

<matplotlib.text.Text at 0x10f639518>

# Wages and Schooling of Children

Find those in child wages with duplicated wages and collapse by person and province

In [26]:
# Dup = 1 for every row whose serial occurs more than once (keep=False marks
# all occurrences). .assign returns a new frame instead of writing a column
# into a filtered slice of Wages (SettingWithCopyWarning).
Wages_Child = Wages_Child.assign(
    Dup=Wages_Child.duplicated(subset="serial", keep=False).astype('int')
)

In [27]:
# Rows of children observed more than once (to be collapsed in the next cell) ...
Wages_Child_Coll = Wages_Child[Wages_Child["Dup"].eq(1)].drop(columns=["Dup", "year"])
# ... and rows of children observed exactly once
Wages_Child = Wages_Child[Wages_Child["Dup"].ne(1)].drop(columns=["Dup", "year"])

In [28]:
# Average the remaining columns within each (province, child) pair
Wages_Child_Coll = (
    Wages_Child_Coll.groupby(["Prov Code", "serial"], as_index=False)
                    .mean()
)

In [29]:
# Put the collapsed rows back under the single-observation rows with a fresh index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
Wages_Child = pd.concat([Wages_Child, Wages_Child_Coll], ignore_index=True)

In [30]:
# Release the intermediate frame; its rows are already part of Wages_Child
del Wages_Child_Coll

Merge in the schooling of child wages

In [31]:
# Bring MaxSchYrs from the tracker onto each child row, matching on serial;
# children without a tracker row keep NaN
Wages_Child = Wages_Child.merge(
    Tracker[["serial", "MaxSchYrs"]],
    how='left',
    on='serial',
    copy=False,
)

Append the Child Wages from IPUMS

In [32]:
# Stack the census children under the IFLS children; the original row labels
# of both frames are kept, as before.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
Wages_Child = pd.concat(
    [Wages_Child,
     Census_Data_Child.loc[:, ["serial", "r_wage_hr", "MaxSchYrs", "age", "Prov Code"]]]
)
del Census_Data_Child

Create log wages

In [33]:
# Swap the real hourly wage for its natural log
Wages_Child = (
    Wages_Child.assign(ln_wage_hr=np.log(Wages_Child["r_wage_hr"]))
               .drop(columns="r_wage_hr")
)

  if __name__ == '__main__':


Collapse provinces according to 1993 administration designation of provinces

In [34]:
# Collapse provinces to the 1993 admin regions of Indonesia:
# {code to replace: code it is folded into}
for split_code, parent_code in {94: 91, 82: 81, 76: 73, 36: 32}.items():
    Wages_Child.loc[Wages_Child["Prov Code"] == split_code, "Prov Code"] = parent_code

Aggregate variables by Provinces

In [35]:
# Per-province statistics: mean schooling and age, median log wage, and the
# number of non-null serials (used as the observation count)
function = dict(MaxSchYrs='mean', age='mean', ln_wage_hr='median', serial='count')
Avg_Prov_Child = Wages_Child.groupby("Prov Code", as_index=False).agg(function)

# Median wage back in levels
Avg_Prov_Child = Avg_Prov_Child.assign(r_wage_hr=np.exp(Avg_Prov_Child["ln_wage_hr"]))

Drop provinces where there is only one observation of a wage

In [36]:
# Keep provinces with a serial count above one, then drop the helper columns
Avg_Prov_Child = (
    Avg_Prov_Child.loc[Avg_Prov_Child["serial"] > 1]
                  .drop(columns=["age", "serial", "ln_wage_hr"])
)

Merge into the geopandas dataframe

In [37]:
# Attach the child statistics to every province shape; shapes without a match
# keep NaN statistics
Ind_Prov_Child = Ind_Prov.merge(Avg_Prov_Child, how='left', on="Prov Code")

In [38]:
# Fix missing values based on the regrouping of provinces to mimic the 1993
# IFLS Indonesia (these provinces had data but were folded into the parent
# province from which they came).

# Index by province code so rows can be addressed by code
Ind_Prov_Child = Ind_Prov_Child.set_index("Prov Code")

# (province code to fill, province code to copy from).
# .loc replaces .ix, which was removed in pandas 1.0. The former tuple key
# (74, 76) is written as two explicit pairs, so each assignment is an
# unambiguous scalar label lookup.
for target_code, source_code in [(19, 18), (21, 14), (36, 32), (74, 73),
                                 (76, 73), (82, 81), (94, 91)]:
    for stat in ("r_wage_hr", "MaxSchYrs"):
        Ind_Prov_Child.loc[target_code, stat] = Ind_Prov_Child.loc[source_code, stat]

# Drop every row that still has a missing value in any column
Ind_Prov_Child = Ind_Prov_Child.dropna()

## Wages

In [39]:
%matplotlib osx
( Ind_Prov_Child.plot(column="r_wage_hr", cmap='Blues', scheme="quantiles", linewidth=0.1, legend=True)
         .axis('off')
)
plt.title('Median Hourly Wages of Children (INT$, Age < 15 yrs)')

<matplotlib.text.Text at 0x119481f60>

## Schooling

In [40]:
%matplotlib osx
( Ind_Prov_Child.plot(column="MaxSchYrs", cmap='Reds', scheme="quantiles", linewidth=0.1, legend=True)
         .axis('off') 
)
plt.title('Average Schooling of Children Who Worked (Years)')

<matplotlib.text.Text at 0x11c2d7d30>