In [1]:
from pyspark.sql.functions import *
from pyspark.sql import Window

In [2]:
# Load Data Function
def loadDf(fileName):
    """Load the Delta table stored at ``fileName`` and return it as a DataFrame.

    Args:
        fileName: Location of the Delta table (e.g. a ``dbfs:/...`` path); it is
            passed unchanged to the reader's ``load`` call.

    Returns:
        Whatever ``spark.read.format('delta').load(fileName)`` returns.

    Relies on the notebook-global ``spark`` session being available.
    """
    # The former `header='true'` option was dropped: it is a CSV-reader setting,
    # and the source format here is explicitly 'delta'.
    return spark.read.format('delta').load(fileName)

In [3]:
# Kiva Loans: load the loans table and preview it (column glossary below).
dtKivaLoans = loadDf("dbfs:/user/hive/warehouse/kiva_loans_1_csv")

dtKivaLoans.display(n=40)

# Column glossary:
# - id: Unique ID of the loan
# - funded_amount: Total amount funded for the loan
# - loan_amount: Total amount of the loan requested by borrower
# - activity: The business or economic activity the loan supports
# - sector: Top-level sector the activity belongs to
# - use: The purpose of the loan (e.g. to purchase equipment)
# - country_code: Two-letter ISO country code of country in which loan was disbursed
# - country: Full country name in which loan was disbursed
# - region: Geographic region in which loan was disbursed (can be empty)
# - currency: Original currency of the loan
# - partner_id: ID of the field partner organization (can be empty)
# - posted_time: The date and time at which the loan was posted on Kiva
# - disbursed_time: The date and time at which the loan was disbursed by the field partner
# - funded_time: The date and time at which the loan was fully funded by Kiva lenders
# - term_in_months: Loan term (duration) in months
# - lender_count: The number of Kiva lenders who funded the loan
# - tags: Comma-separated list of tags describing the loan
#         (e.g. "#Elderly, #Woman Owned Biz"); can be empty
# - borrower_genders: Comma-separated list with one gender per borrower (e.g. "female, female")
# - repayment_interval: Frequency at which the loan is repaid (e.g. monthly)
# - date: Calendar date; in the sample rows it equals the date part of posted_time (TODO confirm)


# Kiva MPI Region Locations: load the table of MPI values per location.
dtKivaMpiRegionLocations = loadDf("dbfs:/user/hive/warehouse/kiva_mpi_region_locations")

# Column glossary:
# - LocationName: name of the location, formatted "<region>, <country>" in the sample rows
# - ISO: ISO country code for the location (three letters in the sample rows, e.g. "AFG")
# - country: name of the country
# - region: region within the country
# - world_region: name of the world region (e.g. "South Asia")
# - MPI: Multidimensional Poverty Index (MPI) value for the location
# - geo: "(lat, lon)" coordinate pair of the location, stored as text
# - lat: latitude coordinate of the location
# - lon: longitude coordinate of the location

dtKivaMpiRegionLocations.display(n=40)

# Loan Theme Ids: load the table that assigns a loan theme to each loan id.
dtLoanThemeIds = loadDf("dbfs:/user/hive/warehouse/loan_theme_ids")

# Column glossary:
# - id: Identifier of the loan (bigint)
#       presumably the same key as the loans table's id; verify before joining
# - Loan Theme ID: Identifier of the loan theme (string)
# - Loan Theme Type: Type of the loan theme (string), e.g. "General", "Higher Education"
# - Partner ID: Identifier of the partner (double)

dtLoanThemeIds.display(n=40)

# Loan Themes By Region: load the per-partner, per-theme, per-region table.
dtLoanThemesByRegion = loadDf("dbfs:/user/hive/warehouse/loan_themes_by_region")

# Column glossary (names spelled as they appear in the displayed header):
# - Partner ID: Identifier of the partner (bigint)
# - Field Partner Name: Name of the field partner (string)
# - sector: Sector of the loan (string)
# - Loan Theme ID: Identifier of the loan theme (string)
# - Loan Theme Type: Type of the loan theme (string)
# - country: Name of the country (string)
# - forkiva: "forkiva" flag (string); sample rows show "No" (meaning not documented here)
# - region: Region of the location (string)
# - geocode_old: Old geocode information (string); can be empty
# - ISO: ISO code for the location (string)
# - number: Number value (bigint); presumably the number of loans in the row (TODO confirm)
# - amount: Amount of the loan (bigint)
# - LocationName: Name of the location (string)
# - geocode: Geocode information (string)
# - names: Names information (string)
# - geo: Geography information (string), a "(lat, lon)" pair in the sample rows
# - lat: Latitude coordinate of the location (double)
# - lon: Longitude coordinate of the location (double)
# - mpi_region: Region of the MPI (string)
# - mpi_geo: Geography of the MPI (string)
# - rural_pct: present in the displayed header but not described by the author;
#              presumably the rural percentage (TODO confirm meaning and type)

dtLoanThemesByRegion.display(n=40)

id,funded_amount,loan_amount,activity,sector,use,country_code,country,region,currency,partner_id,posted_time,disbursed_time,funded_time,term_in_months,lender_count,tags,borrower_genders,repayment_interval,date
653051,300.0,300.0,Fruits & Vegetables,Food,"To buy seasonal, fresh fruits to sell.",PK,Pakistan,Lahore,PKR,247.0,2014-01-01 06:12:39+00:00,2013-12-17 08:00:00+00:00,2014-01-02 10:06:32+00:00,12.0,12.0,,female,irregular,2014-01-01
653053,575.0,575.0,Rickshaw,Transportation,to repair and maintain the auto rickshaw used in their business.,PK,Pakistan,Lahore,PKR,247.0,2014-01-01 06:51:08+00:00,2013-12-17 08:00:00+00:00,2014-01-02 09:17:23+00:00,11.0,14.0,,"female, female",irregular,2014-01-01
653068,150.0,150.0,Transportation,Transportation,To repair their old cycle-van and buy another one to rent out as a source of income,IN,India,Maynaguri,INR,334.0,2014-01-01 09:58:07+00:00,2013-12-17 08:00:00+00:00,2014-01-01 16:01:36+00:00,43.0,6.0,"user_favorite, user_favorite",female,bullet,2014-01-01
653063,200.0,200.0,Embroidery,Arts,to purchase an embroidery machine and a variety of new embroidery materials.,PK,Pakistan,Lahore,PKR,247.0,2014-01-01 08:03:11+00:00,2013-12-24 08:00:00+00:00,2014-01-01 13:00:00+00:00,11.0,8.0,,female,irregular,2014-01-01
653084,400.0,400.0,Milk Sales,Food,to purchase one buffalo.,PK,Pakistan,Abdul Hakeem,PKR,245.0,2014-01-01 11:53:19+00:00,2013-12-17 08:00:00+00:00,2014-01-01 19:18:51+00:00,14.0,16.0,,female,monthly,2014-01-01
1080148,250.0,250.0,Services,Services,purchase leather for my business using ksh 20000.,KE,Kenya,,KES,,2014-01-01 10:06:19+00:00,2014-01-30 01:42:48+00:00,2014-01-29 14:14:57+00:00,4.0,6.0,,female,irregular,2014-01-01
653067,200.0,200.0,Dairy,Agriculture,To purchase a dairy cow and start a milk products business .,IN,India,Maynaguri,INR,334.0,2014-01-01 09:51:02+00:00,2013-12-16 08:00:00+00:00,2014-01-01 17:18:09+00:00,43.0,8.0,"user_favorite, user_favorite",female,bullet,2014-01-01
653078,400.0,400.0,Beauty Salon,Services,to buy more hair and skin care products.,PK,Pakistan,Ellahabad,PKR,245.0,2014-01-01 11:46:01+00:00,2013-12-20 08:00:00+00:00,2014-01-10 18:18:44+00:00,14.0,8.0,"#Elderly, #Woman Owned Biz",female,monthly,2014-01-01
653082,475.0,475.0,Manufacturing,Manufacturing,"to purchase leather, plastic soles and heels in different sizes along with spools of thread.",PK,Pakistan,Lahore,PKR,245.0,2014-01-01 11:49:43+00:00,2013-12-20 08:00:00+00:00,2014-01-01 18:47:21+00:00,14.0,19.0,user_favorite,female,monthly,2014-01-01
653048,625.0,625.0,Food Production/Sales,Food,"to buy a stall, gram flour, ketchup, and coal for selling ladoo.",PK,Pakistan,Lahore,PKR,247.0,2014-01-01 05:41:03+00:00,2013-12-17 08:00:00+00:00,2014-01-03 15:45:04+00:00,11.0,24.0,,female,irregular,2014-01-01


LocationName,ISO,country,region,world_region,MPI,geo,lat,lon
"Badakhshan, Afghanistan",AFG,Afghanistan,Badakhshan,South Asia,0.387,"(36.7347725, 70.81199529999999)",36.7347725,70.81199529999998
"Badghis, Afghanistan",AFG,Afghanistan,Badghis,South Asia,0.466,"(35.1671339, 63.7695384)",35.1671339,63.7695384
"Baghlan, Afghanistan",AFG,Afghanistan,Baghlan,South Asia,0.3,"(35.8042947, 69.2877535)",35.80429470000001,69.28775350000001
"Balkh, Afghanistan",AFG,Afghanistan,Balkh,South Asia,0.301,"(36.7550603, 66.8975372)",36.7550603,66.8975372
"Bamyan, Afghanistan",AFG,Afghanistan,Bamyan,South Asia,0.325,"(34.8100067, 67.8212104)",34.8100067,67.8212104
"Daykundi, Afghanistan",AFG,Afghanistan,Daykundi,South Asia,0.313,"(33.669495, 66.0463534)",33.669495,66.0463534
"Farah, Afghanistan",AFG,Afghanistan,Farah,South Asia,0.319,"(32.4464635, 62.1454133)",32.4464635,62.1454133
"Faryab, Afghanistan",AFG,Afghanistan,Faryab,South Asia,0.25,"(36.0795613, 64.90595499999999)",36.0795613,64.90595499999999
"Ghazni, Afghanistan",AFG,Afghanistan,Ghazni,South Asia,0.245,"(33.5450587, 68.4173972)",33.5450587,68.41739720000001
"Ghor, Afghanistan",AFG,Afghanistan,Ghor,South Asia,0.384,"(34.0995776, 64.90595499999999)",34.0995776,64.90595499999999


id,Loan Theme ID,Loan Theme Type,Partner ID
638631,a1050000000skGl,General,151.0
640322,a1050000000skGl,General,151.0
641006,a1050000002X1ij,Higher Education,160.0
641019,a1050000002X1ij,Higher Education,160.0
641594,a1050000002VbsW,Subsistence Agriculture,336.0
642256,a1050000000T3oX,Extreme Poverty,217.0
642311,a1050000000snTK,General,57.0
642353,a1050000000T3oX,Extreme Poverty,217.0
642386,a1050000000wf0f,General,133.0
642429,a1050000000wezJ,General,48.0


Partner ID,Field Partner Name,sector,Loan Theme ID,Loan Theme Type,country,forkiva,region,geocode_old,ISO,number,amount,LocationName,geocode,names,geo,lat,lon,mpi_region,mpi_geo,rural_pct
9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000000slfi,Higher Education,Cambodia,No,Banteay Meanchey,"(13.75, 103.0)",KHM,1,450,"Banteay Meanchey, Cambodia","[(13.6672596, 102.8975098)]",Banteay Meanchey Province; Cambodia,"(13.6672596, 102.8975098)",13.6672596,102.8975098,"Banteay Mean Chey, Cambodia","(13.6672596, 102.8975098)",90.0
9,KREDIT Microfinance Institution,General Financial Inclusion,a10500000068jPe,Vulnerable Populations,Cambodia,No,Battambang Province,,KHM,58,20275,"Battambang Province, Cambodia","[(13.0286971, 102.989615)]",Battambang Province; Cambodia,"(13.0286971, 102.989615)",13.0286971,102.989615,"Banteay Mean Chey, Cambodia","(13.6672596, 102.8975098)",90.0
9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000000slfi,Higher Education,Cambodia,No,Battambang Province,,KHM,7,9150,"Battambang Province, Cambodia","[(13.0286971, 102.989615)]",Battambang Province; Cambodia,"(13.0286971, 102.989615)",13.0286971,102.989615,"Banteay Mean Chey, Cambodia","(13.6672596, 102.8975098)",90.0
9,KREDIT Microfinance Institution,General Financial Inclusion,a10500000068jPe,Vulnerable Populations,Cambodia,No,Kampong Cham Province,"(12.0, 105.5)",KHM,1383,604950,"Kampong Cham Province, Cambodia","[(12.0982918, 105.3131185)]",Kampong Cham Province; Cambodia,"(12.0982918, 105.3131185)",12.0982918,105.3131185,"Kampong Cham, Cambodia","(11.9924294, 105.4645408)",90.0
9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000002X1Uu,Sanitation,Cambodia,No,Kampong Cham Province,"(12.0, 105.5)",KHM,3,275,"Kampong Cham Province, Cambodia","[(12.0982918, 105.3131185)]",Kampong Cham Province; Cambodia,"(12.0982918, 105.3131185)",12.0982918,105.3131185,"Kampong Cham, Cambodia","(11.9924294, 105.4645408)",90.0
9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000000slfi,Higher Education,Cambodia,No,Kampong Cham Province,"(12.0, 105.5)",KHM,36,62225,"Kampong Cham Province, Cambodia","[(12.0982918, 105.3131185)]",Kampong Cham Province; Cambodia,"(12.0982918, 105.3131185)",12.0982918,105.3131185,"Kampong Cham, Cambodia","(11.9924294, 105.4645408)",90.0
9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000007VvXr,Solar Home Systems,Cambodia,No,Kampong Cham Province,"(12.0, 105.5)",KHM,2,1300,"Kampong Cham Province, Cambodia","[(12.0982918, 105.3131185)]",Kampong Cham Province; Cambodia,"(12.0982918, 105.3131185)",12.0982918,105.3131185,"Kampong Cham, Cambodia","(11.9924294, 105.4645408)",90.0
9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000000weyk,General,Cambodia,No,Kampong Chhnang Province,"(12.0, 104.5)",KHM,249,237175,"Kampong Chhnang Province, Cambodia","[(12.1392352, 104.5655273)]",Kampong Chhnang Province; Cambodia,"(12.1392352, 104.5655273)",12.1392352,104.5655273,"Kampong Chhnang, Cambodia","(12.1392352, 104.5655273)",90.0
9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000007VvXr,Solar Home Systems,Cambodia,No,Kampong Chhnang Province,"(12.0, 104.5)",KHM,7,3050,"Kampong Chhnang Province, Cambodia","[(12.1392352, 104.5655273)]",Kampong Chhnang Province; Cambodia,"(12.1392352, 104.5655273)",12.1392352,104.5655273,"Kampong Chhnang, Cambodia","(12.1392352, 104.5655273)",90.0
9,KREDIT Microfinance Institution,General Financial Inclusion,a1050000000slfi,Higher Education,Cambodia,No,Kampong Chhnang Province,"(12.0, 104.5)",KHM,18,31425,"Kampong Chhnang Province, Cambodia","[(12.1392352, 104.5655273)]",Kampong Chhnang Province; Cambodia,"(12.1392352, 104.5655273)",12.1392352,104.5655273,"Kampong Chhnang, Cambodia","(12.1392352, 104.5655273)",90.0


In [4]:
# For the locations in which Kiva has active loans, your objective is to pair Kiva's 
# data with additional data sources to estimate the welfare level of borrowers in specific
# regions, based on shared economic and demographic characteristics.

In [5]:
# 1. What is the difference in the distribution of loans and MPI by country for each region?
# The inner join keeps only loans whose (country, region) pair also exists in the
# MPI table. Each pair is then summarised by loan count and average MPI, and the
# result is ranked by average MPI first, loan count second (both descending).
dtKivaLoansByCountry = (
    dtKivaLoans
    .join(dtKivaMpiRegionLocations, on=["country", "region"], how="inner")
    .groupBy("country", "region")
    .agg(
        count("id").alias("Number of Loans"),
        avg("MPI").alias("MPI_avg"),
    )
    .orderBy(col("MPI_avg").desc(), col("Number of Loans").desc())
    .select("country", "region", "Number of Loans", "MPI_avg")
)

dtKivaLoansByCountry.display(n=40)



country,region,Number of Loans,MPI_avg
Sierra Leone,Port Loko,110,0.5219999999999995
Timor-Leste,Ermera,252,0.4969999999999999
Mali,Sikasso,273,0.4839999999999998
Sierra Leone,Kenema,170,0.4769999999999994
Sierra Leone,Bo,288,0.4489999999999997
Timor-Leste,Viqueque,83,0.4100000000000001
Timor-Leste,Aileu,67,0.3789999999999997
Timor-Leste,Baucau,186,0.3569999999999997
Malawi,Salima,12,0.334
Yemen,Dhamar,105,0.3249999999999998


In [6]:
# 2. What is the difference in the distribution of borrowers by gender and of MPI, per country?
# NOTE(review): the original heading read "by gender and sector", but the
# aggregation below groups by country only; confirm which was intended.
# borrower_genders lists one gender per borrower (e.g. "female, female"), so it is
# split on ", " and exploded into one row per borrower before counting.
# NOTE(review): because of the explode, MPI_avg is averaged over borrower rows
# rather than loan rows; confirm this weighting is wanted.
# NOTE(review): this rebinds dtKivaLoansByCountry, replacing the per-region result
# computed earlier in the notebook.
dtKivaLoansByCountry = (
    dtKivaLoans
    .join(dtKivaMpiRegionLocations, on=["country", "region"], how="inner")
    .withColumn("gender", explode(split(col("borrower_genders"), ", ")))
    .groupBy("country")
    .agg(
        avg("MPI").alias("MPI_avg"),
        # count() skips nulls, and when() without otherwise() yields null on no match,
        # so each expression counts only the rows of that gender.
        count(when(col("gender") == "male", 1)).alias("Male_Count"),
        count(when(col("gender") == "female", 1)).alias("Female_Count"),
    )
    .select("country", "MPI_avg", "Male_Count", "Female_Count")
)

dtKivaLoansByCountry.display(n=40)

country,MPI_avg,Male_Count,Female_Count
Yemen,0.174979166666665,421,1067
Malawi,0.2742180509128297,55,3834
Cambodia,0.1372014154234629,6374,23723
Lesotho,0.106,5,5
Nicaragua,0.0340127458370447,3294,11299
Peru,0.0575824847250551,3729,9528
Sierra Leone,0.4640717884131,315,1273
Timor-Leste,0.3494375788146278,47,746
Tajikistan,0.0210929899856938,348,1050
Nigeria,0.3110000000000068,8825,1174
