In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
sns.set()
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Read the data to get the overview of tables

In [None]:
# Load the Kiva loans table; the first CSV column becomes the row index.
kiva_loans_csv = '/kaggle/input/data-science-for-good-kiva-crowdfunding/kiva_loans.csv'
df = pd.read_csv(kiva_loans_csv, index_col=0)
df

## Statistical Overview of the Data


In [None]:
df.describe(include='all')

In [None]:
df.info()

In [None]:
df.columns


# Data Preparation
## checking for the missing data

In [None]:
df.isnull().sum()

In [None]:
# Remove four columns from the loans table.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the columns are already gone.
columns_to_drop = ['partner_id', 'disbursed_time', 'funded_time', 'tags']
df = df.drop(columns=columns_to_drop, errors='ignore')

## Checking for the presence of unique values in each column 

In [None]:
# Print the distinct values of every column, one column per printed block.
for column_name, column_values in df.items():
    print('\n\n', column_name, column_values.unique())

In [None]:
df['borrower_genders'].value_counts()

## Adding a new column 'Gender'

I want to simplify the 'borrower_genders' column to male, female, multiple female, multiple male, both and unknown so that it is easier to perform further analysis.

In [None]:
# Created a function to correct the multiple entry of the gender
def multiple_entry_genders(borrower_genders):
    """Collapse a comma-separated borrower gender string into one category.

    Parameters
    ----------
    borrower_genders : str or missing value
        Comma-separated gender tokens, e.g. ``'female'`` or
        ``'female, male, female'``. Whitespace around tokens is ignored.

    Returns
    -------
    str
        ``'female'`` or ``'male'`` for a single borrower,
        ``'multiple female'`` or ``'multiple male'`` for several borrowers of
        the same gender, ``'both'`` when female and male borrowers are mixed,
        and ``'unknown'`` for missing, empty or unrecognised input.
    """
    # Missing values (None / NaN) are not strings and cannot be split.
    if not isinstance(borrower_genders, str):
        return 'unknown'

    # Strip BEFORE de-duplicating so that 'female' and ' female' are the same
    # gender. The borrower count comes from the token list, not from leftover
    # whitespace differences between tokens.
    tokens = [token.strip() for token in borrower_genders.split(',')]
    tokens = [token for token in tokens if token]
    distinct_genders = set(tokens)

    # Empty input or any token other than 'female' / 'male'.
    if not distinct_genders or not distinct_genders <= {'female', 'male'}:
        return 'unknown'

    if len(distinct_genders) == 2:
        return 'both'

    (gender,) = distinct_genders
    return gender if len(tokens) == 1 else 'multiple ' + gender

In [None]:
# Fill missing borrower genders with a plain column assignment instead of
# chained indexing (df[col][mask] = ...). The chained form triggers
# SettingWithCopyWarning and does not modify df at all when pandas
# copy-on-write is enabled.
df['borrower_genders'] = df['borrower_genders'].fillna('unknown')

# Derive the simplified gender category for every loan.
df['gender'] = df['borrower_genders'].apply(multiple_entry_genders)

A new column named 'gender' was added with the values 'female', 'multiple female', 'male', 'multiple male', 'both' and 'unknown'.

In [None]:
df['gender'].unique()

In [None]:
df.head()

## Identification of features

The rate of success is an important feature for studying the gender dependency of loan applications, so I calculated it by dividing the funded amount by the loan amount.

In [None]:
# One row per (country, gender): the loan term is averaged, the other measures are summed.
country_gender_aggregations = {
    'funded_amount': 'sum',
    'loan_amount': 'sum',
    'term_in_months': 'mean',
    'lender_count': 'sum',
}
df_gender_con = df.groupby(['country', 'gender'], as_index=False).agg(country_gender_aggregations)

In [None]:
# Funded amount as a share of the requested amount, as a whole-number percentage.
funded_share = df_gender_con['funded_amount'].div(df_gender_con['loan_amount'])
df_gender_con['rate_of_success'] = funded_share.mul(100).round()

In [None]:
df_gender_con

#  Data visualization

##   Number of loans requested per country 
An overview of the countries where most loan requests come from.

In [None]:
# Count the non-null 'funded_amount' entries of each country, then order the
# countries from the highest count to the lowest.
loans_per_country = df.groupby(['country'], as_index=False)['funded_amount'].count()
df_country = loans_per_country.sort_values(by=['funded_amount'], ascending=False)


In [None]:
# ISO country codes, used as the location key of the choropleth maps.
iso_codes_csv = '/kaggle/input/countries-iso-codes/wikipedia-iso-country-codes.csv'
df_country_codes = pd.read_csv(iso_codes_csv)

# Relabel the five columns by position so that the first one is called
# 'country', the key used when merging with the loans data.
df_country_codes.columns = [
    'country',
    'Alpha-2 code',
    'Alpha-3 code',
    'Numeric code',
    'ISO 3166-2',
]

In [None]:
# Attach the ISO codes to the per-country loan counts. The right join keeps every
# country of the loans data; a country name without a match in the ISO table
# gets NaN codes.
# NOTE(review): rows with a NaN 'Alpha-3 code' presumably do not appear on the
# choropleth - confirm that the country spellings of both tables match.
df_country_merge_count = (
    df_country_codes
    .merge(df_country, on='country', how='right')
    .rename(columns={'funded_amount': 'Total_loans'})
)
df_country_merge_count.head(2)

In [None]:
# World map coloured by the number of loans of each country.
loan_count_map_options = dict(
    locations="Alpha-3 code",   # column holding the country codes
    color="Total_loans",        # column that drives the colour scale
    hover_name="country",       # column shown as the hover title
    color_continuous_scale=px.colors.sequential.Plasma,
)
fig = px.choropleth(df_country_merge_count, **loan_count_map_options)
fig.update_layout(title_text='Total number of loans requested per country')
fig.show()

## Conclusion
The Philippines is one of the countries that requested the most loans.

Several African countries, such as Kenya, Mali, Nigeria and Ghana, are also on the list.

No European countries are involved in these loans.

Surprisingly, the USA is also on the list. It does not have the same poverty rate as the developing countries, so it may be a special case.

According to the graph, the top 10 countries by number of loan applications are mostly developing countries.

# Total loan amount requested per country 

The data is grouped by country, and the sum and count of the loan amount are calculated. Let us check whether the picture for the number of loans requested per country is similar to the one for the total loan amount requested.

In [None]:
# Total requested amount and number of loans per country.
# Passing a list of aggregations produces two-level column labels
# (column name on top, aggregation name below).
loan_amount_by_country = df.groupby(['country']).agg({'loan_amount': ['sum', 'count']})
df_country_id = loan_amount_by_country.reset_index()
df_country_id.head(2)

In [None]:
# Flatten the two-level column labels by dropping the top level.
# The guard keeps the cell re-runnable: droplevel() raises once the labels are
# already flat.
if df_country_id.columns.nlevels > 1:
    df_country_id.columns = df_country_id.columns.droplevel()

df_country_id.columns

In [None]:
# Give the three columns descriptive names (assigned by position).
country_total_columns = ['country', 'Total_loan_amount', 'loan_count']
df_country_id.columns = country_total_columns

In [None]:
df_country_id

In [None]:
# Attach the ISO codes to the per-country loan totals; the right join keeps
# every country of the loans data.
df_country_merge = df_country_codes.merge(df_country_id, on='country', how='right')


In [None]:
# Remove three of the ISO code columns; 'Alpha-3 code' is kept.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the columns are already gone.
iso_columns_to_drop = ['Alpha-2 code', 'Numeric code', 'ISO 3166-2']
df_country_merge = df_country_merge.drop(columns=iso_columns_to_drop, errors='ignore')
df_country_merge

In [None]:
# World map coloured by the total loan amount requested by each country.
loan_amount_map_options = dict(
    locations="Alpha-3 code",     # column holding the country codes
    color="Total_loan_amount",    # column that drives the colour scale
    hover_name="country",         # column shown as the hover title
    color_continuous_scale=px.colors.sequential.Plasma,
)
fig = px.choropleth(df_country_merge, **loan_amount_map_options)
fig.update_layout(title_text='Total loan amount requested per country')
fig.show()

While the Philippines still leads in total loan amount requested, the US and some areas in South America also request a larger total loan amount than other countries.

Except for the Philippines, all Asian countries applied for smaller loan amounts. In Africa, Kenya applied for the most funding. In South America, Peru and Paraguay applied for the most funding.

In the USA, the number of loans requested is small, but the total amount requested is large.

In most cases in developing countries, the total loan amount depends on the number of loans the country requested.

## Gender dependency of the total amount of loans per country

In the two graphs above, I checked whether the total number of loans requested depends on the total loan amount requested. It is clear that, for most developing countries, the two are related.

Now I want to check whether the total amount of loans per country depends on gender.

To answer this, I grouped the data by country and gender and calculated the total funded amount.

In [None]:
df_gender_con

In [None]:
# Sunburst of the funded amount: gender category on the inner ring, country on
# the outer ring.
fig = px.sunburst(
    df_gender_con,
    path=['gender', 'country'],
    values='funded_amount',
    width=800,
    height=800,
)
fig.show()

### Conclusion
As I can see, most borrowers are female. I didn't expect that, and it was actually a great surprise. This means that women are getting funded and working on their projects in their countries (most of which are underdeveloped or developing countries).

For loans with multiple borrowers, there are many more female borrower groups than mixed or multiple-male groups.

As the previous two graphs (5.1 and 5.2) show, both the total loan amount and the number of loans requested are highest for the Philippines. However, most of the applicants who got loans in the Philippines are female rather than male.

Among male borrowers, the United States and Kenya received the most loans. Among multiple-female groups, Paraguay received the most loans.

## Gender dependency of the loan success rate

I added a new column derived from the rate-of-success column, using these conditions: less than 70% is a low success rate, greater than or equal to 95% is a high success rate, and between 70% and 94% is a moderate success rate.

In [None]:
# Bucket the success percentage into three labels. The conditions are checked in
# order; a value matching neither of them (NaN included) gets the default label.
success_rate = df_gender_con['rate_of_success']
df_gender_con['new_rate_of_success'] = np.select(
    [success_rate < 70, success_rate >= 95],
    ['low success rate', 'high success rate'],
    default='moderate success rate',
)

In [None]:
# Sunburst with the gender category on the inner ring and the success-rate
# bucket on the outer ring.
# NOTE(review): each sector adds up the per-country percentages, so its size
# grows with the number of countries in the bucket - confirm this is the
# intended measure.
fig = px.sunburst(
    df_gender_con,
    path=['gender', 'new_rate_of_success'],
    values='rate_of_success',
    width=800,
    height=800,
)
fig.show()

## Conclusion

Based on this graph, females have a higher success rate in getting loans than males. When there is more than one female borrower, the success rate is very high. This suggests that women are somehow being encouraged to work. Most developing or underdeveloped countries provide reservations or subsidies on loans for women. This might be the reason why females have a higher success rate in getting loans than males.


# Activities, sectors and funding amounts by females

## Females vs Sectors

Now let's have a look at what females are using these loans for. First I plot the sectors. There are 15 unique sectors.

In [None]:
# Loans whose gender category is 'female' or 'multiple female'.
female_only_mask = df['gender'].isin(['female', 'multiple female'])
df_sec_female = df[female_only_mask]

In [None]:
# Horizontal bar chart: number of female-borrower loans in each sector.
df_sec_female1 = df_sec_female['sector'].value_counts()

_, sector_ax = plt.subplots(figsize=(13, 8))
sns.barplot(y=df_sec_female1.index, x=df_sec_female1.values, alpha=0.6, ax=sector_ax)
sector_ax.set_xlabel('Number of loans', fontsize=16)
sector_ax.set_ylabel("Sectors", fontsize=16)
sector_ax.set_title("Number of loans per sector", fontsize=18)
plt.show()

## Conclusion

The most dominant sector females are spending on is agriculture. This really surprises me, because in most countries it is males rather than females who concentrate on agriculture.

From graph 5.3b it is clear that females have a higher chance of getting a loan than males. A possible reason is that loans are applied for in women's names while the agricultural work is done by the men in their households.

The food and retail sectors also make up an important part, because many people are looking to buy fish, vegetables and stock to keep their businesses running.

## Activities females are involved in

In [None]:
# Horizontal bar chart: the 30 most frequent activities among female-borrower loans.
df_acti_female = df_sec_female['activity'].value_counts().head(30)

_, activity_ax = plt.subplots(figsize=(15, 10))
sns.barplot(y=df_acti_female.index, x=df_acti_female.values, alpha=0.6, ax=activity_ax)
activity_ax.set_ylabel("Activity", fontsize=16)
activity_ax.set_xlabel('Number of loans', fontsize=16)
# The title previously read "activy"; the spelling is corrected here.
activity_ax.set_title("Number of loans per activity", fontsize=18)
plt.show()

## Conclusion

Most of the activities are about basic daily needs or small businesses such as buying and reselling.

General store is one of the top activities. In developing countries many females choose this as a first option, so its position in the graph makes sense.

This plot also confirms the previous findings: activities related to agriculture are near the top, e.g. farming, food production and pigs.