## Exploratory Data Analysis of Lending Club Dataset
**Note:** All cells are collapsed to keep the output short. This notebook was practised on Databricks Community Edition.

In [None]:
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
from pyspark.sql.types import *

In [None]:
%sql
-- Preview the first 10 rows of the processes_loan_ver2 table.
select * from processes_loan_ver2 limit 10

In [None]:
# Load the processes_loan_ver2 table as a DataFrame; the rest of the notebook
# works off `loan_data`.
loan_data = spark.table("processes_loan_ver2")
display(loan_data)

In [None]:
# Show the summary statistics that DataFrame.describe() computes for loan_data.
loan_summary_stats = loan_data.describe()
display(loan_summary_stats)

In [None]:
# Count customers per state and print the five states with the most customers.
customers_per_state = loan_data.groupBy('addr_state').count()
customers_per_state.orderBy(desc('count')).show(5)
# California, Texas and New York are the top 3 states.

In [None]:
# Count the bad loans (bad_loan == 'Yes') per state, highest count first.
bad_loans_per_state = (
    loan_data
    .where(col('bad_loan') == 'Yes')
    .groupBy('addr_state')
    .count()
)
display(bad_loans_per_state.orderBy(desc('count')))
# Surprising - Florida has more bad loans even though it is not in the top 3 states by number of customers.

In [None]:
# Total loan amount per grade, largest total first.
# NOTE: `sum` is pyspark.sql.functions.sum (star-imported at the top of the
# notebook), not the Python builtin.
grade_data = (
    loan_data
    .groupBy("grade")
    .agg(sum('loan_amnt'))
    .orderBy(desc('sum(loan_amnt)'))
)
display(grade_data)

In [None]:
# For each grade, how much loan amount has accumulated in bad vs. good loans:
# total loan_amnt per (grade, bad_loan) pair.
loan_amnt_by_grade_status = loan_data.groupBy('grade', 'bad_loan').agg(sum('loan_amnt'))
display(loan_amnt_by_grade_status)
# The graph shows that grade G loans have a high chance of default, because 13% of them fall under bad loans. This improves from G through A.

In [None]:
# Create an 'exposure' column that weights revol_bal by loan quality:
#   - bad_loan == 'No'  -> exposure is revol_bal unchanged
#   - anything else     -> exposure is -10 * revol_bal
# The minus sign marks the loan as bad and the factor 10 weights its intensity.
is_good_loan = col('bad_loan') == 'No'
exposure_value = when(is_good_loan, col('revol_bal')).otherwise(-10 * col('revol_bal'))
exposure_data = loan_data.withColumn('exposure', exposure_value)
display(exposure_data)

In [None]:
# Total exposure per (grade, bad_loan) pair.
exposure_by_grade_status = exposure_data.groupBy('grade', 'bad_loan').agg(sum('exposure'))
display(exposure_by_grade_status)
# Inference - Even though grade G loans have many defaulters, they come out as low risk
# in terms of exposure once the amount of revol_bal is taken into account,
# whereas grade C loans are at higher risk of loss because the customers' revol_bal is high.

In [None]:
# Pull the loan_amnt column on its own so its outliers can be inspected
# in the display() output.
loan_amnt_for_outliers = spark.sql('''
          select loan_amnt from processes_loan_ver2
          ''')
display(loan_amnt_for_outliers)

In [None]:
# Pull the loan_amnt column on its own to check how the loan amount is distributed.
loan_amnt_for_distribution = spark.sql('''
          select loan_amnt from processes_loan_ver2
          ''')
display(loan_amnt_for_distribution)
# Inference - Not exactly a normal distribution: the data points lie around the normal line rather than on it.

In [None]:
# Strip the '%' sign from int_rate and store the numeric value in int_rate_clean.
#
# The UDF guards against null input: Spark hands a SQL NULL to a Python UDF as
# None, and None.strip('%') would raise AttributeError and fail the whole job.
# With the guard, a null int_rate simply yields a null int_rate_clean.
# udf() without an explicit return type produces a string column, hence the
# cast to float afterwards.
strip_percent = udf(lambda x: x.strip('%') if x is not None else None)
loan_data = loan_data.withColumn(
    'int_rate_clean',
    strip_percent(col('int_rate')).cast('float'),
)
loan_data.show(5)

In [None]:
# Print the column names and types of loan_data (e.g. to check the type of the
# int_rate_clean column added above).
loan_data.printSchema()

In [None]:
# Average cleaned interest rate for each bad_loan value.
avg_rate_by_bad_loan = loan_data.groupBy('bad_loan').agg(avg('int_rate_clean'))
display(avg_rate_by_bad_loan)

In [None]:
# Keep only the columns needed to compare loan amount across home ownership
# types and bad_loan values.
ownership_columns = loan_data.select('home_ownership', 'bad_loan', 'loan_amnt')
display(ownership_columns)
# The distribution of the loan amount varies with the type of home ownership. The risk of a bad loan is lowest in the RENT category, at around 11k loan amount.
# The lower the monthly installment for RENT home ownership, the less likely the customer is to default.

In [None]:
# Number of loans for each (grade, purpose) pair.
loans_by_grade_purpose = loan_data.groupBy('grade', 'purpose').count()
display(loans_by_grade_purpose)
# Across all loan grades, the main reasons customers approach Lending Club are debt_consolidation and credit card.

In [None]:
# Count of non-null loan_status values for each (grade, loan_status) pair.
status_counts_by_grade = loan_data.groupBy('grade', 'loan_status').agg(count('loan_status'))
display(status_counts_by_grade)