#### Conversion of columns

In [3]:
import pandas as pd
from data_transform import DataTransform

df = pd.read_csv("loan_payments.csv")

# Initialize DataTransform class
transformer = DataTransform(df)

# Apply conversions
transformer.convert_to_int('term')
transformer.convert_employment_length('employment_length')
columns_to_convert_to_datetime = [
    'issue_date',
    'earliest_credit_line',
    'last_payment_date',
    'next_payment_date',
    'last_credit_pull_date'
]
transformer.convert_multiple_to_datetime(
    columns_to_convert_to_datetime, date_format='%b-%Y'
    )
transformer.convert_to_category('loan_status')

columns_to_convert_to_date = [
    'next_payment_date',
    'last_payment_date',
    'last_credit_pull_date'
]
transformer.convert_to_date(
    columns_to_convert_to_date
    )

#transformer.drop_na('employment_length')

new_df = transformer.get_dataframe()

#### Describe DataFrame

In [4]:
import pandas as pd
from dataframe_info import DataFrameInfo

# Initialize DataFrameInfo class
info = DataFrameInfo(new_df)

# Describe columns
print("\nDescribe Columns")
print(info.describe_columns())



Describe Columns
id                                      int64
member_id                               int64
loan_amount                             int64
funded_amount                         float64
funded_amount_inv                     float64
term                                    int64
int_rate                              float64
instalment                            float64
grade                                  object
sub_grade                              object
employment_length                     float64
home_ownership                         object
annual_inc                            float64
verification_status                    object
issue_date                     datetime64[ns]
loan_status                          category
payment_plan                           object
purpose                                object
dti                                   float64
delinq_2yrs                             int64
earliest_credit_line           datetime64[ns]
inq_last_6mths  

In [5]:
# Extract statistical values
print("\nStatistical Values")
print(info.extract_statistical_values())


Statistical Values
                  id     member_id   loan_amount  funded_amount  \
median  7.084590e+06  8.709873e+06  12000.000000   12000.000000   
std     9.571362e+06  1.031281e+07   8082.196709    8019.017599   
mean    7.621797e+06  8.655350e+06  13333.076100   13229.509117   

        funded_amount_inv       term   int_rate  instalment  \
median       11300.000000  36.000000  13.160000  347.150000   
std           8099.473527  15.826533   4.392893  238.920012   
mean         12952.622979  38.857111  13.507328  400.013953   

        employment_length    annual_inc  ...  total_payment_inv  \
median           6.000000  61000.000000  ...        9835.830000   
std              3.649479  51589.339577  ...        8363.508506   
mean             5.690749  72220.848249  ...       11788.946618   

        total_rec_prncp  total_rec_int  total_rec_late_fee  recoveries  \
median      7644.920000    1734.640000            0.000000    0.000000   
std         6958.124264    2581.657345   

## Key Insights from Statistical Analysis

1. **Loan_amount** = Suggests typical loan amount is around £12,000, there are some loans with significantly higher amounts, leading to a higher mean than standard deviation. 
2. **Term** = The median term is 36 months, with a mean of 38.86 months and a standard deviation of 15.83. This indicates that most loans have a term of around 36 months, but there are some loans with longer terms. 
3. **Interest Rate (int_rate)** = The median interest rate is 13.16%, with a mean of 13.50 and standard deviation of 4.39%. This suggests that while the typical interest rate is around 13.6%, there is some variation in the interest rates offered.
4. **Annual Income (annual_inc)** = Suggests typical income is around £61,000, with a mean of £72,220.85 and standard deviation of £51,589.34. Whilst typical Annual Income is around £61,000 there are some borrowers with significantly higher incomes, leading to a higher mean than standard deviation. 
5. **Total Payment (total_payment_inv)** = The median total payment is £9,835.83, with a mean of £11,788.95 and standard deviation of £8,363.51. This suggests that although the typical total payment is around £9,835.83, there are some loans with significantly higher total payments. 

In [6]:
# Count distinct values in categorical columns
print("\nDistinct Values in Categorical Columns")
print(info.count_distinct_values())

# Print DataFrame shape
print("\nDataFrame Shape")
print(info.print_shape())

# Count NULL values
print("\nNULL Values")
print(info.count_null_values())


Distinct Values in Categorical Columns
grade                      7
sub_grade                 35
home_ownership             5
verification_status        3
loan_status                9
payment_plan               2
purpose                   14
last_payment_date         98
next_payment_date         96
last_credit_pull_date    101
application_type           1
dtype: int64

DataFrame Shape
(54231, 43)

NULL Values
                             null_count  null_percentage
id                                    0             0.00
member_id                             0             0.00
loan_amount                           0             0.00
funded_amount                      3007             5.54
funded_amount_inv                     0             0.00
term                                  0             0.00
int_rate                           5169             9.53
instalment                            0             0.00
grade                                 0             0.00
sub_grade       

In [7]:
# Get summary statistics
print("\nSummary")
summary = info.get_summary()
rounded_summary = summary.round(2)
print(rounded_summary)


Summary
                id    member_id  loan_amount  funded_amount  \
count     54231.00     54231.00     54231.00       51224.00   
mean    7621797.48   8655349.93     13333.08       13229.51   
min       55521.00     70694.00       500.00         500.00   
25%      759433.00    958772.00      7000.00        7000.00   
50%     7084590.00   8709873.00     12000.00       12000.00   
75%     8860616.50  10527137.00     18000.00       18000.00   
max    38676116.00  41461848.00     35000.00       35000.00   
std     9571361.65  10312806.88      8082.20        8019.02   

       funded_amount_inv      term  int_rate  instalment  employment_length  \
count           54231.00  54231.00  49062.00    54231.00           52113.00   
mean            12952.62     38.86     13.51      400.01               5.69   
min                 0.00      0.00      5.42       15.67               0.00   
25%              6700.00     36.00     10.37      224.20               2.00   
50%             11300.00    

#### NULL values:

In [8]:
from dataframe_transform import DataFrameTransform
from plotter import Plotter

transformer = DataFrameTransform(new_df)

null_cols = transformer.count_null_values()
print(null_cols)

                             null_count  null_percentage
funded_amount                      3007             5.54
int_rate                           5169             9.53
employment_length                  2118             3.91
mths_since_last_delinq            31002            57.17
mths_since_last_record            48050            88.60
last_payment_date                    73             0.13
next_payment_date                 32608            60.13
last_credit_pull_date                 7             0.01
collections_12_mths_ex_med           51             0.09
mths_since_last_major_derog       46732            86.17


In [9]:
plotter_before = Plotter(new_df)
plotter_before.plot_null_values()

In [10]:
#Columns with 50%> Null values drop:
columns_to_drop = [
    'mths_since_last_delinq', 
    'mths_since_last_record', 
    'next_payment_date', 
    'mths_since_last_major_derog'
    ]
cleaned_df=transformer.drop_columns(columns_to_drop)

#Impute missing values with either median or mean:
cleaned_df=transformer.impute_missing_values(cleaned_df, strategy='median')

In [11]:
#Data after cleaning data:
plotter_after = Plotter(cleaned_df)
plotter_after.plot_null_values()

In [12]:
# Save a separate copy of the DataFrame
cleaned_df.to_csv("loan_payments_copy.csv", index=False)


**Skewness**

In [13]:
numberical_cols = cleaned_df.select_dtypes(include=['float64','int64'])
skewness = numberical_cols.skew().abs()
print("Skewness of columns:\n", skewness)

Skewness of columns:
 id                             2.370336
member_id                      2.205422
loan_amount                    0.805259
funded_amount                  0.869922
funded_amount_inv              0.813927
term                           0.707703
int_rate                       0.456515
instalment                     0.996981
employment_length              0.127558
annual_inc                     8.711831
dti                            0.189420
delinq_2yrs                    5.370002
inq_last_6mths                 3.248918
open_accounts                  1.059282
total_accounts                 0.779014
out_prncp                      2.356426
out_prncp_inv                  2.356848
total_payment                  1.267891
total_payment_inv              1.256197
total_rec_prncp                1.261015
total_rec_int                  2.204322
total_rec_late_fee            13.184305
recoveries                    14.589793
collection_recovery_fee       27.636843
last_payment_amoun

In [15]:
import numpy as np 

# Identify skewed columns
numerical_columns = cleaned_df.select_dtypes(include=['float64', 'int64'])
skewness = numerical_columns.skew().abs()
threshold = 1.0
skewed_columns = skewness[skewness > threshold].index

# Transform skewed columns
transformed_df = transformer.transform_skewed_cols(skewed_columns)




Processing column: id
Log skewness for id: -0.3568574884783667
Sqrt skewness for id: -0.31360639762840314
Box-Cox skewness for id: -0.0958255583896153

Best transformation for id: log

Processing column: member_id
Log skewness for member_id: -0.3808185143762741
Sqrt skewness for member_id: -0.3364977850480336
Box-Cox skewness for member_id: -0.10234779233405839

Best transformation for member_id: log

Processing column: annual_inc
Log skewness for annual_inc: -0.1549348484259427
Sqrt skewness for annual_inc: -0.09209493052868675
Box-Cox skewness for annual_inc: 0.01492602732768253

Best transformation for annual_inc: log

Processing column: delinq_2yrs
Log skewness for delinq_2yrs: 1.8694335980252368
Sqrt skewness for delinq_2yrs: 1.8694337446680684
Box-Cox skewness for delinq_2yrs: 1.8693827700010048

Best transformation for delinq_2yrs: boxcox

Processing column: inq_last_6mths
Log skewness for inq_last_6mths: 0.19030055341570726
Sqrt skewness for inq_last_6mths: 0.19841583775970173