In [34]:
import pandas as pd
import numpy as np
from faker import Faker
import random
import openpyxl
from sklearn.pipeline import Pipeline
from datetime import datetime, timedelta
from sklearn.base import BaseEstimator, TransformerMixin

# **Generate Fake Data**

In [35]:
def random_date(start, end):
    return start + timedelta(days=random.randint(0, int((end - start).days)))

np.random.seed(42)
num_records = 10000

loan_ids = np.arange(1, num_records + 1)
disbursement_dates = [
    random_date(datetime(2020, 1, 1), datetime(2023, 1, 1)) for _ in range(num_records)
]
expire_dates = [d + timedelta(days=random.randint(30, 365)) for d in disbursement_dates]
is_employed = np.random.choice([True, False, np.nan], num_records, p=[0.7, 0.25, 0.05])
loan_amounts = np.random.choice(
    [np.nan, *np.random.uniform(-1000, 50000, num_records)], num_records
)
number_of_defaults = np.random.choice(
    [np.nan, *np.random.randint(0, 10, num_records)], num_records
)
outstanding_balances = np.random.choice(
    [np.nan, *np.random.uniform(0, 30000, num_records)], num_records
)
interest_rates = np.random.choice(
    [np.nan, *np.random.uniform(0.01, 0.25, num_records)], num_records
)
ages = np.random.choice([np.nan, *np.random.randint(18, 65, num_records)], num_records)
remaining_terms = np.random.choice(
    [np.nan, *np.random.randint(1, 60, num_records)], num_records
)
salaries = np.random.choice(
    [np.nan, *np.random.uniform(-10000, 20000, num_records)], num_records
)
loan_statuses = np.random.choice(
    ["Default", "Non-Default", np.nan], num_records, p=[0.3, 0.65, 0.05]
)
sectors = np.random.choice(
    ["Agriculture", "Manufacturing", "Services", "IT", "Retail"], num_records
)
currencies = np.random.choice(["USD", "EUR", "ZWL", "GBP", "AUD"], num_records)

loan_data = pd.DataFrame(
    {
        "loan_id": loan_ids,
        "disbursement_date": disbursement_dates,
        "expire_date": expire_dates,
        "is_employed": is_employed,
        "loan_amount": loan_amounts,
        "number_of_defaults": number_of_defaults,
        "outstanding_balance": outstanding_balances,
        "interest_rate": interest_rates,
        "age": ages,
        "remaining_term": remaining_terms,
        "salary": salaries,
        "sector": sectors,
        "currency": currencies,
        "loan_status": loan_statuses,
    }
)


num_duplicates = 100
duplicates = loan_data.sample(n=num_duplicates, replace=True).reset_index(drop=True)
loan_data = pd.concat([loan_data, duplicates], ignore_index=True)

num_missing_loan_ids = 100 
missing_loan_ids_indices = np.random.choice(loan_data.index, num_missing_loan_ids, replace=False)
loan_data.loc[missing_loan_ids_indices, 'loan_id'] = np.nan



In [36]:
loan_data.head(3)

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,loan_status
0,1.0,2022-05-02,2022-12-27,1.0,28982.170288,4.0,29869.899084,0.11234,38.0,49.0,4908.585021,Agriculture,USD,Non-Default
1,2.0,2022-04-21,2022-07-05,,27464.842065,5.0,27408.000182,0.128941,30.0,30.0,2283.62718,Agriculture,EUR,Non-Default
2,3.0,2021-11-11,2022-03-14,0.0,24469.175558,2.0,19078.8504,0.224119,63.0,17.0,4093.206305,Services,USD,Non-Default


In [37]:
loan_data.shape

(10100, 14)

# Data Cleanign steps

- Data cleaning is the process of ensuring that data is in the proper format, *making it suitable for analysis and modeling*

### 1.Check missing values 

In [38]:
category_columns = loan_data.select_dtypes("object").columns
category_columns

Index(['sector', 'currency', 'loan_status'], dtype='object')

In [39]:
data = loan_data.copy()

In [40]:
data.columns = data.columns.str.strip()

In [41]:
duplicates = data.loc[data.duplicated(keep=False)].sort_values("loan_id")
duplicates

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,loan_status
565,566.0,2022-06-09,2022-09-06,1.0,29584.002026,3.0,5635.619645,0.161228,51.0,1.0,-1371.104398,Agriculture,EUR,Non-Default
10042,566.0,2022-06-09,2022-09-06,1.0,29584.002026,3.0,5635.619645,0.161228,51.0,1.0,-1371.104398,Agriculture,EUR,Non-Default
10022,584.0,2020-05-11,2020-10-05,1.0,49824.632395,4.0,6203.275291,0.145092,46.0,51.0,8781.064285,Manufacturing,ZWL,Non-Default
583,584.0,2020-05-11,2020-10-05,1.0,49824.632395,4.0,6203.275291,0.145092,46.0,51.0,8781.064285,Manufacturing,ZWL,Non-Default
10016,624.0,2020-08-17,2020-11-10,1.0,41858.436775,7.0,13601.620423,0.241174,52.0,39.0,6166.957191,Services,GBP,Non-Default
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10095,9572.0,2020-04-08,2021-03-06,1.0,44092.703584,8.0,6125.137358,0.200122,28.0,22.0,-6502.574603,Manufacturing,ZWL,Non-Default
10049,9598.0,2020-05-04,2020-09-08,1.0,46454.521799,3.0,11578.226615,0.179743,26.0,17.0,926.293383,Services,EUR,Default
9597,9598.0,2020-05-04,2020-09-08,1.0,46454.521799,3.0,11578.226615,0.179743,26.0,17.0,926.293383,Services,EUR,Default
9818,9819.0,2022-05-17,2022-10-11,1.0,45985.606996,5.0,25474.028912,0.223755,45.0,8.0,10133.342476,Services,EUR,Non-Default


In [42]:
def duplicates_check(df: pd.DataFrame):
    return data.loc[data.duplicated(keep=False)].sort_values("loan_id")

In [43]:
data

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,loan_status
0,1.0,2022-05-02,2022-12-27,1.0,28982.170288,4.0,29869.899084,0.112340,38.0,49.0,4908.585021,Agriculture,USD,Non-Default
1,2.0,2022-04-21,2022-07-05,,27464.842065,5.0,27408.000182,0.128941,30.0,30.0,2283.627180,Agriculture,EUR,Non-Default
2,3.0,2021-11-11,2022-03-14,0.0,24469.175558,2.0,19078.850400,0.224119,63.0,17.0,4093.206305,Services,USD,Non-Default
3,4.0,2020-05-17,2020-10-07,1.0,36218.404504,9.0,5269.500606,0.125747,30.0,7.0,13755.258629,Retail,GBP,Default
4,5.0,2021-12-24,2022-11-26,1.0,45353.259460,6.0,22912.296526,0.142642,40.0,10.0,14089.467579,Retail,EUR,Default
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10095,9572.0,2020-04-08,2021-03-06,1.0,44092.703584,8.0,6125.137358,0.200122,28.0,22.0,-6502.574603,Manufacturing,ZWL,Non-Default
10096,8153.0,2021-07-01,2021-09-18,1.0,38826.677860,3.0,13065.021063,0.176067,60.0,43.0,5589.307865,Manufacturing,ZWL,Non-Default
10097,5218.0,2020-03-06,2020-12-29,0.0,18377.762383,1.0,13154.175200,0.090369,19.0,39.0,-2993.929111,Retail,USD,Non-Default
10098,3798.0,2021-08-03,2022-02-24,1.0,45302.420621,2.0,950.523166,0.145554,29.0,50.0,13356.416566,IT,GBP,Non-Default


In [44]:
check_missng_loan_id = data.loc[data['loan_id'].isnull()]
check_missng_loan_id

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,loan_status
53,,2020-07-05,2020-08-25,0.0,30730.513133,7.0,4061.935542,0.238645,59.0,47.0,10353.529851,IT,EUR,Non-Default
112,,2022-12-04,2023-01-31,0.0,49913.460942,2.0,10552.667947,0.062663,30.0,49.0,13779.082017,Agriculture,AUD,Non-Default
154,,2020-12-17,2021-01-28,,7286.436822,8.0,8902.303883,0.166237,41.0,5.0,14347.684665,Retail,GBP,Default
217,,2021-03-29,2021-09-21,0.0,36932.134716,0.0,10657.427132,0.114187,58.0,53.0,11161.352277,Retail,EUR,Non-Default
274,,2022-10-30,2023-09-12,1.0,47893.584810,3.0,5746.335994,0.077749,55.0,28.0,14119.202242,Agriculture,AUD,Non-Default
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9462,,2021-11-23,2022-06-08,0.0,11177.525424,4.0,4738.301630,0.102162,22.0,23.0,12950.146868,Manufacturing,EUR,Non-Default
9541,,2021-09-21,2022-07-05,1.0,13923.249099,9.0,29406.178227,0.121751,21.0,33.0,-9263.268171,Retail,GBP,Non-Default
9624,,2021-05-21,2021-08-08,1.0,10207.265309,0.0,20054.453732,0.053756,41.0,55.0,18474.849927,Agriculture,ZWL,Non-Default
9753,,2021-10-14,2022-08-12,0.0,47608.665349,1.0,17688.503096,0.047802,48.0,46.0,4614.576638,Services,ZWL,Non-Default


In [45]:
def check_missng_loan_id(df:pd.DataFrame)->pd.DataFrame:
    return data.loc[data['loan_id'].isnull()]
    

In [46]:
class CheckMissingLoanId(BaseEstimator,TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        return X.loc[X['loan_id'].isnull()]
        

In [47]:
mising= CheckMissingLoanId()
mising.fit_transform(data)

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,loan_status
53,,2020-07-05,2020-08-25,0.0,30730.513133,7.0,4061.935542,0.238645,59.0,47.0,10353.529851,IT,EUR,Non-Default
112,,2022-12-04,2023-01-31,0.0,49913.460942,2.0,10552.667947,0.062663,30.0,49.0,13779.082017,Agriculture,AUD,Non-Default
154,,2020-12-17,2021-01-28,,7286.436822,8.0,8902.303883,0.166237,41.0,5.0,14347.684665,Retail,GBP,Default
217,,2021-03-29,2021-09-21,0.0,36932.134716,0.0,10657.427132,0.114187,58.0,53.0,11161.352277,Retail,EUR,Non-Default
274,,2022-10-30,2023-09-12,1.0,47893.584810,3.0,5746.335994,0.077749,55.0,28.0,14119.202242,Agriculture,AUD,Non-Default
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9462,,2021-11-23,2022-06-08,0.0,11177.525424,4.0,4738.301630,0.102162,22.0,23.0,12950.146868,Manufacturing,EUR,Non-Default
9541,,2021-09-21,2022-07-05,1.0,13923.249099,9.0,29406.178227,0.121751,21.0,33.0,-9263.268171,Retail,GBP,Non-Default
9624,,2021-05-21,2021-08-08,1.0,10207.265309,0.0,20054.453732,0.053756,41.0,55.0,18474.849927,Agriculture,ZWL,Non-Default
9753,,2021-10-14,2022-08-12,0.0,47608.665349,1.0,17688.503096,0.047802,48.0,46.0,4614.576638,Services,ZWL,Non-Default


In [48]:
num_columns = [
    "loan_amount",
    "number_of_defaults",
    "outstanding_balance",
    "interest_rate",
    "age",
    "remaining_term",
    "salary",
]

In [49]:
data.head()

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,loan_status
0,1.0,2022-05-02,2022-12-27,1.0,28982.170288,4.0,29869.899084,0.11234,38.0,49.0,4908.585021,Agriculture,USD,Non-Default
1,2.0,2022-04-21,2022-07-05,,27464.842065,5.0,27408.000182,0.128941,30.0,30.0,2283.62718,Agriculture,EUR,Non-Default
2,3.0,2021-11-11,2022-03-14,0.0,24469.175558,2.0,19078.8504,0.224119,63.0,17.0,4093.206305,Services,USD,Non-Default
3,4.0,2020-05-17,2020-10-07,1.0,36218.404504,9.0,5269.500606,0.125747,30.0,7.0,13755.258629,Retail,GBP,Default
4,5.0,2021-12-24,2022-11-26,1.0,45353.25946,6.0,22912.296526,0.142642,40.0,10.0,14089.467579,Retail,EUR,Default


In [50]:
def check_missing_values(df: pd.DataFrame):
    return data.loc[data.isnull().any(axis=1)]

In [51]:
data[data.isnull().any(axis=1)]

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,loan_status
1,2.0,2022-04-21,2022-07-05,,27464.842065,5.0,27408.000182,0.128941,30.0,30.0,2283.627180,Agriculture,EUR,Non-Default
11,12.0,2021-06-09,2021-07-27,,9899.361717,1.0,4842.795849,0.192387,27.0,38.0,-2135.138884,IT,EUR,Non-Default
34,35.0,2020-09-27,2021-01-15,,10926.266823,3.0,14616.721703,0.189646,24.0,24.0,2609.446486,Manufacturing,EUR,Non-Default
50,51.0,2020-03-13,2020-12-05,,34120.093785,0.0,9980.403155,0.243796,39.0,2.0,18455.388926,Agriculture,AUD,Default
53,,2020-07-05,2020-08-25,0.0,30730.513133,7.0,4061.935542,0.238645,59.0,47.0,10353.529851,IT,EUR,Non-Default
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10039,1997.0,2021-04-04,2021-09-16,,3563.397532,2.0,2531.808990,0.145159,40.0,47.0,6748.996043,Manufacturing,AUD,Default
10072,3503.0,2020-01-27,2020-10-28,,20812.338365,9.0,1943.779183,0.109077,37.0,37.0,15452.659139,Retail,ZWL,Non-Default
10075,5934.0,2021-05-22,2022-02-22,,7435.446130,9.0,10175.138972,0.085238,53.0,3.0,-9553.357284,Agriculture,USD,Default
10089,9346.0,2020-01-25,2021-01-24,,7979.418838,0.0,4575.428499,0.027103,30.0,1.0,19884.474437,Services,AUD,Non-Default


In [52]:
class CheckMissingValues(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.errors = pd.DataFrame()

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        missing_counts = X.isnull().sum()
        missing_counts = missing_counts[missing_counts > 0]
        self.errors = (
            missing_counts.to_frame(name="Missing Values")
            .rename_axis("Column")
            .reset_index()
            .sort_values("Missing Values", ascending=False)
        )
        return self.errors

In [53]:
missing = CheckMissingValues()
frms = missing.fit_transform(data)
frms

Unnamed: 0,Column,Missing Values
1,is_employed,483
0,loan_id,100
7,remaining_term,2
4,outstanding_balance,2
3,number_of_defaults,1
2,loan_amount,1
5,interest_rate,1
6,age,1


In [54]:
check_missing_values = data.loc[data.isnull().any(axis=1)]
check_missing_values

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,loan_status
1,2.0,2022-04-21,2022-07-05,,27464.842065,5.0,27408.000182,0.128941,30.0,30.0,2283.627180,Agriculture,EUR,Non-Default
11,12.0,2021-06-09,2021-07-27,,9899.361717,1.0,4842.795849,0.192387,27.0,38.0,-2135.138884,IT,EUR,Non-Default
34,35.0,2020-09-27,2021-01-15,,10926.266823,3.0,14616.721703,0.189646,24.0,24.0,2609.446486,Manufacturing,EUR,Non-Default
50,51.0,2020-03-13,2020-12-05,,34120.093785,0.0,9980.403155,0.243796,39.0,2.0,18455.388926,Agriculture,AUD,Default
53,,2020-07-05,2020-08-25,0.0,30730.513133,7.0,4061.935542,0.238645,59.0,47.0,10353.529851,IT,EUR,Non-Default
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10039,1997.0,2021-04-04,2021-09-16,,3563.397532,2.0,2531.808990,0.145159,40.0,47.0,6748.996043,Manufacturing,AUD,Default
10072,3503.0,2020-01-27,2020-10-28,,20812.338365,9.0,1943.779183,0.109077,37.0,37.0,15452.659139,Retail,ZWL,Non-Default
10075,5934.0,2021-05-22,2022-02-22,,7435.446130,9.0,10175.138972,0.085238,53.0,3.0,-9553.357284,Agriculture,USD,Default
10089,9346.0,2020-01-25,2021-01-24,,7979.418838,0.0,4575.428499,0.027103,30.0,1.0,19884.474437,Services,AUD,Non-Default


In [55]:
def check_invalid_dates(df: pd.DataFrame):
    dates_invalid = df[df["disbursement_date"] > df["expire_date"]]
    return dates_invalid

In [56]:
dates_invalid = data[data["disbursement_date"] > data["expire_date"]]
dates_invalid

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,loan_status


In [57]:
mandatory_columns = [
    "loan_id",
    "disbursement_date",
    "expire_date",
    "is_employed",
    "loan_amount",
    "number_of_defaults",
    "outstanding_balance",
    "interest_rate",
    "age",
    "remaining_term",
    "salary",
    "sector",
    "currency",
    "employee_sector",
    "status",
    "loan_status",
]

In [58]:
def check_mandatory_columns(df: pd.DataFrame, mandatory_columns: list) -> list:
    missing_columns = [col for col in mandatory_columns if col not in df.columns]
    if missing_columns:
        return missing_columns
    else:
        return None

In [59]:
class MandatoryColumns(BaseEstimator, TransformerMixin):
    def __init__(self, mandatory_columns):
        self.mandatory_columns = mandatory_columns
        self.errors = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        self.errors = [col for col in self.mandatory_columns if col not in X.columns]
        return X

    def get_errors(self):
        return self.errors

In [60]:
num_columns_ck = ["loan_amount", "interest_rate", "age", "salary"]

In [61]:
conditions = [(data[col] < 0) | (data[col] == 0) for col in num_columns_ck]
mask = pd.concat(conditions, axis=1).any(axis=1)
check_negative_amounts_and_zeros = data[mask]
check_negative_amounts_and_zeros

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,loan_status
5,6.0,2021-05-01,2022-04-19,1.0,5974.506204,0.0,2492.023710,0.014792,63.0,47.0,-1529.950804,IT,ZWL,Non-Default
6,7.0,2022-01-15,2022-10-11,1.0,21934.173342,0.0,9747.824979,0.143750,22.0,36.0,-8570.523443,IT,ZWL,Non-Default
9,10.0,2020-03-28,2021-03-09,0.0,41763.628792,0.0,5985.145076,0.110431,21.0,31.0,-2405.424979,Agriculture,USD,Non-Default
10,11.0,2022-04-10,2022-08-13,1.0,31032.257305,8.0,8698.407086,0.098271,22.0,29.0,-2592.699823,Retail,AUD,Non-Default
11,12.0,2021-06-09,2021-07-27,,9899.361717,1.0,4842.795849,0.192387,27.0,38.0,-2135.138884,IT,EUR,Non-Default
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10087,5895.0,2022-08-24,2022-10-08,1.0,3284.087441,7.0,5478.738634,0.189075,20.0,3.0,-3379.838716,IT,USD,Non-Default
10092,6421.0,2020-10-03,2021-03-29,,49500.123014,9.0,9181.774370,0.019188,37.0,31.0,-2963.067074,IT,GBP,Non-Default
10093,4583.0,2020-10-15,2021-03-16,0.0,6672.876089,2.0,1034.449852,0.108230,27.0,46.0,-5506.703306,Retail,AUD,Default
10095,9572.0,2020-04-08,2021-03-06,1.0,44092.703584,8.0,6125.137358,0.200122,28.0,22.0,-6502.574603,Manufacturing,ZWL,Non-Default


In [62]:
def check_negative_amounts_and_zeros(df: pd.DataFrame):
    conditions = [(data[col] < 0) | (data[col] == 0) for col in num_columns_ck]
    mask = pd.concat(conditions, axis=1).any(axis=1)
    negative_amounts_and_zeros = data[mask]
    negative_amounts_and_zeros
    return negative_amounts_and_zeros

In [63]:
data[num_columns] = data[num_columns].apply(pd.to_numeric, errors="coerce")
not_converted_num = data.loc[data[num_columns].isnull().any(axis=1)]

In [64]:
def converted_num(df: pd.DataFrame):
    num_columns = [
        "loan_amount",
        "number_of_defaults",
        "outstanding_balance",
        "interest_rate",
        "age",
        "remaining_term",
        "salary",
    ]
    data[num_columns] = data[num_columns].apply(pd.to_numeric, errors="coerce")
    not_converted_num = data.loc[data[num_columns].isnull().any(axis=1)]
    return not_converted_num

In [65]:
not_converted_num

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,loan_status
427,428.0,2020-03-08,2021-01-22,0.0,49449.532816,,18552.233004,0.043065,35.0,25.0,19121.373969,IT,USD,Non-Default
1159,1160.0,2021-02-07,2021-04-03,1.0,30245.723605,3.0,4089.793095,,30.0,28.0,-6032.421389,Agriculture,AUD,Non-Default
2495,2496.0,2021-01-30,2021-11-24,1.0,17358.037769,1.0,6242.926831,0.194841,51.0,,8746.591592,Agriculture,GBP,Default
2498,2499.0,2021-07-21,2022-03-02,,48750.015415,6.0,,0.02808,27.0,6.0,7509.425339,Retail,AUD,Default
2837,2838.0,2020-06-15,2021-03-05,1.0,23774.989832,4.0,25581.870683,0.167046,31.0,,-9841.307132,Retail,AUD,Non-Default
3241,3242.0,2021-03-14,2022-01-04,1.0,10968.860177,4.0,22488.176453,0.21383,,11.0,6929.077248,Retail,AUD,Non-Default
5483,5484.0,2022-02-13,2022-12-12,0.0,47403.951535,2.0,,0.040812,44.0,7.0,11909.320534,Manufacturing,EUR,Non-Default
7381,7382.0,2021-10-17,2022-04-06,1.0,,6.0,16856.131988,0.226779,32.0,55.0,19476.052291,Retail,EUR,Default


In [66]:
class CheckMissingLoanId(BaseEstimator,TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        return X.loc[X['loan_id'].isnull()]

In [67]:
dates_columns = loan_data.filter(regex="date").columns
category_columns = loan_data.select_dtypes("object").columns

In [68]:
data[dates_columns] = data[dates_columns].applymap(lambda x: pd.to_datetime(x, format="%d/%m/%Y", errors="coerce"))

  data[dates_columns] = data[dates_columns].applymap(lambda x: pd.to_datetime(x, format="%d/%m/%Y", errors="coerce"))


In [69]:
not_converted_dates = data.loc[data[dates_columns].isnull().any(axis=1)]
not_converted_dates

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,loan_status


In [70]:
def converted_dates(df: pd.DataFrame) -> pd.DataFrame:
    dates_columns = df.filter(regex="date").columns
    df[dates_columns] = df[dates_columns].apply(
        lambda col: pd.to_datetime(col, format="%d/%m/%Y", errors="coerce")
    )
    not_converted_dates = df[df[dates_columns].isnull().any(axis=1)]
    return not_converted_dates

In [71]:
class DateConverter(BaseEstimator, TransformerMixin):
    def __init__(self, date_formats=None):
        self.errors = None
        self.date_formats = date_formats or [
            "%d/%m/%Y",
            "%Y-%m-%d",
            "%m/%d/%Y",
            "%d-%m-%Y",
            "%Y.%m.%d",
        ]

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        dates_columns = X.filter(regex="date").columns
        X_temp = X.copy()

        not_converted_dates = (
            X_temp[dates_columns]
            .apply(
                lambda col: pd.to_datetime(
                    col, format=self.date_formats[0], errors="coerce"
                )
            )
            .loc[lambda df: df.isnull().any(axis=1)]
        )

        self.errors = not_converted_dates
        return not_converted_dates

In [72]:
mandatory_columns

['loan_id',
 'disbursement_date',
 'expire_date',
 'is_employed',
 'loan_amount',
 'number_of_defaults',
 'outstanding_balance',
 'interest_rate',
 'age',
 'remaining_term',
 'salary',
 'sector',
 'currency',
 'employee_sector',
 'status',
 'loan_status']

In [73]:
class ConvertedNumeric(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X):
        return self

    def transform(self, X):
        X = data.copy()
        num_columns = [
            "loan_amount",
            "number_of_defaults",
            "outstanding_balance",
            "interest_rate",
            "age",
            "remaining_term",
            "salary",
        ]
        X[num_columns] = X[num_columns].apply(pd.to_numeric, errors="coerce")
        not_converted_num = X.loc[X[num_columns].isnull().any(axis=1)]
        return not_converted_num

In [74]:
class CheckNegativeAmountsAndZeros(BaseEstimator, TransformerMixin):
    def __init__(self, num_columns_ck=None):
        self.num_columns_ck = num_columns_ck or [
            "loan_amount",
            "interest_rate",
            "age",
            "salary",
        ]
        self.errors = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        mask = X[self.num_columns_ck].apply(lambda col: (col <= 0)).any(axis=1)
        negative_amounts_and_zeros = X[mask]
        return negative_amounts_and_zeros

In [75]:
class CheckDuplicates(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.errors = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        duplicates = X[X.duplicated(keep=False)].sort_values("loan_id")
        return duplicates

In [76]:
data[data["disbursement_date"] > data["expire_date"]]

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,loan_status


In [77]:
class CheckInvalidDates(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.errors = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        invalid_dates = X[X["disbursement_date"] > X["expire_date"]]
        self.errors = invalid_dates
        return invalid_dates

In [78]:
data.columns

Index(['loan_id', 'disbursement_date', 'expire_date', 'is_employed',
       'loan_amount', 'number_of_defaults', 'outstanding_balance',
       'interest_rate', 'age', 'remaining_term', 'salary', 'sector',
       'currency', 'loan_status'],
      dtype='object')

In [79]:
pipeline = Pipeline(
    [
        ("mandatory_columns", MandatoryColumns(mandatory_columns=mandatory_columns)),
        ('check_missing_loan_id',CheckMissingLoanId()),
        ("check_missing_values", CheckMissingValues()),
        ("date_converter", DateConverter()),
        # ('check_invalid_dates', CheckInvalidDates()),
        ("convert_numeric", ConvertedNumeric()),
        ("check_negative_amounts_and_zeros", CheckNegativeAmountsAndZeros()),
        ("check_duplicates", CheckDuplicates()),
    ]
)

In [80]:
pipeline.fit(data)

In [81]:
df_cleaned = data.copy()
pipeline.fit(df_cleaned)
pipeline.transform(df_cleaned)

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,loan_status


In [82]:
with pd.ExcelWriter("data_issues.xlsx") as writer:
  
    mandatory_errors = pipeline.named_steps["mandatory_columns"].get_errors()
    if mandatory_errors is not None:
        pd.DataFrame(mandatory_errors, columns=["Missing Mandatory Columns"]).to_excel(
            writer, sheet_name="Missing Mandatory Columns"
        )
    else:
        pd.DataFrame(columns=["Missing Mandatory Columns"]).to_excel(
            writer, sheet_name="Missing Mandatory Columns", index=False
        )
    
  
    missing_loan_id = pipeline.named_steps["check_missing_loan_id"].transform(df_cleaned)
    if missing_loan_id is not None and not missing_loan_id.empty:
        missing_loan_id.to_excel(writer, sheet_name="Missing Loan IDs")
    else:
        pd.DataFrame(columns=["Missing Loan IDs"]).to_excel(
            writer, sheet_name="Missing Loan IDs", index=False
        )
        
    missing_values = pipeline.named_steps["check_missing_values"].errors
    if missing_values is not None and not missing_values.empty:
        missing_values.to_excel(writer, sheet_name="Missing Values")
    else:
        pd.DataFrame(columns=["Column", "Missing Values"]).to_excel(
            writer, sheet_name="Missing Values", index=False
        )
   
    invalid_dates = pipeline.named_steps["date_converter"].errors
    if invalid_dates is not None and not invalid_dates.empty:
        invalid_dates.to_excel(writer, sheet_name="Invalid Date Conversion")
    else:
        pd.DataFrame(columns=["Invalid Dates"]).to_excel(
            writer, sheet_name="Invalid Date Conversion", index=False
        )
    
   
    numeric_conversion_issues = pipeline.named_steps["convert_numeric"].transform(df_cleaned)
    if numeric_conversion_issues is not None and not numeric_conversion_issues.empty:
        numeric_conversion_issues.to_excel(
            writer, sheet_name="Numeric Conversion Issues"
        )
    else:
        pd.DataFrame(columns=["Numeric Conversion Issues"]).to_excel(
            writer, sheet_name="Numeric Conversion Issues", index=False
        )

    negative_amounts_and_zeros = pipeline.named_steps[
        "check_negative_amounts_and_zeros"
    ].transform(df_cleaned)
    if negative_amounts_and_zeros is not None and not negative_amounts_and_zeros.empty:
        negative_amounts_and_zeros.to_excel(
            writer, sheet_name="Negative Amounts and Zeros"
        )
    else:
        pd.DataFrame(columns=["Negative Amounts and Zeros"]).to_excel(
            writer, sheet_name="Negative Amounts and Zeros", index=False
        )
    
    duplicates = pipeline.named_steps["check_duplicates"].transform(df_cleaned)
    if duplicates is not None and not duplicates.empty:
        duplicates.to_excel(writer, sheet_name="Duplicates")
    else:
        pd.DataFrame(columns=["Duplicates"]).to_excel(
            writer, sheet_name="Duplicates", index=False
        )


