In [214]:
import pandas as pd
import numpy as np
from faker import Faker
import random
import openpyxl
from sklearn.pipeline import Pipeline
from datetime import datetime, timedelta
from sklearn.base import BaseEstimator, TransformerMixin

# **Generate Fake Data**

In [215]:
def random_date(start, end):
    return start + timedelta(days=random.randint(0, int((end - start).days)))


np.random.seed(42)
num_records = 10000

loan_ids = np.arange(1, num_records + 1)
disbursement_dates = [
    random_date(datetime(2020, 1, 1), datetime(2023, 1, 1)) for _ in range(num_records)
]
expire_dates = [d + timedelta(days=random.randint(30, 365)) for d in disbursement_dates]

is_employed = np.random.choice([True, False, np.nan], num_records, p=[0.7, 0.25, 0.05])
loan_amounts = np.random.choice(
    [np.nan, *np.random.uniform(1000, 50000, num_records)], num_records
)
number_of_defaults = np.random.choice(
    [np.nan, *np.random.randint(0, 10, num_records)], num_records
)
outstanding_balances = np.random.choice(
    [np.nan, *np.random.uniform(0, 30000, num_records)], num_records
)
interest_rates = np.random.choice(
    [np.nan, *np.random.uniform(0.01, 0.25, num_records)], num_records
)
ages = np.random.choice([np.nan, *np.random.randint(18, 65, num_records)], num_records)
remaining_terms = np.random.choice(
    [np.nan, *np.random.randint(1, 60, num_records)], num_records
)
salaries = np.random.choice(
    [np.nan, *np.random.uniform(1000, 20000, num_records)], num_records
)
loan_statuses = np.random.choice(
    ["Default", "Non-Default", np.nan], num_records, p=[0.3, 0.65, 0.05]
)
sectors = np.random.choice(
    ["Agriculture", "Manufacturing", "Services", "IT", "Retail"], num_records
)
currencies = np.random.choice(["USD", "EUR", "ZWL", "GBP", "AUD"], num_records)
employee_sectors = np.random.choice(
    ["Public", "Private", "Self-employed", "Unemployed"], num_records
)
statuses = np.random.choice(["Active", "Inactive"], num_records)

loan_data = pd.DataFrame(
    {
        "loan_id": loan_ids,
        "disbursement_date": disbursement_dates,
        "expire_date": expire_dates,
        "is_employed": is_employed,
        "loan_amount": loan_amounts,
        "number_of_defaults": number_of_defaults,
        "outstanding_balance": outstanding_balances,
        "interest_rate": interest_rates,
        "age": ages,
        "remaining_term": remaining_terms,
        "salary": salaries,
        "sector": sectors,
        "currency": currencies,
        "employee_sector": employee_sectors,
        "status": statuses,
        "loan_status": loan_statuses,
    }
)

In [216]:
loan_data.head(3)

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,employee_sector,status,loan_status
0,1,2021-06-13,2021-12-25,1.0,29806.398905,4.0,29869.899084,0.11234,38.0,49.0,10442.103846,Agriculture,USD,Unemployed,Active,Non-Default
1,2,2022-06-11,2023-02-08,,28348.573748,5.0,27408.000182,0.128941,30.0,30.0,8779.630547,Agriculture,EUR,Unemployed,Active,Non-Default
2,3,2021-03-10,2021-05-05,0.0,25470.384359,2.0,19078.8504,0.224119,63.0,17.0,9925.697326,Services,USD,Private,Inactive,Non-Default


In [217]:
loan_data.shape

(10000, 16)

# Data Cleanign steps

- Data cleaning is the process of ensuring that data is in the proper format, *making it suitable for analysis and modeling*

### 1.Check missing values 

In [218]:
loan_data.isnull().sum()

loan_id                  0
disbursement_date        0
expire_date              0
is_employed            474
loan_amount              1
number_of_defaults       1
outstanding_balance      2
interest_rate            1
age                      1
remaining_term           2
salary                   0
sector                   0
currency                 0
employee_sector          0
status                   0
loan_status              0
dtype: int64

In [219]:
category_columns = loan_data.select_dtypes("number").columns
category_columns

Index(['loan_id', 'is_employed', 'loan_amount', 'number_of_defaults',
       'outstanding_balance', 'interest_rate', 'age', 'remaining_term',
       'salary'],
      dtype='object')

In [220]:
category_columns = loan_data.select_dtypes("object").columns
category_columns

Index(['sector', 'currency', 'employee_sector', 'status', 'loan_status'], dtype='object')

In [221]:
data = loan_data.copy()

In [222]:
duplicates = data.loc[data.duplicated(keep=False)].sort_values("loan_id")
duplicates

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,employee_sector,status,loan_status


In [223]:
def duplicates_check(df: pd.DataFrame):
    return data.loc[data.duplicated(keep=False)].sort_values("loan_id")

In [224]:
num_columns = [
    "loan_amount",
    "number_of_defaults",
    "outstanding_balance",
    "interest_rate",
    "age",
    "remaining_term",
    "salary",
]

In [225]:
data.head()

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,employee_sector,status,loan_status
0,1,2021-06-13,2021-12-25,1.0,29806.398905,4.0,29869.899084,0.11234,38.0,49.0,10442.103846,Agriculture,USD,Unemployed,Active,Non-Default
1,2,2022-06-11,2023-02-08,,28348.573748,5.0,27408.000182,0.128941,30.0,30.0,8779.630547,Agriculture,EUR,Unemployed,Active,Non-Default
2,3,2021-03-10,2021-05-05,0.0,25470.384359,2.0,19078.8504,0.224119,63.0,17.0,9925.697326,Services,USD,Private,Inactive,Non-Default
3,4,2022-08-06,2023-05-24,1.0,36758.859229,9.0,5269.500606,0.125747,30.0,7.0,16044.997132,Retail,GBP,Private,Active,Default
4,5,2021-09-02,2021-12-08,1.0,45535.484579,6.0,22912.296526,0.142642,40.0,10.0,16256.6628,Retail,EUR,Self-employed,Inactive,Default


In [226]:
def check_missing_values(df:pd.DataFrame):
    return data.loc[data.isnull().any(axis=1)]

In [227]:
class CheckMissingValues(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.errors = pd.DataFrame()
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        missing_counts = X.isnull().sum()
        missing_counts = missing_counts[missing_counts > 0]
        self.errors = pd.DataFrame({
            'Column': missing_counts.index,
            'Missing Values': missing_counts.values
        })
        
        return self.errors

In [228]:
missing = CheckMissingValues()
frms = missing.fit_transform(data)
frms

Unnamed: 0,Column,Missing Values
0,is_employed,474
1,loan_amount,1
2,number_of_defaults,1
3,outstanding_balance,2
4,interest_rate,1
5,age,1
6,remaining_term,2


In [229]:
check_missing_values = data.loc[data.isnull().any(axis=1)]
check_missing_values

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,employee_sector,status,loan_status
1,2,2022-06-11,2023-02-08,,28348.573748,5.0,27408.000182,0.128941,30.0,30.0,8779.630547,Agriculture,EUR,Unemployed,Active,Non-Default
11,12,2022-11-20,2023-06-05,,11471.935768,1.0,4842.795849,0.192387,27.0,38.0,5981.078707,IT,EUR,Unemployed,Inactive,Non-Default
34,35,2022-01-27,2022-09-30,,12458.570085,3.0,14616.721703,0.189646,24.0,24.0,8985.982774,Manufacturing,EUR,Public,Inactive,Non-Default
50,51,2020-11-28,2021-06-08,,34742.835205,0.0,9980.403155,0.243796,39.0,2.0,19021.746320,Agriculture,AUD,Self-employed,Active,Default
69,70,2022-05-12,2022-12-25,,24528.498651,7.0,15711.060298,0.209259,27.0,14.0,3521.346822,IT,USD,Private,Inactive,Non-Default
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9851,9852,2022-02-02,2022-12-25,,6735.032652,5.0,26000.942849,0.073461,44.0,59.0,8351.716821,Manufacturing,USD,Unemployed,Inactive,Non-Default
9927,9928,2021-07-20,2022-06-14,,37890.602781,6.0,2531.808990,0.090598,55.0,13.0,16555.449232,Retail,GBP,Unemployed,Inactive,
9971,9972,2022-08-05,2023-08-01,,15641.350140,9.0,10288.396845,0.160507,36.0,40.0,5218.878111,IT,ZWL,Self-employed,Active,Non-Default
9972,9973,2020-01-13,2020-06-05,,28699.159038,4.0,12616.283408,0.231205,23.0,32.0,19874.948375,IT,GBP,Unemployed,Active,Non-Default


In [230]:
def check_invalid_dates(df: pd.DataFrame):
    dates_invalid = df[df["disbursement_date"] > df["expire_date"]]
    return dates_invalid

In [231]:
dates_invalid = data[data["disbursement_date"] > data["expire_date"]]
dates_invalid

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,employee_sector,status,loan_status


In [232]:
mandatory_columns = ['loan_id', 'disbursement_date', 'expire_date', 'is_employed',
       'loan_amount', 'number_of_defaults', 'outstanding_balance',
       'interest_rate', 'age', 'remaining_term', 'salary', 'sector',
       'currency', 'employee_sector', 'status', 'loan_status']

In [233]:
def check_mandatory_columns(df: pd.DataFrame, mandatory_columns: list) -> list:
    missing_columns = [col for col in mandatory_columns if col not in df.columns]
    if missing_columns:
        return missing_columns
    else:
        return None



In [234]:
class MandatoryColumns(BaseEstimator, TransformerMixin):
    def __init__(self, mandatory_columns):
        self.mandatory_columns = mandatory_columns
        self.errors = None
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        missing_columns = [col for col in self.mandatory_columns if col not in X.columns]
        if missing_columns:
            self.errors = missing_columns
        else:
            self.errors = []
        return X 

    def get_errors(self):
        return self.errors


In [235]:
num_columns_ck = ["loan_amount", "interest_rate", "age", "salary"]

In [236]:
conditions = [(data[col] < 0) | (data[col] == 0) for col in num_columns_ck]
mask = pd.concat(conditions, axis=1).any(axis=1)
check_negative_amounts_and_zeros = data[mask]
check_negative_amounts_and_zeros

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,employee_sector,status,loan_status


In [237]:
def check_negative_amounts_and_zeros(df: pd.DataFrame):
    conditions = [(data[col] < 0) | (data[col] == 0) for col in num_columns_ck]
    mask = pd.concat(conditions, axis=1).any(axis=1)
    negative_amounts_and_zeros = data[mask]
    negative_amounts_and_zeros
    return negative_amounts_and_zeros

In [238]:
data[num_columns] = data[num_columns].apply(pd.to_numeric, errors="coerce")
not_converted_num = data.loc[data[num_columns].isnull().any(axis=1)]

In [239]:
def converted_num(df: pd.DataFrame):
    num_columns = [
        "loan_amount",
        "number_of_defaults",
        "outstanding_balance",
        "interest_rate",
        "age",
        "remaining_term",
        "salary",
    ]
    data[num_columns] = data[num_columns].apply(pd.to_numeric, errors="coerce")
    not_converted_num = data.loc[data[num_columns].isnull().any(axis=1)]
    return not_converted_num

In [240]:
not_converted_num

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,employee_sector,status,loan_status
427,428,2022-07-29,2022-11-20,0.0,49471.119764,,18552.233004,0.043065,35.0,25.0,19443.536847,IT,USD,Private,Active,Non-Default
1159,1160,2022-12-08,2023-03-22,1.0,31020.40111,3.0,4089.793095,,30.0,28.0,3512.799787,Agriculture,AUD,Private,Inactive,Non-Default
2495,2496,2021-05-11,2021-06-18,1.0,18638.114719,1.0,6242.926831,0.194841,51.0,,12872.841341,Agriculture,GBP,Unemployed,Active,Default
2498,2499,2022-07-05,2022-09-04,,48799.034418,6.0,,0.02808,27.0,6.0,12089.302714,Retail,AUD,Private,Inactive,Default
2837,2838,2021-05-17,2022-05-05,1.0,24803.421603,4.0,25581.870683,0.167046,31.0,,1100.505483,Retail,AUD,Unemployed,Inactive,Non-Default
3241,3242,2022-02-21,2022-08-26,1.0,12499.493111,4.0,22488.176453,0.21383,,11.0,11721.748924,Retail,AUD,Unemployed,Active,Non-Default
5483,5484,2022-07-21,2022-09-20,0.0,47505.757357,2.0,,0.040812,44.0,7.0,14875.903005,Manufacturing,EUR,Private,Active,Non-Default
7381,7382,2020-09-13,2021-02-09,1.0,,6.0,16856.131988,0.226779,32.0,55.0,19668.166451,Retail,EUR,Self-employed,Active,Default


In [241]:
dates_columns = loan_data.filter(regex="date").columns
category_columns = loan_data.select_dtypes("object").columns

In [242]:
for column in dates_columns:
    data[column] = pd.to_datetime(data[column], format="%d/%m/%Y", errors="coerce")

In [243]:
not_converted_dates = data.loc[data[dates_columns].isnull().any(axis=1)]
not_converted_dates

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,employee_sector,status,loan_status


In [244]:
def converted_dates(df: pd.DataFrame):
    dates_columns = df.filter(regex="date").columns
    for column in dates_columns:
        df[column] = pd.to_datetime(df[column], format="%d/%m/%Y", errors="coerce")
        not_converted_dates = data.loc[data[dates_columns].isnull().any(axis=1)]
        return not_converted_dates

In [245]:
class DateConverter(BaseEstimator, TransformerMixin):
    def __init__(self, date_formats=None):
        self.errors = None
        self.date_formats = (
            date_formats
            if date_formats is not None
            else ["%d/%m/%Y", "%Y-%m-%d", "%m/%d/%Y", "%d-%m-%Y", "%Y.%m.%d"]
        )

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        dates_columns = X.filter(regex="date").columns
        X_temp = X.copy()
        not_converted_dates = pd.DataFrame()

        for date_format in self.date_formats:
            try:
                X_temp[dates_columns] = X_temp[dates_columns].apply(
                    pd.to_datetime, format=date_format, errors="coerce"
                )
            except Exception:
                continue
            if X_temp[dates_columns].isnull().any().any():
                not_converted_dates = X_temp[X_temp[dates_columns].isnull().any(axis=1)]
                if not not_converted_dates.empty:
                    break
            else:
                break

        return not_converted_dates

In [246]:
class ConvertedNumeric(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X):
        return self

    def transform(self, X):
        X = data.copy()
        num_columns = [
            "loan_amount",
            "number_of_defaults",
            "outstanding_balance",
            "interest_rate",
            "age",
            "remaining_term",
            "salary",
        ]
        X[num_columns] = X[num_columns].apply(pd.to_numeric, errors="coerce")
        not_converted_num = X.loc[X[num_columns].isnull().any(axis=1)]
        return not_converted_num

In [247]:
class CheckNegativeAmountsAndZerosAmounts(BaseEstimator, TransformerMixin):
    def __init__(self, num_columns_ck=None):
        if num_columns_ck is None:
            num_columns_ck = ["loan_amount", "interest_rate", "age", "salary"]
        self.num_columns_ck = num_columns_ck
        self.errors = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            raise ValueError
        missing_cols = [col for col in self.num_columns_ck if col not in X.columns]
        if missing_cols:
            raise ValueError(f"Missing columns: {', '.join(missing_cols)}")
        conditions = [(X[col] < 0) | (X[col] == 0) for col in self.num_columns_ck]
        mask = pd.concat(conditions, axis=1).any(axis=1)
        negative_amounts_and_zeros = X[mask]
        return negative_amounts_and_zeros

In [248]:
class CheckDuplicates(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.errors = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            raise ValueError
        duplicates = X.loc[X.duplicated(keep=False)].sort_values("loan_id")
        return duplicates

In [250]:
class CheckInvalidDates(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.errors = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        dates_columns = X[X["disbursement_date"] > X["expire_date"]]
        self.errors = dates_columns
        return dates_columns

In [251]:
pipeline = Pipeline([
    ('mandatory_columns', MandatoryColumns(mandatory_columns=mandatory_columns)),
    ('check_missing_values', CheckMissingValues()),
    ('date_converter', DateConverter()),
    #('check_invalid_dates', CheckInvalidDates()),
    ('convert_numeric', ConvertedNumeric()),
    ('check_negative_amounts_and_zeros', CheckNegativeAmountsAndZerosAmounts()),
    ('check_duplicates', CheckDuplicates()),
])


In [253]:
df_cleaned = data.copy()
pipeline.fit(df_cleaned)
pipeline.transform(df_cleaned)

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,sector,currency,employee_sector,status,loan_status


In [261]:
with pd.ExcelWriter('data_issues.xlsx') as writer:
    mandatory_errors = pipeline.named_steps['mandatory_columns'].get_errors()
    if mandatory_errors is not None:
        pd.DataFrame(mandatory_errors, columns=['Missing Mandatory Columns']).to_excel(writer, sheet_name='Missing Mandatory Columns')
    else:
        pd.DataFrame(columns=['Missing Mandatory Columns']).to_excel(writer, sheet_name='Missing Mandatory Columns',index=False)

    missing_values = pipeline.named_steps['check_missing_values'].errors
    if missing_values is not None and not missing_values.empty:
        missing_values.to_excel(writer, sheet_name='Missing Values')
    else:
        pd.DataFrame(columns=['Column', 'Missing Values']).to_excel(writer, sheet_name='Missing Values',index=False)

    invalid_dates = pipeline.named_steps['date_converter'].errors
    if invalid_dates is not None and not invalid_dates.empty:
        invalid_dates.to_excel(writer, sheet_name='Invalid Date Conversion')
    else:
        pd.DataFrame(columns=['Invalid Dates']).to_excel(writer, sheet_name='Invalid Date Conversion',index=False)

    numeric_conversion_issues = pipeline.named_steps['convert_numeric'].transform(df_cleaned)
    if numeric_conversion_issues is not None and not numeric_conversion_issues.empty:
        numeric_conversion_issues.to_excel(writer, sheet_name='Numeric Conversion Issues')
    else:
        pd.DataFrame(columns=['Numeric Conversion Issues']).to_excel(writer, sheet_name='Numeric Conversion Issues',index=False)

    negative_amounts_and_zeros = pipeline.named_steps['check_negative_amounts_and_zeros'].transform(df_cleaned)
    if negative_amounts_and_zeros is not None and not negative_amounts_and_zeros.empty:
        negative_amounts_and_zeros.to_excel(writer, sheet_name='Negative Amounts and Zeros')
    else:
        pd.DataFrame(columns=['Negative Amounts and Zeros']).to_excel(writer, sheet_name='Negative Amounts and Zeros',index=False)

    duplicates = pipeline.named_steps['check_duplicates'].transform(df_cleaned)
    if duplicates is not None and not duplicates.empty:
        duplicates.to_excel(writer, sheet_name='Duplicates')
    else:
        pd.DataFrame(columns=['Duplicates']).to_excel(writer, sheet_name='Duplicates',index=False)
