In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw credit records; the checks below add boolean flag columns to this frame.
data = pd.read_csv('3_credit_data.csv')

# 1. Verify data consistency
# Per-column count of missing cells, and the number of fully duplicated rows.
missing_values = data.isna().sum()
duplicate_values = data.duplicated().sum()
print('Counting missing values:', missing_values, sep='\n')
print('Counting duplicated values:', duplicate_values, sep='\n')

# 2. Verify data is reasonable
# One boolean mask per plausibility rule, keyed by the flag column it becomes.
# Comparisons against NaN evaluate to False, so a row with a missing Age or
# Income fails the corresponding rule instead of raising.
validity_rules = {
    'is_age_valid': data['Age'].between(18, 70),
    'is_income_valid': data['Income'].gt(2000),
    'is_loan_amount_valid': data['LoanAmount'].lt(data['Income'].mul(5)),
    'is_credit_score_valid': data['CreditScore'].between(300, 850),
}
for flag_name, flag_values in validity_rules.items():
    data[flag_name] = flag_values

# Names of the per-rule flag columns, in insertion order.
to_be_verified_data_labels = list(validity_rules)

# A row is valid only when every individual rule holds.
validity_checks = data[to_be_verified_data_labels].all(axis=1)
data['is_valid'] = validity_checks

print('Verify data is reasonable')
print(data[to_be_verified_data_labels].describe())

Counting missing values:
CustomerID            0
Name                  0
Age                   1
Income                1
LoanAmount            0
LoanTerm              0
CreditScore           0
Default               0
TransactionHistory    0
dtype: int64
Counting duplicated values:
0
Verify data is reasonable
       is_age_valid is_income_valid is_loan_amount_valid is_credit_score_valid
count          1000            1000                 1000                  1000
unique            2               2                    2                     1
top            True            True                 True                  True
freq            999             999                  796                  1000


In [5]:
# 3. Data cleaning and abnormal value handling
# Split the frame on the combined `is_valid` flag: rows failing any rule are
# kept aside in `invalid_rows`, the rest form the cleaned data set.
invalid_rows = data[~data['is_valid']]
cleaned_data = data[data['is_valid']]

# Remove every helper flag before exporting. This includes `is_valid` itself,
# which is constant True after filtering and is not part of the original data.
helper_columns = [*to_be_verified_data_labels, 'is_valid']
cleaned_data = cleaned_data.drop(columns=helper_columns)

# Write the cleaned records without the pandas index column.
cleaned_data.to_csv('3_credit_data_cleaned.csv', index=False)
print("Data cleaning complete. Saved as '3_credit_data_cleaned.csv'")

Data cleaning complete. Saved as '3_credit_data_cleaned.csv'
