In [90]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

df = pd.read_csv('all_data.csv', delimiter=';')
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

# drop Name and SSN columns
df = df.drop(['Name', 'SSN'], axis=1)

# Replace all empty cells with NaN in the entire dataframe
df.replace([None], np.nan, inplace=True)

  df = pd.read_csv('all_data.csv', delimiter=';')


In [None]:
# This was run to create the output.txt file. No need to run it again.
"""
columns_list = df.columns.tolist()
columns_list.remove('Customer_ID')
columns_list.remove("ID")
columns_list.remove("Month")

for column in columns_list:
    # Iterate over each unique Customer_ID
    for customer_id in df['Customer_ID'].unique():
        # Get unique Annual_Income values for this Customer_ID
        unique_incomes = df[df['Customer_ID'] == customer_id][column].unique()

        # Check if there are more than one unique values
        if len(unique_incomes) > 4:
            print(f"Customer ID: {customer_id}, {column}: {unique_incomes}")"""

In [91]:
# Order entries by cutomer and month, and then reset index
month_to_num = {'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 
                'June': 6, 'July': 7, 'August': 8, 'September': 9, 
                'October': 10, 'November': 11, 'December': 12}

df["temp_Month"] = df["Month"].map(month_to_num)
df.sort_values(["Customer_ID", "temp_Month"], inplace=True)
df.drop("temp_Month", axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

In [92]:
# All columns with numeric values
numeric_column_names = ["Age", "Annual_Income", "Monthly_Inhand_Salary", "Num_Bank_Accounts", 
"Num_Credit_Card", "Interest_Rate", "Num_of_Loan", "Delay_from_due_date", 
"Num_of_Delayed_Payment", "Changed_Credit_Limit", "Num_Credit_Inquiries", 
"Outstanding_Debt", "Credit_Utilization_Ratio", "Total_EMI_per_month", 
"Amount_invested_monthly", "Monthly_Balance"]

# All columns with non-negative numeric values
non_negative_column_names = numeric_column_names.copy()
non_negative_column_names.remove("Delay_from_due_date")
non_negative_column_names.remove("Changed_Credit_Limit")

In [93]:
# Clean columns from unwanted characters 
special_chars = set()
special_char_regex = r'[^0-9]'
for column in numeric_column_names:
    # Convert to string and perform replacements
    df[column] = df[column].astype(str).str.replace(',', '.').str.replace('_', '')

    # Extract special characters from each cell and update the set
    df[column].dropna().str.findall(special_char_regex).apply(special_chars.update)

    # Convert to numeric and round to 2 decimals
    df[column] = pd.to_numeric(df[column], errors='coerce').round(2)

# Clean the Amount_invested_monthly column from an obvious error value.
df['Amount_invested_monthly'] = df['Amount_invested_monthly'].replace(10000, np.nan)

In [86]:
# This function is used to find the upper bounds of the columns.
def highest_values(column_name):
    """
    This function will give you the 10 highest values of a column where 
    the value appears more than once for a customer.
    This will identify the highest values that are most likely not error values.
    """
    income_counts = df.groupby(['Customer_ID', column_name]).size().reset_index(name='count')
    income_counts_filtered = income_counts[income_counts['count'] >= 1]
    income_counts_filtered_sorted = income_counts_filtered.sort_values(column_name, ascending=True)
    top_10_incomes = income_counts_filtered_sorted.head(10)
    print(top_10_incomes)

highest_values("Monthly_Balance")

       Customer_ID  Monthly_Balance  count
58302   CUS_0x5a90    -3.333333e+26      1
39217   CUS_0x4379    -3.333333e+26      1
126102  CUS_0xaebc    -3.333333e+26      1
93117   CUS_0x85e9    -3.333333e+26      1
16629   CUS_0x26b1    -3.333333e+26      1
20360   CUS_0x2b77    -3.333333e+26      1
108145  CUS_0x9885    -3.333333e+26      1
73367   CUS_0x6c9e    -3.333333e+26      1
23743   CUS_0x2f7e    -3.333333e+26      1
56362   CUS_0x57f3    -3.333333e+26      1


In [94]:
# Replace negative values with the mode (of the positive) values of the customer
for column in non_negative_column_names:
    mode_values = df.groupby('Customer_ID')[column].transform(lambda x: x[x >= 0].mode()[0])
    df.loc[df[column] < 0, column] = mode_values[df[column] < 0]

In [95]:
# Replace values above the upper limit with the mode value of the customer
upper_limit_dict = {"Age": 56, "Annual_Income": 179987.28, "Monthly_Inhand_Salary": 15204.63,
                  "Num_Bank_Accounts": 11, "Num_Credit_Card": 11, "Interest_Rate": 34,
                  "Num_of_Loan": 9, "Delay_from_due_date": 67, "Num_of_Delayed_Payment": 28,
                  "Changed_Credit_Limit": 34.21, "Num_Credit_Inquiries": 17, "Outstanding_Debt": 4998.07,
                  "Credit_Utilization_Ratio": 43.06, "Total_EMI_per_month": 1841.35}

for column, limit in upper_limit_dict.items():
    # Calculate the mode value for each customer_id in the specified column
    mode_values = df.groupby('Customer_ID')[column].transform(lambda x: x.mode()[0])
    # Replace values above the limit with the corresponding median value
    df.loc[df[column] > limit, column] = mode_values[df[column] > limit]

In [None]:
df.to_csv('cleaned_data.csv', index=False)

In [78]:
for column in non_negative_column_names:
    print(f"{column}_min: {df[column].min()}")
    print(f"{column}_max: {df[column].max()}")

Age_min: 14.0
Age_max: 56.0
Annual_Income_min: 7005.93
Annual_Income_max: 179987.28
Monthly_Inhand_Salary_min: 303.65
Monthly_Inhand_Salary_max: 15204.63
Num_Bank_Accounts_min: 0.0
Num_Bank_Accounts_max: 11.0
Num_Credit_Card_min: 0.0
Num_Credit_Card_max: 11.0
Interest_Rate_min: 1.0
Interest_Rate_max: 34.0
Num_of_Loan_min: 0.0
Num_of_Loan_max: 9.0
Num_of_Delayed_Payment_min: 0.0
Num_of_Delayed_Payment_max: 28.0
Num_Credit_Inquiries_min: 0.0
Num_Credit_Inquiries_max: 17.0
Outstanding_Debt_min: 0.23
Outstanding_Debt_max: 4998.07
Credit_Utilization_Ratio_min: 20.0
Credit_Utilization_Ratio_max: 43.06
Total_EMI_per_month_min: 0.0
Total_EMI_per_month_max: 1841.35
Amount_invested_monthly_min: 0.0
Amount_invested_monthly_max: 1977.33
Monthly_Balance_min: 0.01
Monthly_Balance_max: 1606.52


In [96]:
for column in non_negative_column_names:
    print(f"{column}_min: {df[column].min()}")
    print(f"{column}_max: {df[column].max()}")

Age_min: 14
Age_max: 56
Annual_Income_min: 7005.93
Annual_Income_max: 179987.28
Monthly_Inhand_Salary_min: 303.65
Monthly_Inhand_Salary_max: 15204.63
Num_Bank_Accounts_min: 0
Num_Bank_Accounts_max: 11
Num_Credit_Card_min: 0
Num_Credit_Card_max: 11
Interest_Rate_min: 1
Interest_Rate_max: 34
Num_of_Loan_min: 0
Num_of_Loan_max: 9
Num_of_Delayed_Payment_min: 0.0
Num_of_Delayed_Payment_max: 28.0
Num_Credit_Inquiries_min: 0.0
Num_Credit_Inquiries_max: 17.0
Outstanding_Debt_min: 0.23
Outstanding_Debt_max: 4998.07
Credit_Utilization_Ratio_min: 20.0
Credit_Utilization_Ratio_max: 43.06
Total_EMI_per_month_min: 0.0
Total_EMI_per_month_max: 1841.35
Amount_invested_monthly_min: 0.0
Amount_invested_monthly_max: 1977.33
Monthly_Balance_min: 0.01
Monthly_Balance_max: 1606.52


*Below are the cells for 

In [None]:
df['Annual_Income'] = df['Annual_Income'].astype(float)
percentile_95 = df['Annual_Income'].quantile(0.95)
df['Annual_Income'] = df['Annual_Income'].where(df['Annual_Income'] <= percentile_95, np.nan)

In [None]:
# Clean Age column
df['test_Age'] = df['Age'].astype(str)
df['test_Age'] = df['test_Age'].str.replace('\d+', '', regex=True)
print(df["test_Age"].value_counts())
df.drop("test_Age", axis=1, inplace=True)

df['Age'] = df['Age'].str.replace('\D+', '', regex=True)

df['test_Age'] = df['Age'].astype(str)
df['test_Age'] = df['test_Age'].str.replace('\d+', '', regex=True)
print(df["test_Age"].value_counts())
df.drop("test_Age", axis=1, inplace=True)

In [None]:
# Check Monthly_Inhand_Salary column
df['test_Monthly_Inhand_Salary'] = df['Monthly_Inhand_Salary'].astype(str)
df['test_Monthly_Inhand_Salary'] = df['test_Monthly_Inhand_Salary'].str.replace('\d+', '', regex=True)
print(df["test_Monthly_Inhand_Salary"].value_counts())
print(df[df["test_Monthly_Inhand_Salary"] == ""])
df.drop("test_Monthly_Inhand_Salary", axis=1, inplace=True)

# This column seems clean, just some missing values that we might be able to fill in later

In [None]:
# Check num_bank_accounts column
df['test_Num_Bank_Accounts'] = df['Num_Bank_Accounts'].astype(str)
df['test_Num_Bank_Accounts'] = df['test_Num_Bank_Accounts'].str.replace('\d+', '', regex=True)
print(df["test_Num_Bank_Accounts"].value_counts())
df.drop("test_Num_Bank_Accounts", axis=1, inplace=True)

# Replace negative values with NaN
df.loc[df['Num_Bank_Accounts'] < 0, 'Num_Bank_Accounts'] = np.nan

df['test_Num_Bank_Accounts'] = df['Num_Bank_Accounts'].astype(str)
df['test_Num_Bank_Accounts'] = df['test_Num_Bank_Accounts'].str.replace('\d+', '', regex=True)
print(df["test_Num_Bank_Accounts"].value_counts())
df.drop("test_Num_Bank_Accounts", axis=1, inplace=True)

In [None]:
# Check Num_Credit_Card column
df['test_Num_Credit_Card'] = df['Num_Credit_Card'].astype(str)
df['test_Num_Credit_Card'] = df['test_Num_Credit_Card'].str.replace('\d+', '', regex=True)
print(df["test_Num_Credit_Card"].value_counts())
df.drop("test_Num_Credit_Card", axis=1, inplace=True)

# No negative values
print(df["Num_Credit_Card"].max())

In [None]:
# Check Num_of_Loan column
df['test_Num_of_Loan'] = df['Num_of_Loan'].astype(str)
df['test_Num_of_Loan'] = df['test_Num_of_Loan'].str.replace('\d+', '', regex=True)
print(df["test_Num_of_Loan"].value_counts())
df.drop("test_Num_of_Loan", axis=1, inplace=True)

df['Num_of_Loan'] = df['Num_of_Loan'].str.replace('_', '')
df['Num_of_Loan'] = df['Num_of_Loan'].astype(float)
df.loc[df['Num_of_Loan'] < 0, 'Num_of_Loan'] = np.nan

df['test_Num_of_Loan'] = df['Num_of_Loan'].astype(str)
df['test_Num_of_Loan'] = df['test_Num_of_Loan'].str.replace('\d+', '', regex=True)
print(df["test_Num_of_Loan"].value_counts())
df.drop("test_Num_of_Loan", axis=1, inplace=True)

In [None]:
# Check Delay_from_due_date column
df['test_Delay_from_due_date'] = df['Delay_from_due_date'].astype(str)
df['test_Delay_from_due_date'] = df['test_Delay_from_due_date'].str.replace('\d+', '', regex=True)
print(df["test_Delay_from_due_date"].value_counts())
df.drop("test_Delay_from_due_date", axis=1, inplace=True)

#df['Delay_from_due_date'] = df['Delay_from_due_date'].astype(float)
#df.loc[df['Delay_from_due_date'] < 0, 'Delay_from_due_date'] = np.nan
# In this column, negative values might have a meaning

df['test_Delay_from_due_date'] = df['Delay_from_due_date'].astype(str)
df['test_Delay_from_due_date'] = df['test_Delay_from_due_date'].str.replace('\d+', '', regex=True)
print(df["test_Delay_from_due_date"].value_counts())
df.drop("test_Delay_from_due_date", axis=1, inplace=True)

In [None]:
# Check Num_of_Delayed_Payment column
df['test_Num_of_Delayed_Payment'] = df['Num_of_Delayed_Payment'].astype(str)
df['test_Num_of_Delayed_Payment'] = df['test_Num_of_Delayed_Payment'].str.replace('\d+', '', regex=True)
print(df["test_Num_of_Delayed_Payment"].value_counts())
df.drop("test_Num_of_Delayed_Payment", axis=1, inplace=True)

df['Num_of_Delayed_Payment'] = df['Num_of_Delayed_Payment'].str.replace('_', '')
df['Num_of_Delayed_Payment'] = df['Num_of_Delayed_Payment'].astype(float)
df.loc[df['Num_of_Delayed_Payment'] < 0, 'Num_of_Delayed_Payment'] = np.nan


df['test_Num_of_Delayed_Payment'] = df['Num_of_Delayed_Payment'].astype(str)
df['test_Num_of_Delayed_Payment'] = df['test_Num_of_Delayed_Payment'].str.replace('\d+', '', regex=True)
print(df["test_Num_of_Delayed_Payment"].value_counts())
df.drop("test_Num_of_Delayed_Payment", axis=1, inplace=True)

In [None]:
df['test_Changed_Credit_Limit'] = df['Changed_Credit_Limit'].astype(str)
df['test_Changed_Credit_Limit'] = df['test_Changed_Credit_Limit'].str.replace('\d+', '', regex=True)
print(df["test_Changed_Credit_Limit"].value_counts())
df.drop("test_Changed_Credit_Limit", axis=1, inplace=True)

df['Changed_Credit_Limit'] = df['Changed_Credit_Limit'].str.replace('_', '')
df["Changed_Credit_Limit"].replace([None, '', np.nan], np.nan, inplace=True)

df['test_Changed_Credit_Limit'] = df['Changed_Credit_Limit'].astype(str)
df['test_Changed_Credit_Limit'] = df['test_Changed_Credit_Limit'].str.replace('\d+', '', regex=True)
print(df["test_Changed_Credit_Limit"].value_counts())
df.drop("test_Changed_Credit_Limit", axis=1, inplace=True)

In [None]:
df['test_Outstanding_Debt'] = df['Outstanding_Debt'].astype(str)
df['test_Outstanding_Debt'] = df['test_Outstanding_Debt'].str.replace('\d+', '', regex=True)
print(df["test_Outstanding_Debt"].value_counts())
df.drop("test_Outstanding_Debt", axis=1, inplace=True)

df['Outstanding_Debt'] = df['Outstanding_Debt'].str.replace('.', ',')
df['Outstanding_Debt'] = df['Outstanding_Debt'].str.replace('_', '')

df['test_Outstanding_Debt'] = df['Outstanding_Debt'].astype(str)
df['test_Outstanding_Debt'] = df['test_Outstanding_Debt'].str.replace('\d+', '', regex=True)
print(df["test_Outstanding_Debt"].value_counts())
df.drop("test_Outstanding_Debt", axis=1, inplace=True)

In [None]:
df['test_Credit_Utilization_Ratio'] = df['Credit_Utilization_Ratio'].astype(str)
df['test_Credit_Utilization_Ratio'] = df['test_Credit_Utilization_Ratio'].str.replace('\d+', '', regex=True)
print(df["test_Credit_Utilization_Ratio"].value_counts())
df.drop("test_Credit_Utilization_Ratio", axis=1, inplace=True)

# "Empty" values are whole numbers, so that's fine

In [None]:
df['test_Payment_of_Min_Amount'] = df['Payment_of_Min_Amount'].astype(str)
df['test_Payment_of_Min_Amount'] = df['test_Payment_of_Min_Amount'].str.replace('\d+', '', regex=True)
print(df["test_Payment_of_Min_Amount"].value_counts())
df.drop("test_Payment_of_Min_Amount", axis=1, inplace=True)

df.loc[df['Payment_of_Min_Amount'] == "NM", 'Payment_of_Min_Amount'] = np.nan

df['test_Payment_of_Min_Amount'] = df['Payment_of_Min_Amount'].astype(str)
df['test_Payment_of_Min_Amount'] = df['test_Payment_of_Min_Amount'].str.replace('\d+', '', regex=True)
print(df["test_Payment_of_Min_Amount"].value_counts())
df.drop("test_Payment_of_Min_Amount", axis=1, inplace=True)

In [None]:
df['test_Amount_invested_monthly'] = df['Amount_invested_monthly'].astype(str)
df['test_Amount_invested_monthly'] = df['test_Amount_invested_monthly'].str.replace('\d+', '', regex=True)
print(df["test_Amount_invested_monthly"].value_counts())
print(df[df["test_Amount_invested_monthly"] =="____"])
df.drop("test_Amount_invested_monthly", axis=1, inplace=True)

# What do the __10000__ mean?

In [None]:
df['test_Payment_Behaviour'] = df['Payment_Behaviour'].astype(str)
df['test_Payment_Behaviour'] = df['test_Payment_Behaviour'].str.replace('\d+', '', regex=True)
print(df["test_Payment_Behaviour"].value_counts())
df.drop("test_Payment_Behaviour", axis=1, inplace=True)

df.loc[df['Payment_Behaviour'] == "!@9#%8", 'Payment_Behaviour'] = np.nan

df['test_Payment_Behaviour'] = df['Payment_Behaviour'].astype(str)
df['test_Payment_Behaviour'] = df['test_Payment_Behaviour'].str.replace('\d+', '', regex=True)
print(df["test_Payment_Behaviour"].value_counts())
df.drop("test_Payment_Behaviour", axis=1, inplace=True)

In [None]:
df['test_Monthly_Balance'] = df['Monthly_Balance'].astype(str)
df['test_Monthly_Balance'] = df['test_Monthly_Balance'].str.replace('\d+', '', regex=True)
print(df["test_Monthly_Balance"].value_counts())
df.drop("test_Monthly_Balance", axis=1, inplace=True)

df.loc[df['Monthly_Balance'] == "__-333333333333333333333333333__", 'Monthly_Balance'] = np.nan

df['test_Monthly_Balance'] = df['Monthly_Balance'].astype(str)
df['test_Monthly_Balance'] = df['test_Monthly_Balance'].str.replace('\d+', '', regex=True)
print(df["test_Monthly_Balance"].value_counts())
df.drop("test_Monthly_Balance", axis=1, inplace=True)