In [7]:
import pandas as pd
import numpy as np

In [8]:
# no of customers
nums_customers = 25000

customer_ids = list(range(1, nums_customers + 1))
df = pd.DataFrame({"customer_id": customer_ids})

df.to_csv("creditData.csv", index=False)
df.head()

Unnamed: 0,customer_id
0,1
1,2
2,3
3,4
4,5


In [9]:
#randomly generating gender
np.random.seed(42)
df['gender'] = np.random.choice(["male","female"], size=len(df))
df.head()

Unnamed: 0,customer_id,gender
0,1,male
1,2,female
2,3,male
3,4,male
4,5,male


In [11]:
# generating Location top 100 cities
cities = [
    "Mumbai", "Delhi", "Bangalore", "Hyderabad", "Ahmedabad", "Chennai", "Kolkata", "Surat", "Pune", "Jaipur",
    "Lucknow", "Kanpur", "Nagpur", "Visakhapatnam", "Indore", "Thane", "Bhopal", "Coimbatore", "Mysore", "Vadodara",
    "Ludhiana", "Agra", "Nashik", "Patna", "Ghaziabad", "Faridabad", "Rajkot", "Meerut", "Kochi", "Chandigarh",
    "Madurai", "Vijayawada", "Gurugram", "Noida", "Howrah", "Dhanbad", "Solapur", "Jalandhar", "Bhubaneswar", "Ranchi",
    "Amritsar", "Aurangabad", "Udaipur", "Gwalior", "Jammu", "Kolhapur", "Siliguri", "Aligarh", "Raipur", "Chandrapur",
    "Muzaffarpur", "Gaya", "Saharanpur", "Muzzafarnagar", "Raigarh", "Ambala", "Panipat", "Jamshedpur", "Bikaner",
    "Tirunelveli", "Srinagar", "Baroda", "Nagapattinam", "Kottayam", "Moradabad", "Kurnool", "Belgaum", "Jalna",
    "Satna", "Agartala", "Gurgaon", "Bareilly", "Bhavnagar", "Patiala", "Shirdi", "Hoshiarpur", "Nellore", "Durgapur",
    "Bhilwara", "Raebareli", "Jind", "Rishikesh", "Firozabad", "Kollam", "Tirupati", "Erode", "Dharwad", "Bilaspur",
    "Haldwani", "Amaravati", "Itanagar", "Dispur", "Panaji", "Gandhinagar", "Shimla", "Thiruvananthapuram", "Imphal",
    "Shillong", "Aizawl", "Kohima", "Gangtok", "Dehradun", "Port Blair", "Daman", "Kavaratti", "New Delhi", "Puducherry"
]

df["location"] = np.random.choice(cities, size=len(df))
df.head()

Unnamed: 0,customer_id,gender,location
0,1,male,Noida
1,2,female,Raebareli
2,3,male,Shimla
3,4,male,Haldwani
4,5,male,Port Blair


In [12]:
#age group with there % in the population 

age_groups = {
    (21,30): 0.55,
    (31,40): 0.18,
    (41,50): 0.12,
    (51,60): 0.08,
    (61,70): 0.07
}

total_id = len(df)
age_values = []

for (age_min, age_max), fraction in age_groups.items():
    count = int(total_id*fraction)
    ages = np.random.randint(age_min, age_max + 1, size = count)
    age_values.extend(ages)

np.random.shuffle(age_values)
df["age"] = age_values
df.head()

Unnamed: 0,customer_id,gender,location,age
0,1,male,Noida,28
1,2,female,Raebareli,26
2,3,male,Shimla,25
3,4,male,Haldwani,24
4,5,male,Port Blair,24


In [13]:
#monthly income respect to the age 
 
def income_for_dif_age(age):
    if 21 <= age <= 30:
        return np.random.randint(20000, 50001)
    elif 31 <= age <= 40:
        return np.random.randint(50000, 100001)
    elif 41 <= age <= 50:
        return np.random.randint(100000, 150001)
    elif 51 <= age <= 60:
        return np.random.randint(150000, 200001)
    elif 61 <= age <= 70:
        return np.random.randint(200000, 250001)
    else:
        return np.nan
    
df["monthly_income"] = df["age"].apply(income_for_dif_age)
df.head()


Unnamed: 0,customer_id,gender,location,age,monthly_income
0,1,male,Noida,28,29464
1,2,female,Raebareli,26,43560
2,3,male,Shimla,25,28564
3,4,male,Haldwani,24,36844
4,5,male,Port Blair,24,21061


In [14]:
# monthly EMI outflow 
#logic is stated in the readme its mostly depend upon the age and income  and do they follow the 30% EMI rule we took the data such that this much people follow the 30% people follow the 30% months income as EMI 

def gen_emi(row):
    age = row["age"]
    income = row["monthly_income"]

    if 21 <= age <=30:
        follow_rule = 0.20
    elif 31 <= age <=40:
        follow_rule = 0.30
    elif 41 <= age <=50:
        follow_rule = 0.40
    elif 51 <= age <=60:
        follow_rule = 0.50
    elif 61 <= age <=70:
        follow_rule = 0.60
    else:
        follow_rule = 0

    if np.random.rand() < follow_rule:
        emi = np.random.randint(0,0.3*income)
    else:
        emi = np.random.randint(0.3*income, 0.95*income)

    return emi if emi > 1000 else 0 

df["monthly_emi_outflow"] = df.apply(gen_emi, axis = 1)
df.head()



Unnamed: 0,customer_id,gender,location,age,monthly_income,monthly_emi_outflow
0,1,male,Noida,28,29464,3160
1,2,female,Raebareli,26,43560,33431
2,3,male,Shimla,25,28564,19916
3,4,male,Haldwani,24,36844,27352
4,5,male,Port Blair,24,21061,18494


In [15]:
#credit card limit generated by as its usually 2-3 times that of the income 

def gen_credit_limit(income):
    raw_limit = income*np.random.uniform(2,3)
    return int(np.round(raw_limit/10000)*10000)

df["total_credit_limit"] = df["monthly_income"].apply(gen_credit_limit)
df.head()

Unnamed: 0,customer_id,gender,location,age,monthly_income,monthly_emi_outflow,total_credit_limit
0,1,male,Noida,28,29464,3160,80000
1,2,female,Raebareli,26,43560,33431,130000
2,3,male,Shimla,25,28564,19916,90000
3,4,male,Haldwani,24,36844,27352,80000
4,5,male,Port Blair,24,21061,18494,50000


In [16]:
# current outstanding so we are calucating this with help of the EMI so we are multipling the EMI with 1-36 months most common periods of any loan Tenure. generally current outstanding is multiple of EMI thats what a small assumetion with some tweaks

#tenure probability for different age bracket short tenure (1-12) months 
tenure_prob= { 
    (21,30): 0.70,
    (31,40): 0.60,
    (41,50): 0.50,
    (51,60): 0.40,
    (61,70): 0.70
}

def gen_outstanding(row):
    age = row["age"]
    emi = row["monthly_emi_outflow"]
    if emi == 0:
        return 0
    
    p_short = None
    for (low, high), p in tenure_prob.items():
        if low <= age <=high:
            p_short = p
            break
    
    if np.random.rand() < p_short:
        T = np.random.randint(1,13)
    else:
        T = np.random.randint(13,25)
    
    interest_factor = np.random.choice([0.08,0.09,0.10])
    z = emi*(T + interest_factor)

    return int(z)

df["current_outstanding"] = df.apply(gen_outstanding, axis=1)
df.head()

Unnamed: 0,customer_id,gender,location,age,monthly_income,monthly_emi_outflow,total_credit_limit,current_outstanding
0,1,male,Noida,28,29464,3160,80000,12955
1,2,female,Raebareli,26,43560,33431,130000,69870
2,3,male,Shimla,25,28564,19916,90000,21907
3,4,male,Haldwani,24,36844,27352,80000,330412
4,5,male,Port Blair,24,21061,18494,50000,223592


In [17]:
# credit utilization ration logic is that certain age group people follow 30% credit card usage and then simple formula of ratio = total usage / total limit

util_follow = {
    (21,30): 0.45,
    (31,40): 0.55,
    (41,50): 0.60,
    (51,60): 0.70,
    (61,70): 0.80,
}

def gen_utlization(row):
    age = row["age"]
    limit = row["total_credit_limit"]
    if limit == 0:
        return 0.0
    # p_short = None
    # for (low, high), p in tenure_prob.items():
    #     if low <= age <=high:
    #         p_short = p
    #         break
    p_follow = None
    for (low, high), p in util_follow.items():
        if low <= age <= high:
            p_follow = p
            break
    
    if np.random.rand() < p_follow:
        max_use = int(0.3*limit)
        usage = np.random.randint(0, max_use + 1)
    else:
        low = int(0.3*limit)
        high = int(limit)
        usage = np.random.randint(low, high + 1)
    
    return round(usage/limit, 2)

df["credit_utilization_ratio"] = df.apply(gen_utlization, axis=1)
df.head()



Unnamed: 0,customer_id,gender,location,age,monthly_income,monthly_emi_outflow,total_credit_limit,current_outstanding,credit_utilization_ratio
0,1,male,Noida,28,29464,3160,80000,12955,0.14
1,2,female,Raebareli,26,43560,33431,130000,69870,0.76
2,3,male,Shimla,25,28564,19916,90000,21907,0.6
3,4,male,Haldwani,24,36844,27352,80000,330412,0.34
4,5,male,Port Blair,24,21061,18494,50000,223592,0.79


In [18]:
# no of open loans there are maninly 7 types of loans inculiding credit card loan so this logic depends on age bracket which age bracket can have max no of loan for there respective age most them follow the rule only some will not follow the rule 

max_loans_by_age = {
    (21,30): 3,
    (31,40): 4,
    (41,50): 3,
    (51,60): 2,
    (61,70): 1,
}

no_outlier_bracket = [(21,30), (61,70)]

def gen_no_of_loans(row):
    age = row["age"]
    max_allowed = None
    for (low, high), p in max_loans_by_age.items():
        if low <= age <= high:
            max_allowed = p
            break
    
    if any(low <=age <= high for (low, high) in no_outlier_bracket):
        return np.random.randint(0, max_allowed + 1)
    
    if np.random.rand() < 0.95:
        return np.random.randint(0, max_allowed + 1)
    else:
        return np.random.randint(max_allowed + 1, 8)
    
df["num_open_loans"] = df.apply(gen_no_of_loans, axis=1)
df.head()


Unnamed: 0,customer_id,gender,location,age,monthly_income,monthly_emi_outflow,total_credit_limit,current_outstanding,credit_utilization_ratio,num_open_loans
0,1,male,Noida,28,29464,3160,80000,12955,0.14,3
1,2,female,Raebareli,26,43560,33431,130000,69870,0.76,2
2,3,male,Shimla,25,28564,19916,90000,21907,0.6,3
3,4,male,Haldwani,24,36844,27352,80000,330412,0.34,3
4,5,male,Port Blair,24,21061,18494,50000,223592,0.79,3


In [19]:
# repayment history score same logic depends on age and divided into the part having differnt probability of having differnt score
repay_dist = {
    (21, 30): [(80, 90, 0.10), (70, 79, 0.40),(50, 69, 0.35),(30, 49, 0.15)   ],
    (31, 40): [(80, 90, 0.15),(70, 79, 0.45),(50, 69, 0.30),(30, 49, 0.10)],
    (41, 50): [(80, 90, 0.20),(70, 79, 0.45),(50, 69, 0.25),(30, 49, 0.10)],
    (51, 60): [(80, 90, 0.25),(70, 79, 0.50),(50, 69, 0.20),(30, 49, 0.05)],
    (61, 70): [(80, 90, 0.30),(70, 79, 0.55),(50, 69, 0.13),(30, 49, 0.02)]
}

def gen_repayment_score(row):
    age =row["age"]
    scores = None
    for (low, high), p in repay_dist.items():
        if low <= age <= high:
            scores = p
            break
    
    probs = [p for (_,_, p ) in scores]
    range_idx = np.random.choice(len(scores), p = probs)
    low, high, _ = scores[range_idx]
    return np.random.randint(low, high+1)

df["repayment_history_score"] = df.apply(gen_repayment_score, axis=1)
df.head()




Unnamed: 0,customer_id,gender,location,age,monthly_income,monthly_emi_outflow,total_credit_limit,current_outstanding,credit_utilization_ratio,num_open_loans,repayment_history_score
0,1,male,Noida,28,29464,3160,80000,12955,0.14,3,59
1,2,female,Raebareli,26,43560,33431,130000,69870,0.76,2,43
2,3,male,Shimla,25,28564,19916,90000,21907,0.6,3,50
3,4,male,Haldwani,24,36844,27352,80000,330412,0.34,3,55
4,5,male,Port Blair,24,21061,18494,50000,223592,0.79,3,89


In [20]:
# months since last defaults this data is for 3 years only that assumtion
default_distribution = {
    (21, 25): [(0, 0, 0.70), (0, 6, 0.30), (6, 24, 0), (24, 36, 0)], 
    (26, 30): [(0, 0, 0.70), (0, 6, 0.12), (6, 24, 0.10), (24, 36, 0.08)],
    (31, 40): [(0, 0, 0.75), (0, 6, 0.08), (6, 24, 0.09), (24, 36, 0.08)],
    (41, 50): [(0, 0, 0.82), (0, 6, 0.05), (6, 24, 0.07), (24, 36, 0.06)],
    (51, 60): [(0, 0, 0.88), (0, 6, 0.03), (6, 24, 0.05), (24, 36, 0.04)],
    (61, 70): [(0, 0, 0.92), (0, 6, 0.02), (6, 24, 0.03), (24, 36, 0.03)],
}

def gen_default(row):
    age = row["age"]
    time_ranges = None
    for (low, high), p in default_distribution.items():
        if low <= age <= high:
            time_ranges = p 
            break

    ranges  = [(low, high, prob) for low, high, prob in time_ranges]

    range_idx = np.random.choice(len(ranges), p=[prob for _, _, prob in ranges]) 
    low, high, _ = ranges[range_idx] 

    return np.random.randint(low, high + 1)

df["months_since_last_default"] = df.apply(gen_default, axis=1) 
df.head()


Unnamed: 0,customer_id,gender,location,age,monthly_income,monthly_emi_outflow,total_credit_limit,current_outstanding,credit_utilization_ratio,num_open_loans,repayment_history_score,months_since_last_default
0,1,male,Noida,28,29464,3160,80000,12955,0.14,3,59,31
1,2,female,Raebareli,26,43560,33431,130000,69870,0.76,2,43,29
2,3,male,Shimla,25,28564,19916,90000,21907,0.6,3,50,0
3,4,male,Haldwani,24,36844,27352,80000,330412,0.34,3,55,0
4,5,male,Port Blair,24,21061,18494,50000,223592,0.79,3,89,0


In [21]:
# day past due logic same age wise

dpd_distribution = {
    (21, 30): [(0, 0, 0.65), (15, 30, 0.25), (60, 90, 0.10)],  
    (31, 40): [(0, 0, 0.75), (15, 30, 0.18), (60, 90, 0.07)], 
    (41, 50): [(0, 0, 0.83), (15, 30, 0.13), (60, 90, 0.04)],      
    (51, 60): [(0, 0, 0.90), (15, 30, 0.08), (60, 90, 0.02)],     
    (61, 70): [(0, 0, 0.95), (15, 30, 0.04), (60, 90, 0.01)],
}

def gen_dpd(row):
    age = row["age"]
    date_ranges = None
    for (low, high), p in dpd_distribution.items():
        if low <= age <= high:
            date_ranges = p
            break
    
    ranges = [(low, high, prob) for low, high, prob in date_ranges]

    range_idx = np.random.choice(len(ranges), p=[prob for _,_, prob in ranges])
    low, high, _ = ranges[range_idx]

    return np.random.randint(low, high + 1)

df["dpd_last_3_months"] = df.apply(gen_dpd, axis=1)
df.head()


Unnamed: 0,customer_id,gender,location,age,monthly_income,monthly_emi_outflow,total_credit_limit,current_outstanding,credit_utilization_ratio,num_open_loans,repayment_history_score,months_since_last_default,dpd_last_3_months
0,1,male,Noida,28,29464,3160,80000,12955,0.14,3,59,31,17
1,2,female,Raebareli,26,43560,33431,130000,69870,0.76,2,43,29,0
2,3,male,Shimla,25,28564,19916,90000,21907,0.6,3,50,0,0
3,4,male,Haldwani,24,36844,27352,80000,330412,0.34,3,55,0,0
4,5,male,Port Blair,24,21061,18494,50000,223592,0.79,3,89,0,81


In [22]:
# no of hard inquires

inquiry_distribution = {
    (21, 30): {
        "Excellent": (0, 1),
        "Good": (1, 2),
        "Fair": (2, 4),
        "Poor": (4, 8)
    },
    (31, 40): {
        "Excellent": (0, 1),
        "Good": (1, 2),
        "Fair": (2, 3),
        "Poor": (3, 6)
    },
    (41, 50): {
        "Excellent": (0, 1),
        "Good": (1, 2),
        "Fair": (1, 2),
        "Poor": (2, 4)
    },
    (51, 60): {
        "Excellent": (0, 1),
        "Good": (0, 1),
        "Fair": (1, 2),
        "Poor": (2, 3)
    },
    (61, 70): {
        "Excellent": (0, 1),
        "Good": (0, 1),
        "Fair": (1, 1),
        "Poor": (1, 2)
    }
}

def gen_inquiries(row):
    age = row["age"]
    repayment_score = row["repayment_history_score"]

    if repayment_score >= 85:
        score_category = "Excellent"
    elif repayment_score >= 70:
        score_category = "Good"
    elif repayment_score >= 50:
        score_category = "Fair"
    else:
        score_category = "Poor"

    ranges = None
    for (min_age, max_age), p in inquiry_distribution.items():
        if min_age <= age <= max_age:
            ranges = p
            break
    
    inquiries_range = ranges[score_category]
    num_inquiries = np.random.randint(inquiries_range[0], inquiries_range[1] + 1)

    return num_inquiries

df["num_hard_inquiries_last_6m"] = df.apply(gen_inquiries, axis=1)
df.head()


Unnamed: 0,customer_id,gender,location,age,monthly_income,monthly_emi_outflow,total_credit_limit,current_outstanding,credit_utilization_ratio,num_open_loans,repayment_history_score,months_since_last_default,dpd_last_3_months,num_hard_inquiries_last_6m
0,1,male,Noida,28,29464,3160,80000,12955,0.14,3,59,31,17,3
1,2,female,Raebareli,26,43560,33431,130000,69870,0.76,2,43,29,0,6
2,3,male,Shimla,25,28564,19916,90000,21907,0.6,3,50,0,0,4
3,4,male,Haldwani,24,36844,27352,80000,330412,0.34,3,55,0,0,4
4,5,male,Port Blair,24,21061,18494,50000,223592,0.79,3,89,0,81,0


In [23]:
# credit card usage distribution of 3 months 

usage_distribution = {
    (21, 30): {
        "Excellent": (0.10, 0.25),
        "Good": (0.20, 0.35),
        "Fair": (0.30, 0.50),
        "Poor": (0.50, 0.80),
    },
    (31, 40): {
        "Excellent": (0.08, 0.20),
        "Good": (0.15, 0.30),
        "Fair": (0.25, 0.45),
        "Poor": (0.40, 0.75),
    },
    (41, 50): {
        "Excellent": (0.05, 0.15),
        "Good": (0.10, 0.25),
        "Fair": (0.20, 0.40),
        "Poor": (0.35, 0.65),
    },
    (51, 60): {
        "Excellent": (0.03, 0.12),
        "Good": (0.08, 0.20),
        "Fair": (0.15, 0.30),
        "Poor": (0.25, 0.50),
    },
    (61, 70): {
        "Excellent": (0.02, 0.10),
        "Good": (0.05, 0.15),
        "Fair": (0.10, 0.25),
        "Poor": (0.15, 0.35),
    },
}

def gen_credit_card_usage(row):
    age = row["age"]
    repayment_score = row["repayment_history_score"]
    limit = row["total_credit_limit"]

    if repayment_score >= 85:
        score_category = "Excellent"
    elif repayment_score >= 70:
        score_category = "Good"
    elif repayment_score >= 50:
        score_category = "Fair"
    else:
        score_category = "Poor"

    ranges = None
    for (low, high), p in usage_distribution.items():
        if low <= age <= high:
            ranges = p
            break
    usage_range = ranges[score_category]

    usage_percentage = np.random.uniform(usage_range[0], usage_range[1])

    return round(usage_percentage * limit , 2)

df["recent_credit_card_usage"] = df.apply(gen_credit_card_usage, axis=1)
df.head()

Unnamed: 0,customer_id,gender,location,age,monthly_income,monthly_emi_outflow,total_credit_limit,current_outstanding,credit_utilization_ratio,num_open_loans,repayment_history_score,months_since_last_default,dpd_last_3_months,num_hard_inquiries_last_6m,recent_credit_card_usage
0,1,male,Noida,28,29464,3160,80000,12955,0.14,3,59,31,17,3,29882.13
1,2,female,Raebareli,26,43560,33431,130000,69870,0.76,2,43,29,0,6,75670.64
2,3,male,Shimla,25,28564,19916,90000,21907,0.6,3,50,0,0,4,41283.37
3,4,male,Haldwani,24,36844,27352,80000,330412,0.34,3,55,0,0,4,26126.75
4,5,male,Port Blair,24,21061,18494,50000,223592,0.79,3,89,0,81,0,7798.58


In [24]:
 # recent loan disbursed amount

loan_distribution = {
    (21, 30): {
        "Excellent": (200000, 500000),
        "Good": (100000, 300000),
        "Fair": (50000, 200000),
        "Poor": (0, 50000),
    },
    (31, 40): {
        "Excellent": (500_000, 1200000),
        "Good": (200000, 700000),
        "Fair": (100000, 400000),
        "Poor": (0, 100000),
    },
    (41, 50): {
        "Excellent": (1000000, 1800000),
        "Good": (500000, 1200000),
        "Fair": (200000, 600000),
        "Poor": (0, 200000),
    },
    (51, 60): {
        "Excellent": (800000, 1500000),
        "Good": (400000, 1000000),
        "Fair": (100000, 500000),
        "Poor": (0, 100000),
    },
    (61, 70): {
        "Excellent": (500000, 1000000),
        "Good": (200000, 600000),
        "Fair": (50000, 300000),
        "Poor": (0, 50000),
    },
}

def gen_recent_loan(row):
    age = row["age"]
    income = row["monthly_income"]
    repayment_score = row["repayment_history_score"]

    # Define the repayment score categories
    if repayment_score >= 85:
        score_category = "Excellent"
    elif repayment_score >= 70:
        score_category = "Good"
    elif repayment_score >= 50:
        score_category = "Fair"
    else:
        score_category = "Poor"

    ranges = None
    for (low, high), p in loan_distribution.items():
        if low <= age <= high:
            ranges = p
            break
    
    loan_range = ranges[score_category]
    
    loan_amount = np.random.randint(loan_range[0], loan_range[1] + 1)
    return loan_amount

df["recent_loan_disbursed_amount"] = df.apply(gen_recent_loan, axis=1)
df.head()

Unnamed: 0,customer_id,gender,location,age,monthly_income,monthly_emi_outflow,total_credit_limit,current_outstanding,credit_utilization_ratio,num_open_loans,repayment_history_score,months_since_last_default,dpd_last_3_months,num_hard_inquiries_last_6m,recent_credit_card_usage,recent_loan_disbursed_amount
0,1,male,Noida,28,29464,3160,80000,12955,0.14,3,59,31,17,3,29882.13,97194
1,2,female,Raebareli,26,43560,33431,130000,69870,0.76,2,43,29,0,6,75670.64,6799
2,3,male,Shimla,25,28564,19916,90000,21907,0.6,3,50,0,0,4,41283.37,135404
3,4,male,Haldwani,24,36844,27352,80000,330412,0.34,3,55,0,0,4,26126.75,96988
4,5,male,Port Blair,24,21061,18494,50000,223592,0.79,3,89,0,81,0,7798.58,241208


In [None]:
def gen_credit_score_movement(row):
    dpd = row["dpd_last_3_months"]
    months_since_last_default = row["months_since_last_default"]
    credit_utilization = row["credit_utilization_ratio"]
    num_hard_inquiries = row["num_hard_inquiries_last_6m"]
    repayment_score = row["repayment_history_score"]
    emi_income_ratio = row["monthly_emi_outflow"] / row["monthly_income"]
    recent_loan_disbursed = row["recent_loan_disbursed_amount"] / row["monthly_income"]
    recent_credit_card_usage = row["recent_credit_card_usage"] / row["total_credit_limit"]

    # negative indicators 
    negative_indicators = 0
    if dpd >= 30: negative_indicators += 1
    if months_since_last_default <= 3: negative_indicators += 1  
    if credit_utilization >= 0.75: negative_indicators += 1
    if num_hard_inquiries >= 5: negative_indicators += 1
    if repayment_score < 50: negative_indicators += 1
    if emi_income_ratio >= 0.7: negative_indicators += 1
    if recent_loan_disbursed >= 7: negative_indicators += 1
    if recent_credit_card_usage >= 0.75: negative_indicators += 1

    # positive indicators 
    positive_indicators = 0
    if dpd == 0: positive_indicators += 1
    if months_since_last_default > 24 or months_since_last_default == 0: positive_indicators += 1
    if credit_utilization < 0.20: positive_indicators += 1
    if num_hard_inquiries == 0: positive_indicators += 1
    if repayment_score >= 85: positive_indicators += 1
    if emi_income_ratio < 0.20: positive_indicators += 1
    if recent_loan_disbursed <= 1: positive_indicators += 1
    if recent_credit_card_usage < 0.10: positive_indicators += 1

    if negative_indicators >= 4:  
        return "Decrease" 
    elif positive_indicators >= 4:  
        return "Increase"  
    else:
        return "Stable"  

# Apply to DataFrame
df["target_credit_score_movement"] = df.apply(gen_credit_score_movement, axis=1)
df.head()



Unnamed: 0,customer_id,gender,location,age,monthly_income,monthly_emi_outflow,total_credit_limit,current_outstanding,credit_utilization_ratio,num_open_loans,repayment_history_score,months_since_last_default,dpd_last_3_months,num_hard_inquiries_last_6m,recent_credit_card_usage,recent_loan_disbursed_amount,target_credit_score_movement
0,1,male,Noida,28,29464,3160,80000,12955,0.14,3,59,31,17,3,29882.13,97194,Stable
1,2,female,Raebareli,26,43560,33431,130000,69870,0.76,2,43,29,0,6,75670.64,6799,Decrease
2,3,male,Shimla,25,28564,19916,90000,21907,0.6,3,50,0,0,4,41283.37,135404,Stable
3,4,male,Haldwani,24,36844,27352,80000,330412,0.34,3,55,0,0,4,26126.75,96988,Stable
4,5,male,Port Blair,24,21061,18494,50000,223592,0.79,3,89,0,81,0,7798.58,241208,Decrease


In [28]:
# Frequency and percentage distribution
movement_frequency = df["target_credit_score_movement"].value_counts()
print("Frequency distribution of target_credit_score_movement:")
print(movement_frequency)

# Calculate percentage distribution
percentage_distribution = (movement_frequency / len(df)) * 100
print("\nPercentage distribution of credit score movement:")
print(percentage_distribution)

Frequency distribution of target_credit_score_movement:
target_credit_score_movement
Stable      19838
Increase     3775
Decrease     1387
Name: count, dtype: int64

Percentage distribution of credit score movement:
target_credit_score_movement
Stable      79.352
Increase    15.100
Decrease     5.548
Name: count, dtype: float64


In [29]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   customer_id                   25000 non-null  int64  
 1   gender                        25000 non-null  object 
 2   location                      25000 non-null  object 
 3   age                           25000 non-null  int32  
 4   monthly_income                25000 non-null  int64  
 5   monthly_emi_outflow           25000 non-null  int64  
 6   total_credit_limit            25000 non-null  int64  
 7   current_outstanding           25000 non-null  int64  
 8   credit_utilization_ratio      25000 non-null  float64
 9   num_open_loans                25000 non-null  int64  
 10  repayment_history_score       25000 non-null  int64  
 11  months_since_last_default     25000 non-null  int64  
 12  dpd_last_3_months             25000 non-null  int64  
 13  n

Unnamed: 0,customer_id,age,monthly_income,monthly_emi_outflow,total_credit_limit,current_outstanding,credit_utilization_ratio,num_open_loans,repayment_history_score,months_since_last_default,dpd_last_3_months,num_hard_inquiries_last_6m,recent_credit_card_usage,recent_loan_disbursed_amount
count,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0
mean,12500.5,34.90868,77479.92168,33943.82692,193841.6,385756.1,0.383339,1.55656,67.4684,2.71912,9.90444,2.09512,46031.849199,322760.1
std,7217.022701,13.080963,60723.950654,33835.226231,154764.518517,514816.8,0.291337,1.310324,13.954589,7.28382,20.46343,1.612457,29963.727034,296856.1
min,1.0,21.0,20001.0,0.0,40000.0,0.0,0.0,0.0,30.0,0.0,0.0,0.0,4854.09,43.0
25%,6250.75,25.0,33652.0,13219.5,80000.0,81448.5,0.14,0.0,58.0,0.0,0.0,1.0,25635.94,125220.5
50%,12500.5,30.0,47272.0,23396.5,120000.0,205786.0,0.28,1.0,71.0,0.0,0.0,2.0,37994.47,219839.5
75%,18750.25,42.0,108004.5,40511.75,270000.0,459900.5,0.62,3.0,77.0,0.0,16.0,3.0,58057.225,439666.5
max,25000.0,70.0,249938.0,235960.0,750000.0,5426055.0,1.0,7.0,90.0,36.0,90.0,8.0,269306.09,1794301.0


In [30]:
df

Unnamed: 0,customer_id,gender,location,age,monthly_income,monthly_emi_outflow,total_credit_limit,current_outstanding,credit_utilization_ratio,num_open_loans,repayment_history_score,months_since_last_default,dpd_last_3_months,num_hard_inquiries_last_6m,recent_credit_card_usage,recent_loan_disbursed_amount,target_credit_score_movement
0,1,male,Noida,28,29464,3160,80000,12955,0.14,3,59,31,17,3,29882.13,97194,Stable
1,2,female,Raebareli,26,43560,33431,130000,69870,0.76,2,43,29,0,6,75670.64,6799,Decrease
2,3,male,Shimla,25,28564,19916,90000,21907,0.60,3,50,0,0,4,41283.37,135404,Stable
3,4,male,Haldwani,24,36844,27352,80000,330412,0.34,3,55,0,0,4,26126.75,96988,Stable
4,5,male,Port Blair,24,21061,18494,50000,223592,0.79,3,89,0,81,0,7798.58,241208,Decrease
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,24996,female,Udaipur,21,25828,9044,70000,18992,0.44,3,53,0,0,4,24673.60,77404,Stable
24996,24997,female,Raebareli,26,43584,31762,120000,66382,0.04,0,49,0,0,7,72424.21,42044,Decrease
24997,24998,male,Dhanbad,35,79688,42175,160000,88567,0.17,4,59,0,0,3,51634.39,205401,Stable
24998,24999,female,Jalna,23,42776,32701,110000,35644,0.25,3,42,0,19,8,67294.63,29600,Decrease


In [31]:
df.to_csv("creditData.csv", index=False)