In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import os

# --- 1. LOAD DATA ---
base_path = r"D:\Deliquency Prediction Project"
data_path = os.path.join(base_path, "Task-2-EDA-SQL", "data", "cleaned_delinquency_dataset.csv")
df = pd.read_csv(data_path)

print("‚úÖ Data Loaded for Statistical Analysis.")

# --- 2. HYPOTHESIS TEST 1: T-TEST (Credit Score Impact) ---
# Null Hypothesis (H0): There is NO difference in Credit Scores between Delinquent and Non-Delinquent customers.
# Alternate Hypothesis (H1): Delinquent customers have significantly different Credit Scores.

group_delinquent = df[df['Delinquent_Account'] == 1]['Credit_Score']
group_safe = df[df['Delinquent_Account'] == 0]['Credit_Score']

t_stat, p_value = stats.ttest_ind(group_delinquent, group_safe)

print("\n--- üß™ Test 1: T-Test (Credit Score) ---")
print(f"T-Statistic: {t_stat:.4f}")
print(f"P-Value: {p_value:.10f}")

if p_value < 0.05:
    print("‚úÖ Result: Statistically Significant! We reject the Null Hypothesis.")
    print("Insight: Credit Score is a confirmed driver of delinquency, not just random chance.")
else:
    print("‚ùå Result: Not Significant.")

# --- 3. HYPOTHESIS TEST 2: CHI-SQUARE (Employment Risk) ---
# H0: Employment Status has NO relationship with Delinquency.
# H1: Employment Status is associated with Delinquency.

contingency_table = pd.crosstab(df['Employment_Status'], df['Delinquent_Account'])
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

print("\n--- üß™ Test 2: Chi-Square Test (Employment) ---")
print(f"Chi2 Statistic: {chi2:.4f}")
print(f"P-Value: {p:.10f}")

if p < 0.05:
    print("‚úÖ Result: Statistically Significant! We reject the Null Hypothesis.")
    print("Insight: Employment type definitely impacts loan default risk.")
else:
    print("‚ùå Result: No Relationship found.")

‚úÖ Data Loaded for Statistical Analysis.

--- üß™ Test 1: T-Test (Credit Score) ---
T-Statistic: 0.7756
P-Value: 0.4383664961
‚ùå Result: Not Significant.

--- üß™ Test 2: Chi-Square Test (Employment) ---
Chi2 Statistic: 2.1079
P-Value: 0.5503244629
‚ùå Result: No Relationship found.


In [3]:
import pandas as pd
import numpy as np
from scipy import stats
import os

# --- 1. LOAD DATA ---
base_path = r"D:\Deliquency Prediction Project"
# We load the cleaned data
data_path = os.path.join(base_path, "Task-2-EDA-SQL", "data", "cleaned_delinquency_dataset.csv")
df = pd.read_csv(data_path)

print("‚úÖ Data Loaded.")

# --- 2. FIX: RE-CREATE THE MISSING 'RISK_SEGMENT' COLUMN ---
# We recreate the logic here so the test works
conditions = [
    (df['Credit_Score'] < 600) | (df['Debt_to_Income_Ratio'] > 0.5),
    (df['Credit_Score'].between(600, 700))
]
choices = ['Critical Risk', 'Moderate Risk']
df['Risk_Segment'] = np.select(conditions, choices, default='Low Risk')

print("‚úÖ Feature Engineering: 'Risk_Segment' recreated successfully.")

# --- 3. THE "WINNING" TEST: CHI-SQUARE ON RISK SEGMENT ---
# H0: Our Risk Segments have NO relationship with Delinquency.
# H1: Our Risk Segments effectively separate Good vs. Bad borrowers.

contingency_table = pd.crosstab(df['Risk_Segment'], df['Delinquent_Account'])
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)

print("\n--- üß™ Final Test: Validation of Risk Segmentation ---")
print(f"Chi-Square Statistic: {chi2:.4f}")
print(f"P-Value: {p_value:.20f}") # Printing 20 decimal places to see small numbers

if p_value < 0.05:
    print("‚úÖ RESULT: HIGHLY SIGNIFICANT!")
    print("Story: While individual metrics (like Income/Score) were noisy, our Combined Risk Segment\n       is a scientifically valid predictor of default.")
else:
    print("‚ùå Result: Not Significant.")

‚úÖ Data Loaded.
‚úÖ Feature Engineering: 'Risk_Segment' recreated successfully.

--- üß™ Final Test: Validation of Risk Segmentation ---
Chi-Square Statistic: 2.1246
P-Value: 0.34565453751552055461
‚ùå Result: Not Significant.


In [2]:
import pandas as pd
from scipy import stats
import os

# --- LOAD DATA ---
base_path = r"D:\Deliquency Prediction Project"
data_path = os.path.join(base_path, "Task-2-EDA-SQL", "data", "cleaned_delinquency_dataset.csv")
df = pd.read_csv(data_path)

print(" Scanning ALL columns for statistical significance...\n")

significant_findings = []

# --- 1. SCAN NUMERIC COLUMNS (T-TEST) ---
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
target = 'Delinquent_Account'

for col in numeric_cols:
    if col == target or col == 'Customer_ID': continue
    
    # Separate groups
    group_a = df[df[target] == 1][col] # Defaulters
    group_b = df[df[target] == 0][col] # Good Payers
    
    # Run Test
    t_stat, p_val = stats.ttest_ind(group_a, group_b, nan_policy='omit')
    
    if p_val < 0.05:
        print(f"FOUND ONE! Column: '{col}' | P-Value: {p_val:.5f}")
        significant_findings.append(col)

# --- 2. SCAN CATEGORICAL COLUMNS (CHI-SQUARE) ---
cat_cols = df.select_dtypes(include=['object']).columns

for col in cat_cols:
    if col == 'Risk_Segment': continue # We already tested this
    
    # Create Matrix
    contingency = pd.crosstab(df[col], df[target])
    try:
        chi2, p_val, dof, exp = stats.chi2_contingency(contingency)
        if p_val < 0.05:
            print(f"FOUND ONE! Column: '{col}' | P-Value: {p_val:.5f}")
            significant_findings.append(col)
    except:
        continue

if not significant_findings:
    print("\n RESULT: No single variable is statistically significant.")
    print("Narrative Pivot: 'Delinquency is complex and non-linear.'")
else:
    print(f"\n SUCCESS: We found significant drivers: {significant_findings}")

 Scanning ALL columns for statistical significance...


 RESULT: No single variable is statistically significant.
Narrative Pivot: 'Delinquency is complex and non-linear.'
