In [3]:
import pandas as pd 
from statsmodels.stats import weightstats as stests
import numpy as np
from scipy.stats import norm
from scipy.stats import chi2_contingency
from statsmodels.stats.weightstats import ztest as ztest
from scipy.stats import ttest_ind
from statsmodels.stats import weightstats
from scipy.stats import kruskal
from scipy.stats import spearmanr
from scipy.stats import f_oneway

In [4]:
# Loading the data
df = pd.read_csv("After_dropping_corr_features2.csv")

In [5]:
# Null Hypothesis 1
print("Null Hypothsis 1: \n")
print("H0: Customers with a higher 'Num_of_Loan' are more likely to have a 'Poor' Credit_Score.")

Null Hypothsis 1: 

H0: Customers with a higher 'Num_of_Loan' are more likely to have a 'Poor' Credit_Score.


In [6]:
# Split according to hypothesis
num_of_loan_poor = df[df['Credit_Score_encoded'] == 0]['Num_of_Loan']
num_of_loan_not_poor = df[df['Credit_Score_encoded'] != 0]['Num_of_Loan']

# Calculating the mean and the standard deviation
mean_poor = np.mean(num_of_loan_poor)
std_poor = np.std(num_of_loan_poor)
mean_not_poor = np.mean(num_of_loan_not_poor)
std_not_poor = np.std(num_of_loan_not_poor)

# Z-testing
z_score_1, p_value_1 = ztest(num_of_loan_poor, num_of_loan_not_poor, alternative = 'larger')

# Define alapha
alpha = 0.05

# Results
print(f"Z-Score: {z_score_1}")
print(f"P-Value: {p_value_1}")
print("=======================================================")

if p_value_1 < alpha:
    print("Reject the null hypothesis. Num_of_Loan isn't necessarily associated with poor Credit_Score")
else:
    print("Fail to reject the null hypothesis.")


Z-Score: -82.82552300970706
P-Value: 1.0
Fail to reject the null hypothesis.


In [10]:
# Null Hypothesis 2
print("Null Hypothsis 2: \n")
print("H0: The customer's Occupation doesn't affect the Credit_Score")

Null Hypothsis 2: 

H0: The customer's Occupation doesn't affect the Credit_Score


In [11]:
# Both of these columns are categorical
# Using Chi-squared

# Create a contingency table
contingency_table = pd.crosstab(df['Occupation_encoded'], df['Credit_Score_encoded'])

# Perform the Chi-squared test
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

# Define alapha
alpha = 0.05

# Results
print(f"Chi-squared value: {chi2}")
print(f"P-value: {p_value}")
print("=======================================================")

if p_value < alpha:
    print("Reject the null hypothesis. There is evidence of association.")
else:
    print("Fail to reject the null hypothesis.")

Chi-squared value: 180.68588839512742
P-value: 2.907103747503745e-24
Reject the null hypothesis. There is evidence of a significant association.


In [19]:
# Hypothesis 3
print("Null Hypothsis 3: \n")
print("H0: Monthly_Balanace doesn't affect the Credit_Score")

Null Hypothsis 3: 

H0: Monthly_Balanace doesn't affect the Credit_Score


In [20]:
# Continuous and categorical
# Perform one-way ANOVA
result = f_oneway(df[df['Credit_Score_encoded'] == 0]['Monthly_Balance'],
                  df[df['Credit_Score_encoded'] == 1]['Monthly_Balance'],
                  df[df['Credit_Score_encoded'] == 2]['Monthly_Balance'])

# Define alpha
alpha = 0.05

# Results 
print(f"F-statistic: {result.statistic}")
print(f"P-value: {result.pvalue}")
print("=======================================================")

if result.pvalue < alpha:
    print("Reject the null hypothesis. There is evidence that Monthly_Balance affects Credit_Score.")
else:
    print("Fail to reject the null hypothesis.")

F-statistic: 2036.2367996378368
P-value: 0.0
Reject the null hypothesis. There is evidence that Monthly_Balance affects Credit_Score.


In [14]:
# Hypothesis 4
print("Null Hypothsis 4: \n")
print("H0: An increase in Credit_Utilization_Ratio equals an increase in Credit_Score")

Null Hypothsis 4: 

H0: An increase in Credit_Utilization_Ratio equals an increase in Credit_Score


In [15]:
# Define a threshold for "lower" credit utilization
lower_utilization_threshold = 30

# Splitting according to hypothesis
lower_utilization_df = df[df["Credit_Utilization_Ratio"] <= lower_utilization_threshold]
higher_utilization_df = df[df["Credit_Utilization_Ratio"] > lower_utilization_threshold]

# # T-test to compare credit scores for each of the groups
t_stat, p_value = ttest_ind(lower_utilization_df["Credit_Score_encoded"], higher_utilization_df["Credit_Score_encoded"])

# Define alpha
alpha = 0.05

# Results
print(f"T-Statistic: {t_stat}")
print(f"P-Value: {p_value}")
print("=======================================================")

if p_value < alpha:
    print("Reject the null hypothesis. There is no evidence that Credit_Utilization_Ratio equals and Credit_Score are linearly associated.")
else:
    print("Fail to reject the null hypothesis.")

T-Statistic: 2.5481448325911895
P-Value: 0.010831231887083641
Reject the null hypothesis. There is no evidence that Credit_Utilization_Ratio equals and Credit_Score are linearly associated.


In [17]:
# Hypothesis 5
print("Null Hypothsis 5: \n")
print("H0: An increase in Num_Bank_Accounts equals an increase in Credit_Score")

Null Hypothsis 5: 

H0: An increase in Num_Bank_Accounts equals an increase in Credit_Score


In [23]:
# Continuous and categorical
# Perform Kruskal-Wallis test
result = kruskal(df[df['Credit_Score_encoded'] == 0]['Num_Bank_Accounts'],
                 df[df['Credit_Score_encoded'] == 1]['Num_Bank_Accounts'],
                 df[df['Credit_Score_encoded'] == 2]['Num_Bank_Accounts'])

# Define alpha
alpha = 0.05

# Results
print(f"H-statistic: {result.statistic}")
print(f"P-value: {result.pvalue}")
print("=======================================================")

if result.pvalue < alpha:
    print("Reject the null hypothesis. There is evidence that the distribution of Num_Bank_Accounts is different across Credit_Score groups.")
else:
    print("Fail to reject the null hypothesis. There is no evidence that the distribution of Num_Bank_Accounts differs across Credit_Score groups.")

H-statistic: 15711.418589102303
P-value: 0.0
Reject the null hypothesis. There is evidence that the distribution of Num_Bank_Accounts is different across Credit_Score groups.


In [30]:
# Hypothesis 6
print("Null Hypothsis 6: \n")
print("H0: An increase in Num_of_Loan equals a decrease in Credit_Score")

Null Hypothsis 6: 

H0: An increase in Num_of_Loan equals a decrease in Credit_Score


In [31]:
# Finding Spearman's rank correlation coefficient and p-value
correlation, p_value = spearmanr(df['Num_of_Loan'], df['Credit_Score_encoded'])

# Define alpha
alpha = 0.05

# Results
print(f"Spearman's Rank Correlation Coefficient: {correlation:.4f}")
print(f"P-value: {p_value:.4f}")
print("=======================================================")

if p_value < alpha:
    print("Reject the null hypothesis. There is evidence of a significant relationship between the number of loans and the credit score.")
else:
    print("Fail to reject the null hypothesis. There is no evidence of a significant relationship between the number of loans and the credit score.")

Spearman's Rank Correlation Coefficient: 0.0123
P-value: 0.0001
Reject the null hypothesis. There is evidence of a significant relationship between the number of loans and the credit score.
