In [None]:
import pandas as pd
import numpy as np

# Simulate two continuous variables with a linear dependence: Y = 0.5 * X + noise.
np.random.seed(42)  # fixed seed so the printed matrix is reproducible
x = np.random.normal(loc=50, scale=10, size=1000)
# Signal sd is 0.5 * 10 = 5 and noise sd is 2, so the theoretical Pearson
# correlation is 5 / sqrt(5**2 + 2**2), roughly 0.93.
y = x * 0.5 + np.random.normal(loc=0, scale=2, size=1000)

# Collect both variables in one frame so DataFrame.corr() can be applied.
df = pd.DataFrame({'X': x, 'Y': y})

# Pairwise correlation matrix of the two columns.
correlation = df.corr()

print("Correlation Matrix:")
print(correlation)


In [None]:
import os
import sys

# Directory holding the local project sources. Set the EXAMPLE_UV_SRC
# environment variable to point somewhere else; the default is the original
# machine-specific location so existing setups keep working.
SRC_DIR = os.environ.get(
    'EXAMPLE_UV_SRC',
    '/Users/sandip/Documents/PythonProjects/PythonApps/example_uv/src',
)

# Only append once so re-running the cell does not pile up duplicate entries.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Draw 100 points where y is about 2 * x with unit-variance noise.
np.random.seed(42)  # reproducible draws
x = np.random.normal(loc=50, scale=10, size=100)
y = x * 2 + np.random.normal(loc=0, scale=1, size=100)

# X is stored as -3 * x, so X and Y move in opposite directions.
df = pd.DataFrame({'X': -3* x, 'Y': y})

# Numeric check of the relationship shown in the figure.
correlation = df.corr()
print("Correlation Matrix:")
print(correlation)

# Scatter plot drawn through the object-oriented Axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df['X'], df['Y'], color='blue', alpha=0.7)
ax.set_title("Scatter Plot of X vs Y", fontsize=14)
ax.set_xlabel("X", fontsize=12)
ax.set_ylabel("Y", fontsize=12)
ax.grid(True)
plt.show()


In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score




In [None]:
# 1. Draw X1 ~ N(0, 1), then build X2 as a weighted mix of X1 and independent
#    standard-normal noise. The weights 0.8 and sqrt(1 - 0.8**2) keep X2 at
#    unit variance with a population correlation of 0.8 to X1.
np.random.seed(42)
n = 1000
X1 = np.random.normal(0, 1, n)
independent_noise = np.random.normal(0, 1, n)
X2 = 0.8 * X1 + np.sqrt(1 - 0.8**2) * independent_noise


In [None]:

# 2. Binary target: Bernoulli draws whose success probability is the logistic
#    transform of a score that weights X1 and X2 equally.
logit = 1.5 * X1 + 1.5 * X2
prob = 1 / (1 + np.exp(-logit))
Y = np.random.binomial(1, prob)

# 3. Pearson correlation of the two raw predictors.
corr_X1_X2 = pearsonr(X1, X2)[0]
print("Correlation between X1 and X2:", round(corr_X1_X2, 3))

# scikit-learn expects 2-D feature matrices, so reshape each predictor once.
X1_column = X1.reshape(-1, 1)
X2_column = X2.reshape(-1, 1)

# 4. Intercept-free logistic regression of Y on X1 alone.
model_x1 = LogisticRegression(fit_intercept=False, solver='lbfgs')
model_x1.fit(X1_column, Y)
pred1 = model_x1.predict_proba(X1_column)[:, 1]

# 5. Intercept-free logistic regression of Y on X2 alone.
model_x2 = LogisticRegression(fit_intercept=False, solver='lbfgs')
model_x2.fit(X2_column, Y)
pred2 = model_x2.predict_proba(X2_column)[:, 1]

# 6. How strongly do the two single-feature models agree?
corr_pred1_pred2 = pearsonr(pred1, pred2)[0]
print("Correlation between predicted Y1 and Y2:", round(corr_pred1_pred2, 3))

# 7. Side-by-side comparison of input and prediction correlation.
print(f"\nOriginal X1-X2 Corr: {corr_X1_X2:.3f}")
print(f"Predicted Y1-Y2 Corr: {corr_pred1_pred2:.3f}")

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seed for reproducibility
np.random.seed(42)

# Sample size
n = 1000


def simulate_prediction_correlation(corr, n):
    """Simulate correlated predictors and compare two single-feature models.

    Draws ``n`` samples of (X1, X2) from a bivariate normal with unit
    variances and covariance ``corr``, derives a binary target from both
    predictors, fits one intercept-free logistic regression per predictor
    and correlates the two vectors of predicted probabilities.

    The function consumes NumPy's global random state (one
    ``multivariate_normal`` draw followed by one ``normal`` draw), so results
    depend on the seed set before the call.

    Parameters
    ----------
    corr : float
        Off-diagonal entry of the 2x2 covariance matrix, i.e. the target
        correlation between X1 and X2.
    n : int
        Number of samples to draw.

    Returns
    -------
    dict
        ``X1``, ``X2``, ``Y``, ``model1``, ``model2``, ``pred1``, ``pred2``,
        ``direct_corr`` (sample Pearson r of X1 and X2) and ``pred_corr``
        (Pearson r of the two prediction vectors).
    """
    cov_matrix = np.array([[1, corr], [corr, 1]])
    X = np.random.multivariate_normal(mean=[0, 0], cov=cov_matrix, size=n)
    X1 = X[:, 0]
    X2 = X[:, 1]

    # Realised (sample) correlation, which differs slightly from `corr`.
    direct_corr, _ = pearsonr(X1, X2)

    # Both predictors enter the latent score so each has predictive power.
    # Thresholding prob at 0.5 is the same as testing z > 0: the label is a
    # deterministic function of the noisy score, not a Bernoulli draw.
    z = 0.7*X1 + 0.7*X2 + np.random.normal(0, 1, n)
    prob = 1 / (1 + np.exp(-z))  # logistic function
    Y = (prob > 0.5).astype(int)

    # One model per predictor, both without intercept.
    model1 = LogisticRegression(fit_intercept=False, max_iter=1000)
    model1.fit(X1.reshape(-1, 1), Y)
    model2 = LogisticRegression(fit_intercept=False, max_iter=1000)
    model2.fit(X2.reshape(-1, 1), Y)

    # Predicted probability of the positive class from each model.
    pred1 = model1.predict_proba(X1.reshape(-1, 1))[:, 1]
    pred2 = model2.predict_proba(X2.reshape(-1, 1))[:, 1]
    pred_corr, _ = pearsonr(pred1, pred2)

    return {
        'X1': X1, 'X2': X2, 'Y': Y,
        'model1': model1, 'model2': model2,
        'pred1': pred1, 'pred2': pred2,
        'direct_corr': direct_corr, 'pred_corr': pred_corr,
    }


# Sweep the input correlation over 0.0, 0.1, ..., 1.0. np.linspace fixes the
# number of grid points explicitly instead of relying on how a float step
# rounds against the stop value, as np.arange(0, 1.1, 0.1) does.
correlations = np.linspace(0, 1, 11)
results = []

for corr in correlations:
    sim = simulate_prediction_correlation(corr, n)
    results.append({
        'Input_Correlation': sim['direct_corr'],
        'Prediction_Correlation': sim['pred_corr']
    })

# Create dataframe from results
df_results = pd.DataFrame(results)

# Plot the relationship
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Input_Correlation', y='Prediction_Correlation', data=df_results)
plt.plot([0, 1], [0, 1], 'r--', label='y=x line')
plt.title('Correlation: Input Variables vs. Logistic Regression Predictions')
plt.xlabel('Correlation between X1 and X2')
plt.ylabel('Correlation between predictions from X1 and X2 models')
plt.grid(True)
plt.legend()

# Print the results table
print("Results Table:")
print(df_results.round(4))

# Visualize one specific case with scatterplots: a moderate correlation (0.5).
corr = 0.5
sim = simulate_prediction_correlation(corr, n)

# Unpack into the notebook namespace so the values stay inspectable.
X1, X2, Y = sim['X1'], sim['X2'], sim['Y']
model1, model2 = sim['model1'], sim['model2']
pred1, pred2 = sim['pred1'], sim['pred2']
direct_corr, pred_corr = sim['direct_corr'], sim['pred_corr']

# Create visualization plots
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Plot original variables, coloured by the binary target
axes[0].scatter(X1, X2, c=Y, cmap='coolwarm', alpha=0.6)
axes[0].set_title(f'X1 vs X2 (correlation: {direct_corr:.4f})')
axes[0].set_xlabel('X1')
axes[0].set_ylabel('X2')
axes[0].grid(True)

# Plot predictions of the two single-feature models against each other
scatter = axes[1].scatter(pred1, pred2, c=Y, cmap='coolwarm', alpha=0.6)
axes[1].set_title(f'Prediction from X1 vs Prediction from X2 (correlation: {pred_corr:.4f})')
axes[1].set_xlabel('Prediction from X1')
axes[1].set_ylabel('Prediction from X2')
axes[1].grid(True)

# Add colorbar
cbar = plt.colorbar(scatter, ax=axes[1])
cbar.set_label('Y value (0 or 1)')

plt.tight_layout()
plt.show()

# Print the correlation values for this specific case
print(f"\nFor input correlation of {direct_corr:.4f}:")
print(f"Correlation between model predictions: {pred_corr:.4f}")
print(f"Ratio (prediction corr / input corr): {pred_corr/direct_corr:.4f}")

In [None]:
import numpy as np
import pandas as pd
import shap
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Step 1: Simulate two nearly collinear features; the target depends on
# feature_A only.
np.random.seed(42)
n_samples = 1000
feature_A = np.random.normal(0, 1, n_samples)
# feature_B is feature_A plus sd-0.1 noise, so the population correlation is
# 1 / sqrt(1 + 0.1**2), roughly 0.995 (not 0.9).
feature_B = feature_A + np.random.normal(0, 0.1, n_samples)
y = 2 * feature_A + np.random.normal(0, 0.1, n_samples)

X = pd.DataFrame({'feature_A': feature_A, 'feature_B': feature_B})
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 2: Boosted-tree regressor through XGBoost's scikit-learn wrapper.
model = xgb.XGBRegressor(
    objective="reg:squarederror",
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
)
model.fit(X_train, y_train)

# Step 3: Tree-specific SHAP explainer, evaluated on the held-out rows.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Step 4: Bar chart of per-feature SHAP importance.
shap.summary_plot(shap_values, X_test, plot_type="bar")

# Step 5: Mean absolute SHAP value per feature, largest first.
mean_abs_shap = pd.DataFrame({
    "Feature": X_test.columns,
    "Mean |SHAP|": np.mean(np.abs(shap_values), axis=0),
})
mean_abs_shap = mean_abs_shap.sort_values(by="Mean |SHAP|", ascending=False)

print(mean_abs_shap)


In [None]:
import numpy as np
import pandas as pd
import shap
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Step 1: Simulate two nearly collinear features; this time the target
# depends on both of them with equal weight.
np.random.seed(42)
n_samples = 10000
feature_A = np.random.normal(0, 1, n_samples)
# feature_B is feature_A plus sd-0.1 noise, so the population correlation is
# 1 / sqrt(1 + 0.1**2), roughly 0.995 (not 0.9).
feature_B = feature_A + np.random.normal(0, 0.1, n_samples)
y =  2* feature_A+ 2* feature_B + np.random.normal(0, 0.1, n_samples)

X = pd.DataFrame({'feature_A': feature_A, 'feature_B': feature_B})
# Same two features plus the target, kept together for later inspection.
XY = X.assign(y=y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 2: Boosted-tree regressor through XGBoost's scikit-learn wrapper.
model = xgb.XGBRegressor(
    objective="reg:squarederror",
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
)
model.fit(X_train, y_train)

# Step 3: Tree-specific SHAP explainer. Note that the SHAP values are
# computed on the TRAINING rows here, not on X_test.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)




In [None]:
# Histogram of the simulated target column, using 30 bins.
XY['y'].hist(bins=30)


In [None]:
# Correlations among the raw features and the target.
print(XY.corr())

# shap_values was computed on X_train in the cell that fits the model, so the
# column labels are taken from X_train to keep the frame tied to the data it
# actually describes (one row per training sample).
shap_df = pd.DataFrame(shap_values, columns=X_train.columns)

# Correlation between the per-sample SHAP contributions of the features.
print(shap_df.corr())


In [None]:

# Step 5: Mean absolute SHAP value per feature, largest first.
# shap_values was computed on X_train, so the feature labels come from
# X_train as well.
mean_abs_shap = pd.DataFrame({
    "Feature": X_train.columns,
    "Mean |SHAP|": np.abs(shap_values).mean(axis=0)
}).sort_values(by="Mean |SHAP|", ascending=False)

print(mean_abs_shap)

In [None]:
# Step 4: SHAP summary plot.
# The SHAP matrix was computed on X_train, so it must be paired with X_train:
# passing X_test would hand the plot a feature matrix with a different number
# of rows than the SHAP values.
shap.summary_plot(shap_values, X_train, plot_type="bar")


In [None]:
import pandas as pd
from sklearn import datasets
from dython.nominal import associations,cluster_correlations

# Load the iris measurements together with their integer class labels.
iris = datasets.load_iris()

# Turn the integer classes into strings ('C0', 'C1', ...) so that the
# associations method can recognise the column as categorical automatically.
target = [f'C{label}' for label in iris.target]

# Assemble features and string target into a single frame.
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.DataFrame({'target': target})
df = pd.concat([X, y], axis=1)



In [None]:
import pandas as pd
from sklearn import datasets
from dython.nominal import associations,cluster_correlations

In [None]:
# Plot feature associations


In [None]:
# Compute the associations for df with plotting disabled, then pass the
# 'corr' entry of the result to dython's cluster_correlations().
cluster_correlations(associations(df,plot=False)['corr'])

In [None]:
# Display the 'corr' entry of the associations() result without plotting.
associations(df,plot=False)['corr']

In [None]:
# Correlation matrix of the remaining columns once 'target' is dropped
# (assumes df is still the iris frame built earlier — TODO confirm run order).
df.drop(columns=['target']).corr()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Set random seed for reproducibility
np.random.seed(42)

# Sample size
n = 1000

# Create dataframe
df = pd.DataFrame()

# --------- NOMINAL VARIABLES ---------

# Nominal variable 1: City (with 5 categories)
cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
df['city'] = np.random.choice(cities, size=n)

# Nominal variable 2: Product category (correlated with city)
# Create a conditional probability matrix to establish correlation
# Each row represents a city, each column a product category
product_categories = ['Electronics', 'Clothing', 'Food', 'Books', 'Furniture']
city_product_probs = {
    'New York':    [0.5, 0.2, 0.1, 0.15, 0.05],  # New York has high electronics sales
    'Los Angeles': [0.1, 0.6, 0.1, 0.1, 0.1],    # LA has high clothing sales
    'Chicago':     [0.1, 0.1, 0.6, 0.1, 0.1],    # Chicago has high food sales
    'Houston':     [0.1, 0.1, 0.1, 0.6, 0.1],    # Houston has high book sales
    'Phoenix':     [0.1, 0.1, 0.1, 0.1, 0.6]     # Phoenix has high furniture sales
}

# Generate product categories based on city (creates correlation)
df['product_category'] = df.apply(
    lambda row: np.random.choice(product_categories, p=city_product_probs[row['city']]), 
    axis=1
)

# Nominal variable 3: Payment method (not correlated with other variables)
payment_methods = ['Credit Card', 'Debit Card', 'PayPal', 'Cash', 'Gift Card']
df['payment_method'] = np.random.choice(payment_methods, size=n)

# --------- ORDINAL VARIABLES ---------

# Ordinal variable 1: Customer satisfaction (1-5 scale)
satisfaction_levels = [1, 2, 3, 4, 5]
df['satisfaction'] = np.random.choice(satisfaction_levels, size=n)

# Ordinal variable 2: Education level
education_levels = ['High School', 'Associate', 'Bachelor', 'Master', 'PhD']
df['education'] = np.random.choice(education_levels, size=n)

# --------- CONTINUOUS VARIABLES ---------

# Continuous variable 1: Age (normal distribution)
df['age'] = np.random.normal(40, 15, n)
df['age'] = df['age'].clip(18, 90).round(0)  # Clip to reasonable age range

# Continuous variable 2: Income - correlated with education level
# Define base income means for each education level
education_income_means = {
    'High School': 30000,
    'Associate': 45000,
    'Bachelor': 70000,
    'Master': 100000,
    'PhD': 130000
}

# Generate income based on education with some noise
df['income'] = df.apply(
    lambda row: np.random.normal(education_income_means[row['education']], 
                                education_income_means[row['education']] * 0.2),
    axis=1
)
df['income'] = df['income'].clip(20000, 200000).round(-2)  # Clip and round

# Continuous variable 3: Purchase amount (continuous, in dollars)
df['purchase_amount'] = np.random.gamma(5, 20, n)
df['purchase_amount'] = df['purchase_amount'].round(2)

# Continuous variable 4: Daily screen time (hours) - strongly correlated with product_category
base_screen_time = {
    'Electronics': 8,
    'Clothing': 3,
    'Food': 2,
    'Books': 4,
    'Furniture': 3
}
df['screen_time'] = df.apply(
    lambda row: np.random.normal(base_screen_time[row['product_category']], 1.5),
    axis=1
)
df['screen_time'] = df['screen_time'].clip(0, 16).round(1)  # Clip to reasonable range

# Continuous variable 5: Credit score - not correlated with other variables
df['credit_score'] = np.random.normal(700, 100, n)
df['credit_score'] = df['credit_score'].clip(300, 850).round(0)

# --------- DISCRETE VARIABLES ---------

# Discrete variable 1: Number of purchases (counts) - 
# related to income (continuous) non-linearly
df['num_purchases'] = df.apply(
    lambda row: np.random.poisson(np.log(row['income']/10000)),
    axis=1
)

# Discrete variable 2: Household size (counts) - 
# Not strongly correlated with other variables
df['household_size'] = np.random.poisson(3, n)
df['household_size'] = df['household_size'].clip(1, 10)

# --------- NON-LINEAR RELATIONSHIPS ---------

# Create a non-linear relationship between age and health_score
df['health_score'] = 100 - 0.01 * (df['age'] - 30) ** 2 + np.random.normal(0, 10, n)
df['health_score'] = df['health_score'].clip(0, 100).round(1)

# Create a non-linear relationship between income and vacation_days
df['vacation_days'] = 5 + 20 * (1 - np.exp(-df['income']/50000)) + np.random.normal(0, 3, n)
df['vacation_days'] = df['vacation_days'].clip(0, 45).round(0)

# --------- FEATURE ENCODING FOR CORRELATION ANALYSIS ---------

# Rank used to turn the education labels into an ordered numeric scale.
education_mapping = {'High School': 1, 'Associate': 2, 'Bachelor': 3, 'Master': 4, 'PhD': 5}

# Build the encoded frame in one chain: get_dummies returns a new frame with
# the nominal columns one-hot encoded (df itself is left untouched), and
# assign() appends the ordinal education rank as the last column.
df_encoded = pd.get_dummies(
    df, columns=['city', 'product_category', 'payment_method']
).assign(
    education_encoded=lambda frame: frame['education'].map(education_mapping)
)

# Columns that enter the correlation matrix: the numeric variables plus the
# ordinal encodings. The one-hot columns are not included.
numeric_cols = [
    'age', 'income', 'purchase_amount', 'screen_time', 'credit_score',
    'num_purchases', 'household_size', 'health_score', 'vacation_days',
    'satisfaction', 'education_encoded',
]

correlation_matrix = df_encoded.loc[:, numeric_cols].corr()

# Display first 10 rows of the dataframe
print("Sample Dataframe (First 10 rows):")
print(df.head(10))

# Display the correlation matrix
print("\nCorrelation Matrix:")
print(correlation_matrix.round(2))

# Explain the variable types and relationships
print("\nVariable Types:")
print("Nominal Variables: city, product_category, payment_method")
print("Ordinal Variables: satisfaction, education")
print("Continuous Variables: age, income, purchase_amount, screen_time, credit_score, health_score")
print("Discrete Variables: num_purchases, household_size, vacation_days")

print("\nVariable Relationships:")
print("1. Nominal-Nominal correlations:")
print("   - city and product_category: Strong correlation (conditional probability)")
print("   - city and payment_method: No correlation (independent)")

print("\n2. Continuous-Nominal correlations:")
print("   - screen_time and product_category: Strong correlation")
print("   - credit_score and product_category: No correlation")

print("\n3. Continuous-Continuous correlations:")
print("   - income and education_encoded: Strong positive correlation")
print("   - credit_score and age: No correlation")

print("\n4. Linear Relationships:")
print("   - income and education_encoded: Linear positive relationship")
print("   - screen_time and product_category: Linear relationship within categories")

print("\n5. Non-Linear Relationships:")
print("   - age and health_score: Quadratic relationship (optimal health around age 30)")
print("   - income and vacation_days: Exponential relationship (diminishing returns)")
print("   - income and num_purchases: Logarithmic relationship")

# Return the final dataframe
df

In [None]:
df.dtypes

In [None]:
# Run dython's associations() on the mixed-type frame with default settings.
# Relies on the `from dython.nominal import associations` in an earlier cell.
associations(df)

In [None]:
# Same call with num_num_assoc='spearman' — presumably this switches the
# numeric-numeric pairs to Spearman rank correlation; confirm in dython docs.
associations(df,num_num_assoc='spearman')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Set random seed for reproducibility
np.random.seed(42)

# Sample size
n = 500

# Create a dataframe with three variables having non-linear relationships
df = pd.DataFrame()

# Variable 1: x - Independent variable (uniform distribution)
df['x'] = np.linspace(0, 10, n)

# Variable 2: y1 - Quadratic relationship with x
# Formula: y1 = 2x² - 5x + 3 + noise
df['y1'] = 2 * df['x']**2 - 5 * df['x'] + 3 + np.random.normal(0, 5, n)

# Variable 3: y2 - Sinusoidal relationship with x
# Formula: y2 = 10 * sin(x) + noise
df['y2'] = 10 * np.sin(df['x']) + np.random.normal(0, 2, n)

# Display the first 10 rows of the dataframe
print("Three-Variable Dataframe with Non-Linear Relationships (First 10 rows):")
print(df.head(10))

# Visualize both relationships as two stacked panels in one figure.
fig, (ax_quadratic, ax_sinusoidal) = plt.subplots(2, 1, figsize=(12, 8))

# Upper panel: quadratic relationship
ax_quadratic.scatter(df['x'], df['y1'], alpha=0.6)
ax_quadratic.set_title('Quadratic Relationship: y1 = 2x² - 5x + 3 + noise')
ax_quadratic.set_xlabel('x')
ax_quadratic.set_ylabel('y1')

# Lower panel: sinusoidal relationship
ax_sinusoidal.scatter(df['x'], df['y2'], alpha=0.6)
ax_sinusoidal.set_title('Sinusoidal Relationship: y2 = 10 * sin(x) + noise')
ax_sinusoidal.set_xlabel('x')
ax_sinusoidal.set_ylabel('y2')

fig.tight_layout()

# Calculate Pearson correlation (linear correlation)
correlation_matrix = df.corr()
print("\nPearson Correlation Matrix (Linear Correlation):")
print(correlation_matrix.round(3))

# Spearman rank correlation (can detect monotonic non-linear relationships)
spearman_corr = df.corr(method='spearman')
print("\nSpearman Rank Correlation Matrix:")
print(spearman_corr.round(3))

# Non-linear association measured with the correlation ratio (eta)
def correlation_ratio(categories, measurements):
    """Return the correlation ratio (eta) of a numeric variable given groups.

    Eta is the square root of the between-group sum of squares divided by
    the total sum of squares around the size-weighted grand mean.

    Parameters
    ----------
    categories : array-like
        Group label of every observation.
    measurements : array-like
        Numeric value of every observation, aligned with ``categories``.

    Returns
    -------
    float
        0.0 when the between-group sum of squares is exactly zero, otherwise
        ``sqrt(between / total)``.
    """
    labels = np.asarray(categories)
    values = np.asarray(measurements)

    # Integer code per observation; codes run from 0 to n_groups - 1.
    codes, _ = pd.factorize(labels)
    n_groups = np.max(codes) + 1

    group_means = np.zeros(n_groups)
    group_sizes = np.zeros(n_groups)
    for code in range(n_groups):
        members = values[codes == code]
        group_sizes[code] = len(members)
        group_means[code] = np.average(members)

    # Grand mean as the size-weighted mean of the group means.
    grand_mean = np.sum(group_means * group_sizes) / np.sum(group_sizes)
    between_ss = np.sum(group_sizes * (group_means - grand_mean) ** 2)
    total_ss = np.sum((values - grand_mean) ** 2)

    if between_ss == 0:
        return 0.0
    return np.sqrt(between_ss / total_ss)

# Bin x into categories for the correlation ratio
df['x_binned'] = pd.cut(df['x'], bins=10)

# Calculate correlation ratios
eta_x_y1 = correlation_ratio(df['x_binned'], df['y1'])
eta_x_y2 = correlation_ratio(df['x_binned'], df['y2'])

print("\nNon-linear Association (Correlation Ratio η):")
print(f"Correlation ratio between x and y1: {eta_x_y1:.3f}")
print(f"Correlation ratio between x and y2: {eta_x_y2:.3f}")

# Printed interpretation of the x / y1 relationship. The parabola
# 2x² - 5x + 3 has its minimum at x = 5 / 4 = 1.25, and x only spans [0, 10],
# so the data are mostly increasing and Pearson r is high here. The text
# therefore must not claim a U shape or that Pearson fails on this pair.
print("\nDescription of Non-Linear Relationships:")
print("1. x and y1: Quadratic relationship (parabola)")
print("   - Formula: y1 = 2x² - 5x + 3 + random noise")
print("   - The parabola's minimum is at x = 1.25, so over x in [0, 10] y1 is mostly increasing")
print("   - Pearson correlation is therefore high here, but it cannot reveal the curvature")
print("   - Correlation ratio (η) shows a strong non-linear association")

print("\n2. x and y2: Sinusoidal relationship")
print("   - Formula: y2 = 10 * sin(x) + random noise")
print("   - The relationship follows a wave pattern")
print("   - Pearson correlation is close to zero despite strong relationship")
print("   - Correlation ratio (η) better captures this non-linear pattern")

print("\nImportant Note:")
print("Standard correlation coefficients (Pearson) mainly detect linear relationships.")
print("The correlation ratio (η) and other non-linear measures are better at identifying non-linear patterns.")
print("Visual inspection through scatter plots is often the best way to identify non-linear relationships.")

# Return the dataframe
df

In [None]:
associations(df,num_num_assoc='spearman')

In [None]:
df.dtypes

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import shap

# Set random seed for reproducibility
np.random.seed(42)

# Sample size
n = 50000

# Create dataframe
df = pd.DataFrame()

###########################################
# PART 1: CREATING THE VARIABLES
###########################################

# Create categorical variables that are correlated and explain target similarly
# Categorical pair 1: correlated with each other and similarly predictive of target
occupations = ['Engineer', 'Teacher', 'Doctor', 'Artist', 'Accountant']
education_levels = ['Bachelor', 'Master', 'PhD', 'High School', 'Associate']

# Create base probabilities for occupation
occupation_probs = [0.25, 0.2, 0.15, 0.2, 0.2]
df['occupation'] = np.random.choice(occupations, size=n, p=occupation_probs)

# Create education with correlation to occupation
occupation_edu_probs = {
    'Engineer': [0.6, 0.3, 0.05, 0.025, 0.025],  # Engineers mostly have Bachelor's or Master's
    'Teacher': [0.4, 0.4, 0.1, 0.05, 0.05],      # Teachers mostly have Bachelor's or Master's
    'Doctor': [0.1, 0.3, 0.55, 0.025, 0.025],    # Doctors mostly have PhD or Master's
    'Artist': [0.3, 0.1, 0.05, 0.5, 0.05],       # Artists mostly have Bachelor's or High School
    'Accountant': [0.5, 0.2, 0.05, 0.05, 0.2]    # Accountants mostly have Bachelor's or Associate
}

df['education'] = df.apply(
    lambda row: np.random.choice(education_levels, p=occupation_edu_probs[row['occupation']]), 
    axis=1
)

# Create categorical variables that are uncorrelated but both predict target well
# Categorical pair 2: uncorrelated with each other but both predictive of target
device_types = ['Smartphone', 'Laptop', 'Tablet', 'Desktop', 'Smart Watch']
payment_methods = ['Credit Card', 'PayPal', 'Debit Card', 'Cash', 'Bank Transfer']

# Assign these independently (no correlation between them)
df['device_type'] = np.random.choice(device_types, size=n)
df['payment_method'] = np.random.choice(payment_methods, size=n)

# Create continuous variables that are correlated and explain target similarly
# Continuous pair 1: correlated with each other and similarly predictive of target
df['income'] = np.random.normal(70000, 20000, n)
# Create savings based on income (correlated)
df['savings'] = 0.2 * df['income'] + np.random.normal(5000, 10000, n)
df['savings'] = np.maximum(0, df['savings'])  # No negative savings

# Create continuous variables that are uncorrelated but both predict target well
# Continuous pair 2: uncorrelated with each other but both predictive of target
df['age'] = np.random.normal(40, 12, n)
df['age'] = np.clip(df['age'], 18, 80)

df['purchase_frequency'] = np.random.gamma(5, 2, n)  # Independent from age

###########################################
# PART 2: CREATING THE TARGET VARIABLE
###########################################

# Now create a binary target variable that's influenced by all these predictors
# But with different relationships (linear and non-linear)

# Helper that turns a success probability into a random 0/1 outcome
def prob_to_binary(p):
    """Draw a binary outcome that equals 1 with probability ``p``.

    Delegates to ``np.random.binomial`` with a single trial, so it consumes
    NumPy's global random state.
    """
    single_trial = 1
    outcome = np.random.binomial(single_trial, p)
    return outcome

# Create target variable through a complex function of all predictors

# 1. Effects from correlated categorical variables (occupation and education)
occupation_target_probs = {
    'Engineer': 0.7,
    'Teacher': 0.6,
    'Doctor': 0.75,
    'Artist': 0.4,
    'Accountant': 0.55
}

education_target_probs = {
    'Bachelor': 0.6,
    'Master': 0.7,
    'PhD': 0.8,
    'High School': 0.4,
    'Associate': 0.5
}

cat_corr_effect = df.apply(
    lambda row: (occupation_target_probs[row['occupation']] + 
                education_target_probs[row['education']]) / 2,
    axis=1
)

# 2. Effects from uncorrelated categorical variables (device and payment)
device_target_probs = {
    'Smartphone': 0.6,
    'Laptop': 0.7,
    'Tablet': 0.5,
    'Desktop': 0.65,
    'Smart Watch': 0.45
}

payment_target_probs = {
    'Credit Card': 0.65,
    'PayPal': 0.7,
    'Debit Card': 0.55,
    'Cash': 0.4,
    'Bank Transfer': 0.6
}

cat_uncorr_effect = df.apply(
    lambda row: 0.4 * device_target_probs[row['device_type']] + 
                0.6 * payment_target_probs[row['payment_method']],
    axis=1
)

# 3. Effects from correlated continuous variables (income and savings)
# Linear effect
cont_corr_effect = 0.3 * ((df['income'] - df['income'].min()) / 
                         (df['income'].max() - df['income'].min())) + \
                  0.2 * ((df['savings'] - df['savings'].min()) / 
                         (df['savings'].max() - df['savings'].min()))

# 4. Effects from uncorrelated continuous variables (age and purchase frequency)
# Non-linear effect for age (quadratic - middle age has highest probability)
age_normalized = (df['age'] - df['age'].min()) / (df['age'].max() - df['age'].min())
age_effect = -4 * (age_normalized - 0.5)**2 + 1  # Peaks at age_normalized = 0.5

# Linear effect for purchase frequency
pf_normalized = (df['purchase_frequency'] - df['purchase_frequency'].min()) / \
                (df['purchase_frequency'].max() - df['purchase_frequency'].min())
pf_effect = 0.8 * pf_normalized

cont_uncorr_effect = 0.6 * age_effect + 0.4 * pf_effect

# Combine all effects and convert to probability
# Weight the effects from different variable groups
total_effect = (0.25 * cat_corr_effect + 
                0.25 * cat_uncorr_effect + 
                0.25 * cont_corr_effect + 
                0.25 * cont_uncorr_effect)

# Convert to binary target
df['target_prob'] = total_effect
df['target'] = df['target_prob'].apply(prob_to_binary)

###########################################
# PART 3: ANALYSIS OF RELATIONSHIPS
###########################################

# Encode categorical variables for correlation analysis
df_encoded = df.copy()
label_encoders = {}

for col in ['occupation', 'education', 'device_type', 'payment_method']:
    le = LabelEncoder()
    df_encoded[f'{col}_encoded'] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

# Calculate correlations
corr_columns = ['occupation_encoded', 'education_encoded', 'device_type_encoded', 
                'payment_method_encoded', 'income', 'savings', 'age', 
                'purchase_frequency', 'target']

corr_matrix = df_encoded[corr_columns].corr()

print("Correlation Matrix:")
print(corr_matrix.round(3))

# Verify our intended relationships
# NOTE(review): the *_encoded columns are LabelEncoder integer codes. For
# nominal categories the numeric order of such codes carries no meaning, so
# the correlation printed for checks 1 and 2 is not a reliable measure of
# association between the categorical variables — consider a categorical
# measure (e.g. dython's associations(), used elsewhere in this notebook).
print("\nVerifying intended relationships:")

# 1. Correlation between occupation and education (should be high)
print(f"Correlation between occupation and education: {corr_matrix.loc['occupation_encoded', 'education_encoded']:.3f}")

# 2. Correlation between device_type and payment_method (should be low)
print(f"Correlation between device_type and payment_method: {corr_matrix.loc['device_type_encoded', 'payment_method_encoded']:.3f}")

# 3. Correlation between income and savings (should be high)
# NOTE(review): savings = 0.2 * income + N(5000, 10000) with income sd 20000,
# so before the zero floor the expected correlation is about
# 4000 / sqrt(4000**2 + 10000**2) ~= 0.37 — moderate rather than high.
print(f"Correlation between income and savings: {corr_matrix.loc['income', 'savings']:.3f}")

# 4. Correlation between age and purchase_frequency (should be low)
print(f"Correlation between age and purchase_frequency: {corr_matrix.loc['age', 'purchase_frequency']:.3f}")

# 5. Correlation with target for all variables
print("\nCorrelation with target:")
for col in corr_columns[:-1]:  # Exclude target itself
    print(f"{col}: {corr_matrix.loc[col, 'target']:.3f}")

# Check target distribution
print(f"\nTarget distribution: {df['target'].value_counts(normalize=True)}")

###########################################
# PART 4: TRAIN-TEST SPLIT AND XGBOOST MODEL
###########################################

# Prepare data for modeling
X = df_encoded.drop(['target', 'target_prob'], axis=1)
y = df_encoded['target']

# We'll need to one-hot encode the categorical variables for XGBoost
X_processed = pd.get_dummies(X, columns=['occupation', 'education', 'device_type', 'payment_method'], drop_first=True)

# Remove the encoded columns that we created just for correlation analysis
X_processed = X_processed.drop(['occupation_encoded', 'education_encoded', 
                              'device_type_encoded', 'payment_method_encoded'], axis=1)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

print(f"\nTraining set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

# Train XGBoost model
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    max_depth=5,
    n_estimators=100,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test)
y_prob = xgb_model.predict_proba(X_test)[:, 1]

# Evaluate the model
print("\nModel Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': xgb_model.feature_importances_
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

print("\nTop 10 Features by Importance:")
print(feature_importance.head(10))

###########################################
# PART 5: SHAP VALUES FOR FEATURE IMPORTANCE
###########################################

# Calculate SHAP values
explainer = shap.Explainer(xgb_model)
shap_values = explainer(X_test)

# Print mean absolute SHAP values for top 10 features
mean_shap = pd.DataFrame({
    'Feature': X_test.columns,
    'SHAP_abs': np.abs(shap_values.values).mean(0)
})
mean_shap = mean_shap.sort_values('SHAP_abs', ascending=False)

print("\nTop 10 Features by Mean Absolute SHAP Value:")
print(mean_shap.head(10))

print("\nDataset Summary:")
print(f"1. Categorical variables that are correlated and explain target similarly: occupation, education")
print(f"2. Categorical variables that are uncorrelated but both explain target: device_type, payment_method")
print(f"3. Continuous variables that are correlated and explain target similarly: income, savings")
print(f"4. Continuous variables that are uncorrelated but both explain target: age, purchase_frequency")

print("\nResults explanation:")
print("1. The SHAP values show which features are most important for the model's predictions.")
print("2. Higher SHAP values indicate stronger impact on the model output.")
print("3. The direction of SHAP values (positive/negative) shows whether a feature increases or decreases the prediction.")

# Return the first few rows of the dataframe
df.head(10)

In [None]:
df.head()

In [None]:

# Remove the helper column that holds the probability the target was drawn
# from, so it cannot leak into later analysis. Reassigning with
# errors='ignore' (instead of an in-place drop) keeps the cell safe to re-run:
# a second run no longer raises KeyError for the already-missing column.
df = df.drop(columns=['target_prob'], errors='ignore')

In [None]:
df.head()

In [None]:
associations(df)

In [None]:
category_features=['occupation', 'education', 'device_type', 'payment_method']
num_features=['income', 'savings', 'age', 'purchase_frequency']
selected_features=category_features+num_features

In [None]:
df[category_features] = df[category_features].astype('category')

In [None]:
df.head()

In [None]:
# NOTE(review): df_final (with columns 'x1', 'x3' and 'target') is not created
# in any cell visible above, so on a fresh kernel this cell fails with a
# NameError — confirm where df_final is supposed to be built.
X = df_final[['x1','x3']]
y = df_final['target']
# Split data
# 70/30 split; stratify=y asks for the class proportions of y to be kept in
# both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


In [None]:
df_final.head()

In [None]:
# Build model with scikit-learn API
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    learning_rate=0.01,
    max_depth=4,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='hist',
    # Class weighting: number of negatives divided by number of positives in
    # the training labels.
    scale_pos_weight=sum(y_train==0)/sum(y_train==1),
    enable_categorical=True,  # Enable categorical features
    random_state=42,
   # early_stopping_rounds=20,
)

# Fit the model. Early stopping is NOT active: early_stopping_rounds is
# commented out above, so eval_set is only used to report the eval metric.
# NOTE(review): the evaluation set is the test split itself; if early stopping
# is re-enabled, use a separate validation split so the test data stays unseen.
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],  
    verbose=20
)


In [None]:
# Initialize SHAP explainer
explainer = shap.TreeExplainer(model)
shap_values = explainer(X_test)

In [None]:
# Step 5: Print average SHAP values
mean_abs_shap = pd.DataFrame({
    "Feature": X_test.columns,
    "Mean |SHAP|": np.abs(shap_values.values).mean(axis=0)
}).sort_values(by="Mean |SHAP|", ascending=False)

print(mean_abs_shap)

In [None]:
# Step 5: Print average SHAP values
# NOTE(review): this cell repeats the preceding cell verbatim; one of the two
# can be removed.
mean_abs_shap = pd.DataFrame({
    "Feature": X_test.columns,
    "Mean |SHAP|": np.abs(shap_values.values).mean(axis=0)
}).sort_values(by="Mean |SHAP|", ascending=False)

print(mean_abs_shap)

In [None]:
from sklearn.metrics import roc_auc_score

# Get predicted probabilities for the positive class
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Calculate ROC AUC
auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC: {auc:.4f}")


In [None]:
from sklearn.metrics import roc_auc_score

# Get predicted probabilities for the positive class
# NOTE(review): this cell recomputes the same ROC AUC as the previous cell from
# the same model / X_test / y_test; one of the two cells can be removed.
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Calculate ROC AUC
auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC: {auc:.4f}")


In [None]:
# Extract SHAP values into DataFrame
shap_df = pd.DataFrame(shap_values.values, columns=shap_values.feature_names)


In [None]:
shap_df

In [None]:
associations(df_final)

In [None]:
associations(shap_df)

In [None]:
df_final.head()

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
import time

print("Starting dataset generation...")
start_time = time.time()

# Define parameters
n_samples = 300000  # 3 lakh records
n_features = 1000  # Total features
n_informative = 20  # Features with strong predictive relationship
n_redundant = 10  # Correlated features
n_categorical = 200  # Number of categorical features

# Dedicated, seeded generator: which columns get binned and how many bins each
# one receives is now the same on every run, independent of whatever state the
# global NumPy RNG was left in by earlier cells.
binning_rng = np.random.RandomState(42)

# Generate the base dataset with numeric features
# NOTE(review): n_samples x n_features float64 values is roughly 2.4 GB before
# any copies are made; lower n_samples / n_features for a quick run.
print("Generating base classification dataset...")
X, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    n_redundant=n_redundant,
    n_repeated=0,
    n_classes=2,
    random_state=42,
    shuffle=True,
    class_sep=1.0,  # scikit-learn's default; raise it for an easier problem
)

# Convert to DataFrame
print("Converting to DataFrame...")
column_names = [f'num_{i}' for i in range(n_features)]
df = pd.DataFrame(X, columns=column_names)

# Add binary target column
df['target'] = y

# Convert some numeric features to categorical
print("Converting some features to categorical...")
categorical_indices = binning_rng.choice(n_features, n_categorical, replace=False)
categorical_cols = []

for idx in categorical_indices:
    col_name = f'num_{idx}'
    categorical_cols.append(col_name)

    # Determine number of categories (between 3 and 10)
    n_categories = binning_rng.randint(3, 11)

    # Quantile-bin the column; pd.qcut already returns a categorical column,
    # so no separate astype('category') pass is needed.
    df[col_name] = pd.qcut(
        df[col_name],
        q=n_categories,
        labels=[f'cat_{j}' for j in range(n_categories)],
        duplicates='drop'
    )

# Rename all binned columns in a single pass. Renaming one column per iteration
# returned a fresh copy of the whole frame n_categorical times.
df = df.rename(columns={col: f'cat_{i}' for i, col in enumerate(categorical_cols)})

# Add some datetime features
print("Adding datetime features...")
base_date = pd.Timestamp('2020-01-01')
date_range = pd.date_range(start=base_date, periods=n_samples, freq='30s')
df['date_feature'] = date_range
df['month'] = df['date_feature'].dt.month.astype('category')
df['day'] = df['date_feature'].dt.day.astype('category')
df['hour'] = df['date_feature'].dt.hour.astype('category')

# Add some ID-like features
print("Adding ID features...")
df['id'] = np.arange(n_samples)
df['uuid'] = [f'uuid-{i:09d}' for i in range(n_samples)]

# Generate dataset statistics
print("\nDataset Statistics:")
print(f"Number of samples: {n_samples}")
print(f"Number of features: {df.shape[1] - 1}")  # excluding target
print(f"Number of categorical features: {len(categorical_cols) + 3}")  # +3 for month, day, hour
print(f"Target distribution:\n{df['target'].value_counts(normalize=True)}")

# Display memory usage
memory_usage = df.memory_usage(deep=True).sum() / (1024 * 1024)
print(f"Dataset memory usage: {memory_usage:.2f} MB")

# Display first few rows
print("\nFirst 5 rows of the dataset:")
print(df.head())

# Display data types
print("\nData types:")
print(df.dtypes.value_counts())

# Check for null values
print(f"\nNull values in dataset: {df.isnull().sum().sum()}")

elapsed_time = time.time() - start_time
print(f"\nTotal time to generate dataset: {elapsed_time:.2f} seconds")

# Save a small sample for demonstration
sample_df = df.sample(n=min(1000, n_samples), random_state=42)
sample_df.to_csv('synthetic_dataset_sample.csv', index=False)
print("Saved sample to 'synthetic_dataset_sample.csv'")

# Show the first row as the cell output
print("Dataset generation complete!")
df.head(1)

In [None]:
df.head(2)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import shap

In [None]:
# Cast every object-dtype (string) column of credit_risk_df to the categorical dtype.
# NOTE(review): credit_risk_df must already exist when this cell runs — confirm the
# cell that builds it is executed first.
credit_risk_df = credit_risk_df.astype({
    name: 'category'
    for name in credit_risk_df.select_dtypes(include='object').columns
})

In [None]:
credit_risk_df.dtypes

In [None]:
credit_risk_df.head()

In [None]:
X = credit_risk_df.loc[:, 'income':'application_channel']
y = credit_risk_df['default']
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [None]:
y.value_counts(normalize=True)

In [None]:
from example_uv.func import shap_feature_selection

In [None]:
X.columns

In [None]:
selected_features,importance_df =shap_feature_selection(train_data=credit_risk_df,feature_names=X.columns,target_name='default',verbose=True,
                                                        test_size=0.3,random_state=42,use_train_for_shap=False)

In [None]:
importance_df

In [None]:
f=importance_df[importance_df['Cumulative_Importance']<0.9999]['Feature'].values

In [None]:
f

In [None]:
c = associations(credit_risk_df[f])

In [None]:
c['corr']

In [None]:
# Blank out the diagonal (self-associations) so it cannot dominate the maximum
corr_matrix_no_diag = c['corr'].mask(np.eye(c['corr'].shape[0], dtype=bool))

# Largest and smallest off-diagonal association: reduce per column, then overall
max_corr = corr_matrix_no_diag.max().max()
min_corr = corr_matrix_no_diag.min().min()

print(f"Max correlation (excluding 1.0): {max_corr}")
print(f"Min correlation: {min_corr}")

In [None]:
c

In [None]:
# Model only on the columns listed in `f`; hold out 40% of the rows, stratified
# on the target so both splits keep the same default rate.
X = credit_risk_df[f]
y = credit_risk_df['default']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Gradient-boosted binary classifier built through XGBoost's scikit-learn wrapper.
model = xgb.XGBClassifier(
    # Learning task and reported metric
    objective='binary:logistic',
    eval_metric='auc',
    # Re-weight positives by the negative/positive ratio of the training labels
    scale_pos_weight=sum(y_train == 0) / sum(y_train == 1),
    # Tree construction (categorical dtype columns are handled natively)
    tree_method='hist',
    enable_categorical=True,
    max_depth=4,
    min_child_weight=1,
    # Shrinkage and row/column sampling
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

# Fit and log the AUC on (X_test, y_test) during training. Early stopping is
# NOT active here: early_stopping_rounds is not passed to the estimator.
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=20)

In [None]:
# Initialize SHAP explainer
explainer = shap.TreeExplainer(model)
shap_values = explainer(X_test)


In [None]:
shap.summary_plot(shap_values)
#

In [None]:
#Step 5: Print average SHAP values
mean_abs_shap = pd.DataFrame({
    "Feature": X_test.columns,
    "Mean |SHAP|": np.abs(shap_values.values).mean(axis=0)
}).sort_values(by="Mean |SHAP|", ascending=False)

print(mean_abs_shap)


In [None]:
from sklearn.metrics import roc_auc_score

# Get predicted probabilities for the positive class
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Calculate ROC AUC
auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC: {auc:.4f}")



In [None]:
# Extract SHAP values into DataFrame
shap_df = pd.DataFrame(shap_values.values, columns=shap_values.feature_names)
cc=associations(shap_df)

In [None]:
cc['corr']

In [None]:
len(shap_values.feature_names)

In [None]:
# Blank out the diagonal (self-associations) so it cannot dominate the maximum
corr_matrix_no_diag = cc['corr'].mask(np.eye(cc['corr'].shape[0], dtype=bool))

# Largest and smallest off-diagonal association: reduce per column, then overall
max_corr = corr_matrix_no_diag.max().max()
min_corr = corr_matrix_no_diag.min().min()

print(f"Max correlation (excluding 1.0): {max_corr}")
print(f"Min correlation: {min_corr}")

In [None]:
cluster_correlations(associations(shap_df,plot=False)['corr'])

In [None]:
len([11, 26,  4, 14, 20, 24, 16, 22,  1,  6, 25, 13,  3, 21, 17,  9,  7,
         8, 12, 23, 27, 18, 15,  5, 29, 28, 10, 19,  2])

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
import random
from scipy import stats

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

# Function to generate synthetic data
def generate_credit_risk_data(n_samples=50000, default_rate=0.10):
    """
    Build a synthetic credit-risk table with a binary ``default`` target.

    The base numeric columns come from ``make_classification`` (fixed
    ``random_state=42``); every other random draw uses the global NumPy RNG,
    so seed it (``np.random.seed``) before calling for repeatable output.

    Parameters:
    - n_samples: Number of rows to generate
    - default_rate: Class weight given to the positive class (default == 1);
      the realized share of defaults is approximate, not exact

    Returns:
    - DataFrame holding the feature columns followed by the ``default`` column
    """
    # Base problem: 10 numeric columns (8 informative, 2 redundant) plus labels
    X, y = make_classification(
        n_samples=n_samples,
        n_features=10,
        n_informative=8,
        n_redundant=2,
        n_classes=2,
        weights=[1-default_rate, default_rate],
        random_state=42
    )

    def _draw_by_target(choices, p_default, p_nondefault):
        """Draw one value per row: p_default where the label is 1, else p_nondefault."""
        return [
            np.random.choice(choices, p=p_default if label == 1 else p_nondefault)
            for label in y
        ]

    df = pd.DataFrame(X, columns=[f"feature_{k}" for k in range(1, X.shape[1] + 1)])

    # Only the first six base columns get business names; feature_7 .. feature_10
    # stay in the frame under their generic names.
    df = df.rename(columns={
        'feature_1': 'income',
        'feature_2': 'debt_to_income_ratio',
        'feature_3': 'credit_score',
        'feature_4': 'loan_amount',
        'feature_5': 'interest_rate',
        'feature_6': 'age',
    })

    # total_assets tracks the (still unscaled) income: multiplicative + additive noise
    asset_multiplier = np.random.normal(3, 0.2, n_samples)
    asset_noise = np.random.normal(0, 0.5, n_samples)
    df['total_assets'] = df['income'] * asset_multiplier + asset_noise

    # Rescale the named columns into more realistic ranges
    df['income'] = (df['income'] * 20000 + 50000).clip(10000, 250000)
    df['debt_to_income_ratio'] = (df['debt_to_income_ratio'] + 3) / 6
    df['debt_to_income_ratio'] = (df['debt_to_income_ratio'] * 0.6 + 0.1).clip(0.05, 0.8)
    df['credit_score'] = (df['credit_score'] * 150 + 650).clip(300, 850).astype(int)
    df['loan_amount'] = (df['loan_amount'] * 50000 + 100000).clip(5000, 500000)
    # Maps raw values in [-3, 3] onto [2, 17]; there is no clip, so raw values
    # outside that interval land outside 2%-17%.
    df['interest_rate'] = (df['interest_rate'] + 3) / 6 * 15 + 2
    df['age'] = (df['age'] * 20 + 40).clip(18, 85).astype(int)

    # Strongly predictive categoricals: each is sampled from a distribution that
    # depends on the row's label (defaulters first, non-defaulters second).
    df['employment_status'] = _draw_by_target(
        ['Employed', 'Self-Employed', 'Unemployed', 'Retired'],
        [0.3, 0.2, 0.4, 0.1],
        [0.7, 0.2, 0.05, 0.05],
    )
    df['education_level'] = _draw_by_target(
        ['High School', 'Bachelor', 'Master', 'PhD', 'Other'],
        [0.5, 0.3, 0.1, 0.05, 0.05],
        [0.2, 0.4, 0.3, 0.08, 0.02],
    )
    df['loan_purpose'] = _draw_by_target(
        ['Home', 'Auto', 'Education', 'Personal', 'Business', 'Debt Consolidation'],
        [0.1, 0.15, 0.2, 0.2, 0.25, 0.1],
        [0.3, 0.2, 0.1, 0.1, 0.1, 0.2],
    )
    # 70% of defaulters vs 10% of non-defaulters carry a previous default
    previous_default_flags = _draw_by_target([1, 0], [0.7, 0.3], [0.1, 0.9])
    df['has_previous_defaults'] = [
        'Yes' if flag == 1 else 'No' for flag in previous_default_flags
    ]

    # payment_history depends on employment_status. Label arrays may repeat a
    # label, which merges those slots' probabilities (e.g. 'Unemployed' yields
    # 'Fair' with p=0.1 and 'Poor' with p=0.9).
    payment_history_map = {
        'Employed': np.array(['Excellent', 'Good', 'Fair', 'Poor']),
        'Self-Employed': np.array(['Good', 'Fair', 'Fair', 'Poor']),
        'Unemployed': np.array(['Fair', 'Poor', 'Poor', 'Poor']),
        'Retired': np.array(['Excellent', 'Good', 'Fair', 'Poor'])
    }
    payment_probs = {
        'Employed': [0.5, 0.3, 0.15, 0.05],
        'Self-Employed': [0.3, 0.4, 0.2, 0.1],
        'Unemployed': [0.1, 0.2, 0.3, 0.4],
        'Retired': [0.4, 0.3, 0.2, 0.1]
    }
    df['payment_history'] = [
        np.random.choice(payment_history_map[status], p=payment_probs[status])
        for status in df['employment_status']
    ]

    # 30 weak numeric features: Gaussian noise plus a small label-dependent shift
    # (larger for the first five), then one of four shapes chosen by i % 4.
    for i in range(1, 31):
        shift_low, shift_high = (0.1, 0.3) if i <= 5 else (0, 0.1)
        noise = np.random.normal(0, 1, n_samples)
        feature_values = noise + y * np.random.uniform(shift_low, shift_high)

        shape_kind = i % 4
        if shape_kind == 0:
            # Exponential-like (e.g. transaction amounts)
            feature_values = np.exp(feature_values * 0.5) * 100
        elif shape_kind == 1:
            # Percentage-like (e.g. utilization rates)
            feature_values = stats.norm.cdf(feature_values) * 100
        elif shape_kind == 2:
            # Count-like (e.g. number of inquiries)
            feature_values = np.abs(feature_values * 5).astype(int)
        # shape_kind == 3: keep the shifted Gaussian as is

        df[f'numeric_feature_{i}'] = feature_values

    # 10 further categoricals; only the first three depend on the label
    categorical_vars = [
        ('marital_status', ['Single', 'Married', 'Divorced', 'Widowed']),
        ('housing_status', ['Own', 'Mortgage', 'Rent', 'Other']),
        ('job_industry', ['Technology', 'Healthcare', 'Finance', 'Education', 'Manufacturing', 'Retail', 'Other']),
        ('state', ['CA', 'NY', 'TX', 'FL', 'IL', 'PA', 'OH', 'GA', 'Other']),
        ('credit_card_type', ['Visa', 'Mastercard', 'Amex', 'Discover', 'None']),
        ('num_dependents', [0, 1, 2, 3, 4, '5+']),
        ('months_at_current_job', ['<6', '6-12', '1-3 years', '3-5 years', '5+ years']),
        ('has_cosigner', ['Yes', 'No']),
        ('account_type', ['Checking', 'Savings', 'Both', 'None']),
        ('application_channel', ['Online', 'In-person', 'Phone', 'Mail'])
    ]

    for position, (feature_name, categories) in enumerate(categorical_vars):
        if position >= 3:
            # Uniform draw, independent of the label
            df[feature_name] = np.random.choice(categories, size=n_samples)
            continue

        # Two random category distributions, one per class
        p_default = np.random.dirichlet(np.ones(len(categories)) * 2)
        p_non_default = np.random.dirichlet(np.ones(len(categories)) * 2)

        # Boost the defaulters' most likely category, then renormalize
        p_default[np.argmax(p_default)] += 0.1
        p_default = p_default / sum(p_default)

        df[feature_name] = _draw_by_target(categories, p_default, p_non_default)

    # Target goes last
    df['default'] = y

    return df

# Generate the dataset
credit_risk_df = generate_credit_risk_data(n_samples=50000, default_rate=0.10)

# Verify the default rate
default_rate = credit_risk_df['default'].mean()
print(f"Default rate in the dataset: {default_rate:.4f}")

# Check correlation between income and total_assets (should be highly correlated)
income_assets_corr = credit_risk_df['income'].corr(credit_risk_df['total_assets'])
print(f"Correlation between income and total_assets: {income_assets_corr:.4f}")

# Check correlation between employment_status and payment_history (categorical correlation)
crosstab = pd.crosstab(credit_risk_df['employment_status'], credit_risk_df['payment_history'])
print("\nCrosstab of employment_status and payment_history:")
print(crosstab)

# Show information about the dataset
print("\nDataset information:")
print(f"Total rows: {len(credit_risk_df)}")
print(f"Total columns: {len(credit_risk_df.columns)}")
print(f"Default cases: {credit_risk_df['default'].sum()}")
print(f"Non-default cases: {len(credit_risk_df) - credit_risk_df['default'].sum()}")

# Show a sample of the data
print("\nSample of the generated dataset:")
print(credit_risk_df.head())

# Feature importance analysis
# We'll use correlation for numeric features and chi-square for categorical
from scipy.stats import chi2_contingency

# Analyze numeric features
# include='number' selects every numeric dtype, so integer columns created with
# .astype(int) are kept even where the platform's default integer is 32-bit
# (a hard-coded ['float64', 'int64'] list would silently skip them).
numeric_features = [
    col
    for col in credit_risk_df.select_dtypes(include='number').columns
    if col != 'default'
]

print("\nCorrelation of numeric features with default:")
# Absolute Pearson correlation of each numeric column with the target
correlations = {
    col: abs(credit_risk_df[col].corr(credit_risk_df['default']))
    for col in numeric_features
}

# Values are already absolute, so sorting on them ranks by strength
sorted_numeric = sorted(correlations.items(), key=lambda item: item[1], reverse=True)
for feature, corr in sorted_numeric[:15]:  # Show top 15
    print(f"{feature}: {corr:.4f}")

# Analyze categorical features
categorical_features = credit_risk_df.select_dtypes(include=['object']).columns
print("\nChi-square statistics for categorical features:")
chi2_values = {}

for col in categorical_features:
    # Chi-square test of independence between the category and the target
    contingency = pd.crosstab(credit_risk_df[col], credit_risk_df['default'])
    chi2, p, _dof, _expected = chi2_contingency(contingency)
    chi2_values[col] = (chi2, p)

# Strongest association (largest chi-square statistic) first
sorted_categorical = sorted(chi2_values.items(), key=lambda x: x[1][0], reverse=True)
for feature, (chi2, p) in sorted_categorical:
    print(f"{feature}: Chi2={chi2:.2f}, p-value={p:.6f}")

# Save to CSV
credit_risk_df.to_csv('credit_risk_dataset.csv', index=False)
print("\nDataset saved to 'credit_risk_dataset.csv'")

# Summary of highly predictive features
print("\nHighly predictive features:")
print("Numeric: income, debt_to_income_ratio, credit_score, loan_amount, interest_rate, age")
print("Correlated numeric pair: income and total_assets")
print("Categorical: employment_status, education_level, loan_purpose, has_previous_defaults")
print("Correlated categorical pair: employment_status and payment_history")

In [None]:
credit_risk_df.head(1)

In [None]:
associations(credit_risk_df)

In [None]:
cluster_correlations(associations(credit_risk_df,plot=False)['corr'])

In [None]:
len([4, 1, 1, 2, 2, 6, 1, 2, 5, 4, 4, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 3, 3, 5, 5, 5, 5, 5, 5, 5, 3])

In [None]:

from sklearn.datasets import make_classification
import pandas as pd


# Generate the dataset
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,      # Informative features
    n_redundant=5,        # Redundant features (linear combinations of informative)
    n_classes=2,
    random_state=10
)

# Create feature names like "feature_0", "feature_1", ..., "feature_9"
feature_names = [f"feature_{i}" for i in range(X.shape[1])]

# Convert to pandas DataFrame
X = pd.DataFrame(X, columns=feature_names)

# Optionally convert target to Series
y = pd.Series(y, name="target")

In [None]:
X= pd.DataFrame(X)
y= pd.DataFrame(y)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=0)

In [None]:
corrmat = X_train.corr()

In [None]:
import numpy as np

# Keep only the strict upper triangle so each feature pair appears once and the
# diagonal (self-correlation) is excluded
mask = np.triu(np.ones(corrmat.shape, dtype=bool), k=1)

# Long format: one row per (feature_1, feature_2, correlation)
high_corr_pairs = corrmat.where(mask).stack().reset_index()
high_corr_pairs = high_corr_pairs.rename(
    columns={"level_0": "feature_1", "level_1": "feature_2", 0: "correlation"}
)

# Keep strongly correlated pairs, regardless of sign
high_corr_pairs = high_corr_pairs[high_corr_pairs["correlation"].abs() > 0.80]
print(high_corr_pairs)


In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from typing import Dict, List, Set


def find_correlation_groups(
    X: pd.DataFrame,
    corr_threshold: float = 0.8
) -> Dict[int, List[str]]:
    """
    Find groups of correlated features based on a correlation threshold.

    Two features belong to the same group when they are linked by a chain of
    pairwise absolute correlations above ``corr_threshold`` (connected
    components of the "strongly correlated" graph).

    Parameters
    ----------
    X : pd.DataFrame
        DataFrame containing only numeric feature columns.
    corr_threshold : float, optional (default=0.8)
        Absolute correlation threshold above which features are considered correlated.

    Returns
    -------
    Dict[int, List[str]]
        Dictionary where each key is a group ID and each value is a sorted list of
        feature names. IDs are contiguous from 0: multi-feature groups come first
        (ordered by their earliest column), followed by one single-feature group per
        feature that is not correlated with any other above the threshold.
    """
    corrmat = X.corr().abs()
    names = list(corrmat.columns)
    strength = corrmat.to_numpy()

    # Union-find over column positions; every column starts as its own group.
    parent = list(range(len(names)))

    def _root(node: int) -> int:
        """Return the representative position of the group containing ``node``."""
        while parent[node] != node:
            parent[node] = parent[parent[node]]  # shorten the chain as we walk it
            node = parent[node]
        return node

    # The strict upper triangle visits each unordered pair once and skips the
    # diagonal by position, so two distinct features with a correlation of
    # exactly 1 are still linked (filtering on "corr < 1" would drop them).
    # NaN correlations compare False and therefore never link anything.
    linked = np.triu(strength > corr_threshold, k=1)
    for left, right in zip(*np.nonzero(linked)):
        root_left, root_right = _root(left), _root(right)
        if root_left != root_right:
            # Attach to the smaller position so a group's root is its earliest column
            parent[max(root_left, root_right)] = min(root_left, root_right)

    # Collect members per group, in column order
    members: Dict[int, List[str]] = {}
    for position, name in enumerate(names):
        members.setdefault(_root(position), []).append(name)

    correlated = [group for group in members.values() if len(group) > 1]
    singles = [group for group in members.values() if len(group) == 1]

    # Columns missing from the correlation matrix still get their own group
    in_corrmat = set(names)
    singles.extend([name] for name in X.columns if name not in in_corrmat)

    # Number the groups only once the final set is known, so IDs can never collide
    return {
        group_id: sorted(group)
        for group_id, group in enumerate(correlated + singles)
    }

In [None]:
correlated_groups = find_correlation_groups(X_train, corr_threshold=0.8)

In [None]:
correlated_groups

In [None]:
# Hand-written importance table for three features, built column-wise
impdata = pd.DataFrame({
    "feature_name": ["feature_4", "feature_5", "feature_8"],
    "importance": [25, 30, 35],
})

In [None]:
import pandas as pd
import numpy as np

def select_best_features_from_groups(correlated_groups, feature_importance_df,
                                     feature_name_col='feature_name',
                                     feature_importance_col='importance'):
    """
    Select the best feature from each correlation group based on feature importance

    Features absent from ``feature_importance_df`` get a NaN importance and rank
    below every feature that has one. If no feature in a group has an
    importance, the first feature of the group is kept.

    Parameters:
    -----------
    correlated_groups : dict
        Dictionary where keys are group IDs and values are lists of feature names
    feature_importance_df : pandas DataFrame
        DataFrame with at least two columns for feature names and importance values
    feature_name_col : str
        Name of the feature column to be used for importance comparison
    feature_importance_col : str
        Name of the importance column to be used for comparison

    Returns:
    --------
    result_df : pandas DataFrame
        DataFrame with columns: 'feature', 'group', 'importance', 'keep'
        (empty, with these columns, when there are no features at all)
    selected_features : list
        List of features to keep
    """
    # Create a dictionary for quick lookup of feature importance
    importance_dict = pd.Series(
        feature_importance_df[feature_importance_col].values,
        index=feature_importance_df[feature_name_col]
    ).to_dict()

    def _rank(pair):
        """Sort key for (feature, importance): missing importance ranks lowest."""
        # Every comparison against NaN is False, so feeding NaN straight to max()
        # makes the winner depend on list order; map it to -inf instead.
        value = pair[1]
        return float('-inf') if pd.isna(value) else value

    # Create a list to hold rows for the result dataframe
    result_rows = []

    # Process each correlation group
    for group_id, features in correlated_groups.items():
        if len(features) == 0:
            # Nothing to choose from in an empty group
            continue

        # Get importance for each feature in the group
        group_features_data = [(f, importance_dict.get(f, float('nan'))) for f in features]

        # Highest importance wins; max() returns the first item on ties, so a
        # single-feature or all-NaN group keeps its first feature.
        best_feature, _ = max(group_features_data, key=_rank)

        # Add all features from this group to results
        for feature, importance in group_features_data:
            result_rows.append({
                'feature': feature,
                'group': group_id,
                'importance': importance,
                'keep': feature == best_feature
            })

    # Explicit columns keep the frame well-formed even when there are no rows
    result_df = pd.DataFrame(result_rows, columns=['feature', 'group', 'importance', 'keep'])
    if result_df.empty:
        return result_df, []

    # Sort by group and importance (descending)
    result_df = result_df.sort_values(['group', 'importance'], ascending=[True, False])

    # Get list of features to keep
    selected_features = result_df.loc[result_df['keep'], 'feature'].tolist()

    return result_df, selected_features

In [None]:
 result_df, selected_features = select_best_features_from_groups(
     correlated_groups, impdata)
 

In [None]:
result_df

In [None]:
selected_features

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

# Set random seed for reproducibility
np.random.seed(42)

# Sample size
n = 1000

# Create two uncorrelated predictors
x1 = np.random.normal(0, 1, n)
x2 = np.random.normal(0, 1, n)

# Ensure they are uncorrelated by generating independent random variables
# We'll check correlation later to confirm

# Create target variable that depends equally on both predictors
# Using a logistic function to create binary outcomes
# z = 0.5*x1 + 0.5*x2 + noise
z = 0.5 * x1 + 0.5 * x2 + np.random.normal(0, 0.5, n)
probabilities = 1 / (1 + np.exp(-z))  # Sigmoid function
y = (probabilities > 0.5).astype(int)  # Binary outcome

# Create DataFrame
df = pd.DataFrame({
    'predictor1': x1,
    'predictor2': x2,
    'target': y
})

# Check correlation between predictors
correlation = np.corrcoef(x1, x2)[0, 1]
print(f"Correlation between predictor1 and predictor2: {correlation:.6f}")

# Verify equal contribution using logistic regression
X = df[['predictor1', 'predictor2']]
y = df['target']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train logistic regression
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Print coefficients
print("\nLogistic Regression Coefficients:")
for feature, coef in zip(X.columns, model.coef_[0]):
    print(f"{feature}: {coef:.6f}")

# Model performance
y_pred = model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Visualize the dataset
plt.figure(figsize=(12, 10))

# Plot 1: Scatter plot of predictors colored by target
plt.subplot(2, 2, 1)
sns.scatterplot(x='predictor1', y='predictor2', hue='target', data=df, alpha=0.6)
plt.title('Scatter Plot: Predictor1 vs Predictor2')

# Plot 2: Distribution of predictor1 by target class
plt.subplot(2, 2, 2)
sns.histplot(data=df, x='predictor1', hue='target', bins=30, kde=True, element="step")
plt.title('Distribution of Predictor1 by Target Class')

# Plot 3: Distribution of predictor2 by target class
plt.subplot(2, 2, 3)
sns.histplot(data=df, x='predictor2', hue='target', bins=30, kde=True, element="step")
plt.title('Distribution of Predictor2 by Target Class')

# Plot 4: Correlation heatmap
plt.subplot(2, 2, 4)
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap')

plt.tight_layout()
plt.show()

# Preview the dataset
print("\nDataset Preview:")
print(df.head())

# Dataset statistics
print("\nDataset Statistics:")
print(df.describe())

In [None]:
from scorecardutils.feature_selection import shap_feature_selection,find_correlation_groups,select_best_features_from_corr_groups

In [None]:
df.head()

In [None]:
## Define target and features names
target = 'target'
features = df.drop(columns=[target]).columns.tolist()

In [None]:
selected_features,importance_df,_ =shap_feature_selection(train_data=df,feature_names=features,target_name=target,verbose=True,
                                                        test_size=0.3,random_state=42,use_train_for_shap=False)

In [None]:
selected_features

In [None]:
importance_df

In [None]:
_,importance_df,shapDF =shap_feature_selection(train_data=df,feature_names=selected_features,target_name=target,
                                               verbose=False,
                                                        split_data=False,random_state=42,
                                                        create_shap_df=True)

In [None]:

shapDF

In [None]:
importance_df

In [None]:
correlated_groups = find_correlation_groups(shapDF, corr_threshold=0.8)

In [None]:
correlated_groups

In [None]:
shapDF.corr()