In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
import numpy as np
import pandas as pd
import cvxpy as cp

In [58]:
def generate_synthetic_fico_data(n=10000, seed=42):
    """
    Generate a synthetic dataset of FICO-style component scores.

    Five component scores are sampled independently and then combined into a
    'FICO Score' column as a fixed weighted sum of the components.

    Note: this reseeds NumPy's *global* random generator, so it also affects
    every later ``np.random`` call made in the same session.

    Args:
    n (int): Number of synthetic individuals (rows) to generate.
    seed (int): Seed passed to ``np.random.seed`` for reproducibility.

    Returns:
    pd.DataFrame: One row per individual with the columns 'Payment History',
    'Amounts Owed', 'Length of Credit History', 'Credit Mix', 'New Credit'
    and 'FICO Score'.
    """
    np.random.seed(seed)  # For reproducibility

    # The order of the sampling calls below is kept fixed so that a given
    # seed always yields the same dataset.

    # Payment History: normal around 700 (sd 100) minus a discrete penalty
    # of 0/20/50/100 drawn with probabilities 0.70/0.15/0.10/0.05.
    base_payment_score = np.random.normal(700, 100, n)
    payment_penalty = np.random.choice([0, 20, 50, 100], n, p=[0.7, 0.15, 0.1, 0.05])
    payment_history = base_payment_score - payment_penalty

    # Amounts Owed: 850 minus a right-skewed Beta(2, 5) draw scaled to [0, 500].
    amounts_owed = 850 - np.random.beta(2, 5, n) * 500

    # Length of Credit History: uniform integer in [300, 850).
    length_of_credit_history = np.random.choice(range(300, 850), n, replace=True)

    # Credit Mix: uniform integer in [600, 850).
    credit_mix = np.random.choice(range(600, 850), n, replace=True)

    # New Credit: 850 minus a Gamma(shape=2, scale=100) draw.
    new_credit = 850 - np.random.gamma(2, 100, n)

    # Combine into DataFrame
    df = pd.DataFrame({
        'Payment History': payment_history,
        'Amounts Owed': amounts_owed,
        'Length of Credit History': length_of_credit_history,
        'Credit Mix': credit_mix,
        'New Credit': new_credit
    })

    # Calculate FICO Score as a weighted sum of the components. A single
    # matrix-vector product replaces a row-by-row Python loop.
    weights = {'Payment History': 0.45, 'Amounts Owed': 0.20, 'Length of Credit History': 0.15, 'Credit Mix': 0.10, 'New Credit': 0.10}
    component_names = list(weights)
    weight_vector = np.array([weights[name] for name in component_names], dtype=float)
    df['FICO Score'] = df[component_names].to_numpy(dtype=float) @ weight_vector

    return df


def w(beta, gamma):
    """
    Applies the transformation exp(-(-ln(beta)) ** gamma) to beta.

    This has the form of Prelec's one-parameter probability-weighting
    function, which is defined for beta in [0, 1].

    Args:
    beta (float or np.ndarray): The input weight(s). Values outside [0, 1]
        (e.g. tiny negative numbers left by a constrained solver) are
        clipped to that interval instead of producing NaN.
    gamma (float): The gamma parameter for the transformation.

    Returns:
    float or np.ndarray: The transformed weight(s); 0 maps to 0 and 1 maps to 1.
    """
    # Keep the argument inside the function's domain; log() of a negative
    # number, or a fractional power of a negative number, would give NaN.
    beta = np.clip(beta, 0.0, 1.0)

    # log(0) = -inf is expected here and yields the correct limit w(0) = 0,
    # so silence the divide-by-zero warning it would otherwise emit.
    with np.errstate(divide='ignore'):
        return np.exp(-(-np.log(beta)) ** gamma)


def behavioral(weights, gamma):
    """
    Computes perceived weights by applying w(., gamma) to every weight.

    Args:
    weights (np.array): Original model weights (any 1-D array-like of numbers).
    gamma (float): The gamma parameter for the transformation.

    Returns:
    np.array: Float array, same length and order as ``weights``, where
    entry i equals w(weights[i], gamma).
    """
    # Always work in floating point so integer-typed input is not truncated
    # and plain lists are accepted.
    weights = np.asarray(weights, dtype=float)

    # NOTE(review): the previous implementation sorted the weights, transformed
    # each one, and then took sum(transformed[i:]) - sum(transformed[i+1:]).
    # That difference telescopes to transformed[i], so the result was simply
    # w(weights[i]) for every i; the elementwise call below is equivalent.
    # If a rank-dependent formula was intended (w applied to *cumulative sums*
    # of the sorted weights before differencing), this needs to change --
    # confirm against the model specification.
    return np.asarray(w(weights, gamma), dtype=float)

In [59]:
# Generate the synthetic dataset using the generator's default arguments.
df = generate_synthetic_fico_data()
# Work on a copy so that columns added to fico_df later do not modify df.
fico_df = df.copy()

In [60]:
def sigmoid(x, midpoint=650, steepness=0.1):
    """
    Logistic curve used to turn a score into a probability in (0, 1).

    Computes 1 / (1 + exp(-steepness * (x - midpoint))).

    Args:
    x (float, np.ndarray or pd.Series): Score(s) to convert.
    midpoint (float): Score at which the probability is exactly 0.5.
    steepness (float): Slope factor; larger values give a sharper transition.

    Returns:
    Same kind as ``x``: the probability for each score.
    """
    return 1 / (1 + np.exp(-steepness * (x - midpoint)))

# Calculate the approval probability for each individual from the 'FICO Score' column
fico_df['Approval Probability'] = sigmoid(fico_df['FICO Score'])

# Assign "true" labels: the label is 1 when the approval probability exceeds an
# independent uniform draw from [0, 1), and 0 otherwise.
# This introduces randomness in the assignment, making some borderline cases get approved or denied.
# NOTE(review): the draws come from NumPy's global random state (np.random.rand), which is
# seeded only inside generate_synthetic_fico_data(); re-running this cell on its own
# therefore produces different labels.
fico_df['True Label'] = (fico_df['Approval Probability'] > np.random.rand(fico_df.shape[0])).astype(int)

# Preview the first ten rows
fico_df.head(10)

Unnamed: 0,Payment History,Amounts Owed,Length of Credit History,Credit Mix,New Credit,FICO Score,Approval Probability,True Label
0,749.671415,706.285266,690,670,741.096754,723.218866,0.99934,1
1,686.17357,568.537433,624,685,719.431848,656.528778,0.657659,1
2,764.768854,782.821399,559,609,535.475954,699.007859,0.992614,1
3,852.302986,779.838422,500,609,774.948796,752.898908,0.999966,1
4,676.584663,802.699392,848,638,781.528487,734.155825,0.999779,1
5,676.586304,703.907235,417,664,776.518005,651.847084,0.546046,0
6,857.921282,652.638288,813,712,750.692035,784.811438,0.999999,1
7,756.743473,794.64831,530,657,256.070511,670.271276,0.883616,1
8,653.052561,732.876715,316,843,742.269311,646.375927,0.410377,1
9,734.256004,744.433418,795,636,514.639393,713.615825,0.998276,1


In [64]:
import cvxpy as cp

def logistic_regression_with_sum_constraint(X, y):
    """
    Train logistic regression model with the constraint that coefficients sum to 1.

    The coefficients are additionally constrained to lie in [0, 1]. The model
    is fitted by maximising the logistic log-likelihood with cvxpy.

    Args:
    X (pd.DataFrame or array-like): Feature matrix, shape (n_samples, n_features).
    y (pd.Series or array-like): Binary (0/1) target vector of length n_samples.

    Returns:
    tuple: (coefficients, intercept). Callers in this notebook name the second
    value "threshold" and classify a row as positive when
    ``row @ coefficients + intercept > 0``.

    Raises:
    ValueError: If X is not 2-D or X and y have a different number of rows.
    RuntimeError: If the solver finishes without producing a solution.
    """
    features = np.asarray(X, dtype=float)
    labels = np.asarray(y, dtype=float).ravel()

    if features.ndim != 2:
        raise ValueError("X must be a 2-D feature matrix.")
    n_samples, n_features = features.shape
    if labels.shape[0] != n_samples:
        raise ValueError("X and y must have the same number of rows.")

    # Variables
    beta = cp.Variable(n_features)
    intercept = cp.Variable()

    # Logistic log-likelihood: sum_i [ y_i * logit_i - log(1 + exp(logit_i)) ]
    logits = features @ beta + intercept
    log_likelihood = cp.sum(
        cp.multiply(labels, logits) - cp.logistic(logits)
    )

    # Objective and constraints
    objective = cp.Maximize(log_likelihood)
    constraints = [cp.sum(beta) == 1, beta >= 0, beta <= 1]

    # Problem
    problem = cp.Problem(objective, constraints)
    problem.solve()

    # When the solver fails (infeasible, unbounded, numerical trouble) the
    # variable values are None; fail loudly here instead of further downstream.
    if beta.value is None or intercept.value is None:
        raise RuntimeError(
            f"Constrained logistic regression did not produce a solution (status: {problem.status})."
        )

    return beta.value, intercept.value

# # Example usage to get beta and threshold
# X = pd.DataFrame([[0.1, 0.2], [0.4, 0.5], [0.7, 0.8], [0.3, 0.6], [0.5, 0.9]], columns=['Feature1', 'Feature2'])
# y = pd.Series([0, 1, 0, 0, 1])

# beta, threshold = logistic_regression_with_sum_constraint(X, y)
# print("Beta coefficients:", beta)
# print("Threshold:", threshold)


In [135]:
from sklearn.model_selection import train_test_split

# Build the modelling inputs from fico_df.
# Only the five component columns are used as features; 'FICO Score' and
# 'Approval Probability' are left out of X.
X = fico_df[['Payment History', 'Amounts Owed', 'Length of Credit History', 'Credit Mix', 'New Credit']]  # model features
y = fico_df['True Label']  # the target variable

# Splitting the dataset: 30% of the rows are held out for testing, with a fixed split seed
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=42)

# Train the logistic regression model.
# `threshold` is the second value returned by the fitting function, i.e. the
# fitted intercept added to the weighted feature sum.
beta, threshold = logistic_regression_with_sum_constraint(X_train, y_train)
print("Beta coefficients:", beta)
print("Threshold:", threshold)

# Show a few test rows with predictions next to the true labels.
# A row is predicted positive when its weighted feature sum plus `threshold` is above 0.
predictions = (X_test.values @ beta + threshold) > 0  
results = pd.DataFrame({'Prediction': predictions, 'True Label': y_test})
results.head(10)

Beta coefficients: [0.43061011 0.20517574 0.14545218 0.11374165 0.10502032]
Threshold: -652.8484759970745


Unnamed: 0,Prediction,True Label
6252,True,1
4684,True,1
1731,True,1
4742,True,1
4521,True,1
6340,True,1
576,True,1
5202,True,1
6363,True,1
439,False,0


In [136]:
# feature names
feature_names = X.columns

# feature weights
coefficients = beta

# Printing feature weights
print("Feature weights:")
for feature, coef in zip(feature_names, coefficients):
    print(f"{feature}: {coef:.4f}")

    # print perceived weights
perceived_weights = behavioral(coefficients, gamma=0.5)
print("\nPerceived Feature weights:")
for feature, coef in zip(feature_names, perceived_weights):
    print(f"{feature}: {coef:.4f}")

Feature weights:
Payment History: 0.4306
Amounts Owed: 0.2052
Length of Credit History: 0.1455
Credit Mix: 0.1137
New Credit: 0.1050

Perceived Feature weights:
Payment History: 0.3994
Amounts Owed: 0.2841
Length of Credit History: 0.2495
Credit Mix: 0.2289
New Credit: 0.2229


# Optimization

In [137]:
import cvxpy as cp
import numpy as np

def classifier(x, beta, threshold):
    """
    Linear decision rule: positive when the weighted feature sum plus the
    threshold term is strictly greater than zero.

    Args:
    x : np.array
        Feature vector of the agent.
    beta : np.array
        Weights of the classifier.
    threshold : float
        Offset added to the weighted sum before comparing with zero.

    Returns:
    int
        1 if ``x @ beta + threshold > 0``, 0 otherwise (a score of exactly
        zero is classified as 0).
    """
    score = x @ beta + threshold
    return int(score > 0)

def cost_function(z, x_0):
    """
    Manipulation cost, measured as the Euclidean (norm-2) distance between
    the manipulated and the original feature vector.

    Args:
    z : np.array
        Manipulated feature vector.
    x_0 : np.array
        Original feature vector.

    Returns:
    float
        Norm of ``z - x_0``.
    """
    displacement = z - x_0
    return np.linalg.norm(displacement)


def agent_optimization(x_0, beta, threshold, R, mode='actual', gamma=0.5):
    """
    Solve the agent's optimization problem.

    The agent looks for the feature vector closest to ``x_0`` (norm-2
    distance) whose score ``weights @ z + threshold`` is at least a small
    positive margin. In 'perceived' mode both the optimisation and the final
    acceptance check use the behaviorally transformed weights, not ``beta``.

    Args:
    x_0 : np.array
        Original feature vector.
    beta : np.array
        Weights of the classifier.
    threshold : float
        Threshold for classification.
    R : float
        Reward for being classified as positive.
    mode : str
        Mode of optimization ('actual' or 'perceived').
    gamma : float
        Gamma parameter for behavioral.

    Returns:
    np.array
        Optimized feature vector if its manipulation cost does not exceed
        ``R`` and it is classified positive under the weights used;
        otherwise ``x_0`` is returned unchanged. ``x_0`` is also returned
        when the solver yields no solution.

    Raises:
    ValueError
        If ``mode`` is neither 'actual' nor 'perceived'.
    """
    if mode == 'actual':
        weights = beta
    elif mode == 'perceived':
        weights = behavioral(beta, gamma)
    else:
        raise ValueError("Mode should be 'actual' or 'perceived'.")

    # Hand plain float arrays to cvxpy regardless of whether the caller
    # passed a pandas Series, a list or an ndarray.
    origin = np.asarray(x_0, dtype=float)
    weight_vector = np.asarray(weights, dtype=float)

    z = cp.Variable(len(origin))
    cost = cp.norm(z - origin, 2)
    # Small positive margin so the solution lands strictly on the positive
    # side of the decision boundary (the classifier uses a strict "> 0").
    eps = 1e-6
    prediction = cp.sum(cp.multiply(weight_vector, z)) + threshold
    constraints = [prediction >= eps]

    prob = cp.Problem(cp.Minimize(cost), constraints)
    prob.solve()

    optimized_features = z.value
    if optimized_features is None:
        # No solution (e.g. infeasible when all weights are zero): the agent
        # cannot reach a positive classification, so it does not move.
        return x_0

    manipulation_cost = cost_function(optimized_features, origin)
    final_prediction = classifier(optimized_features, weight_vector, threshold)

    # Move only if the reward covers the cost and the move actually achieves
    # a positive classification under the weights the agent optimised against.
    if R - manipulation_cost >= 0 and final_prediction == 1:
        return optimized_features
    else:
        return x_0


In [138]:
# # Define the initial parameters as pandas Series and numpy arrays
# x_0 = np.array([0.5, 1.0, 0.5])  # Original features of the agent
# beta = np.array([0.5, 0.3, 0.2])  # Weights of the classifier
# threshold = -1  # Classification threshold
# R = 2.0  # Reward for being classified as positive

# # Perform the classification before manipulation
# initial_classification = classifier(x_0, beta, threshold)
# print(f"Initial classification: {initial_classification}")

# # Solve the agent's optimization problem
# optimized_features = agent_optimization(x_0, beta, threshold, R, mode='actual')
# optimized_features_perceived = agent_optimization(x_0, beta, threshold, R, mode='perceived', gamma=0.5)

# # Perform the classification after manipulation
# final_classification = classifier(optimized_features, beta, threshold)
# print(f"Final classification: {final_classification}")

# final_classification_perceived = classifier(optimized_features_perceived, beta, threshold)
# print(f"Final classification (perceived): {final_classification_perceived}")

# # Print the optimized features and the cost of manipulation
# manipulation_cost = cost_function(optimized_features, x_0)
# print(f"\nOptimized features: {optimized_features}")
# print(f"Cost of manipulation: {manipulation_cost}")

# manipulation_cost_perceived = cost_function(optimized_features_perceived, x_0)
# print(f"\nOptimized features (perceived): {optimized_features_perceived}")
# print(f"Cost of manipulation (perceived): {manipulation_cost_perceived}")

In [141]:
def update_dataset(X, y, beta, threshold, R, mode='actual', gamma=0.5):
    """
    Update the dataset by optimizing features for agents classified as 0.

    Every row whose score ``row @ beta + threshold`` is not strictly positive
    (the same rule as ``classifier``) is passed to ``agent_optimization``
    and replaced by whatever that function returns. Rows already classified
    as 1 are left untouched. ``X`` itself is not modified.

    Args:
    X : pd.DataFrame
        Feature matrix.
    y : pd.Series
        Target variable.
    beta : np.array
        Weights of the classifier.
    threshold : float
        Threshold for classification.
    R : float
        Reward for being classified as positive.
    mode : str
        Mode of optimization ('actual' or 'perceived').
    gamma : float
        Gamma parameter for behavioral.

    Returns:
    pd.DataFrame
        Updated feature matrix with the same index and columns as ``X``;
        all columns are float so fractional optimised values fit.
    pd.Series
        Target variable, returned unchanged (the same object as ``y``).
    """
    # Work on a float array: integer feature columns could not otherwise hold
    # the (generally fractional) optimised values.
    original = X.to_numpy(dtype=float)
    updated = original.copy()

    # Score all agents at once instead of classifying row by row.
    scores = original @ np.asarray(beta, dtype=float) + threshold
    negative_rows = np.flatnonzero(~(scores > 0))

    for i in negative_rows:
        updated[i] = np.asarray(
            agent_optimization(original[i], beta, threshold, R, mode, gamma),
            dtype=float,
        )

    X_new = pd.DataFrame(updated, index=X.index, columns=X.columns)
    return X_new, y

def iterative_optimization(X, y, mode='actual', gamma=0.5, max_iter=20, R=2.0, tol=1e-4):
    """
    Iteratively optimize features for agents and retrain the model.

    Each round lets the agents respond to the current model (always starting
    from the original ``X``), refits the constrained logistic regression on
    the responded data, and stops once the model no longer changes.

    Args:
    X : pd.DataFrame
        Feature matrix.
    y : pd.Series
        Target variable.
    mode : str
        Mode of optimization ('actual' or 'perceived').
    gamma : float
        Gamma parameter for behavioral.
    max_iter : int
        Maximum number of iterations.
    R : float
        Reward for being classified as positive.
    tol : float
        Tolerance for convergence. The weights must move by less than
        ``tol`` (norm-2) and the threshold by less than
        ``tol * max(1, |threshold|)`` between consecutive fits.

    Returns:
    np.array
        Final weights of the logistic regression model.
    float
        Final threshold for classification.
    """
    beta, threshold = logistic_regression_with_sum_constraint(X, y)

    for iteration in range(max_iter):
        X_new, y_new = update_dataset(X, y, beta, threshold, R, mode, gamma)
        beta_new, threshold_new = logistic_regression_with_sum_constraint(X_new, y_new)

        beta_shift = np.linalg.norm(beta_new - beta)
        threshold_shift = abs(threshold_new - threshold)
        # The threshold can be orders of magnitude larger than the weights
        # (which sum to 1), so its change is judged relative to its size.
        threshold_scale = max(1.0, abs(threshold))

        # Always keep the most recent fit, including on the converging round.
        beta, threshold = beta_new, threshold_new

        # The decision boundary depends on both the weights and the
        # threshold, so both have to settle before declaring convergence.
        if beta_shift < tol and threshold_shift < tol * threshold_scale:
            print(f"Converged after {iteration + 1} iterations.")
            break
    else:
        print(f"Did not converge within {max_iter} iterations.")

    return beta, threshold

In [143]:
# # Run the iterative optimization process with perceived weights
# beta_final_perceived, threshold_final_perceived = iterative_optimization(X, y, mode='perceived')

# # Print the final weights and threshold
# print(f"Final weights (perceived): {beta_final_perceived}")
# print(f"Final threshold (perceived): {threshold_final_perceived}")

# # Run the iterative optimization process with actual weights
# beta_final_actual, threshold_final_actual = iterative_optimization(X, y, mode='actual')

# # Print the final weights and threshold
# print(f"Final weights (actual): {beta_final_actual}")
# print(f"Final threshold (actual): {threshold_final_actual}")