In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset (read from the current working directory)
term_life = pd.read_csv('TermLife.csv')

# Holdings columns used to decide whether life insurance was purchased.
# PURCHASED is computed directly from them, so they are excluded from the
# feature matrix further down: leaving them in would let the model read the
# label straight from its inputs (target leakage).
insurance_columns = ['FACE', 'FACECVLIFEPOLICIES', 'CASHCVLIFEPOLICIES', 'BORROWCVLIFEPOL', 'NETVALUE']

# PURCHASED = 1 when the row-wise sum of the holdings columns is positive, 0 otherwise
term_life['PURCHASED'] = (term_life[insurance_columns].sum(axis=1) > 0).astype(int)

# Check the data types and the first few rows to understand the structure.
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would only append a stray "None" line.
term_life.info()
print(term_life.head())

# Variable roles
binary_vars = ['GENDER', 'SGENDER', 'BORROWCVLIFEPOL', 'NETVALUE']  # Binary variables
nominal_vars = ['MARSTAT', 'ETHNICITY', 'SMARSTAT']  # Nominal variables
target_vars = ['PURCHASED']

# Columns that must never act as predictors: the target and everything it is derived from
leakage_columns = target_vars + insurance_columns

# Every remaining column that is neither binary nor nominal is treated as continuous
non_continuous = set(binary_vars + nominal_vars + leakage_columns)
continuous_vars = [col for col in term_life.columns if col not in non_continuous]

# One-hot encode the nominal variables (first level dropped as the reference category).
# This happens BEFORE the category conversion below, so the encoded frame keeps
# the original dtypes of the binary columns.
term_life_encoded = pd.get_dummies(term_life, columns=nominal_vars, drop_first=True)

# Mark binary and nominal variables as 'category' on the raw frame.
# Only term_life is affected; term_life_encoded was already built above.
for var in binary_vars + nominal_vars:
    term_life[var] = term_life[var].astype('category')

# Separate features and target variable; the leakage columns are removed from the features
X = term_life_encoded.drop(columns=leakage_columns)  # Features
y = term_life_encoded['PURCHASED']  # Target variable (binary)

# Split the data into train and test sets (70/30 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the continuous variables only. The scaler is fitted on the
# training rows and merely applied to the test rows, so no test-set
# statistics enter the transformation.
scaler = StandardScaler()
X_train_cont_scaled = scaler.fit_transform(X_train[continuous_vars])
X_test_cont_scaled = scaler.transform(X_test[continuous_vars])

# Combine scaled continuous variables with the unscaled remaining columns.
# The original row index is re-attached so pd.concat aligns rows correctly.
X_train_scaled = pd.concat(
    [
        pd.DataFrame(X_train_cont_scaled, columns=continuous_vars, index=X_train.index),
        X_train.drop(columns=continuous_vars),
    ],
    axis=1,
)

X_test_scaled = pd.concat(
    [
        pd.DataFrame(X_test_cont_scaled, columns=continuous_vars, index=X_test.index),
        X_test.drop(columns=continuous_vars),
    ],
    axis=1,
)

# Print results
print(term_life_encoded.head())
print(X_train_scaled.head())
print(y_train.head())

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Assemble the frame used for the VIF check: the scaled training features,
# arranged in the column order of X_train.
X_train_with_constant = pd.DataFrame(X_train_scaled, columns=X_train.columns)

# Indicator columns produced by pd.get_dummies are bool by default in
# pandas >= 2.0, and select_dtypes(exclude=['number']) classifies bool as
# non-numeric. Cast them to int first so the dummies stay in the design
# matrix instead of being silently dropped from the VIF computation.
bool_cols = X_train_with_constant.select_dtypes(include=['bool']).columns
X_train_with_constant = X_train_with_constant.astype({col: int for col in bool_cols})

# Anything still non-numeric cannot enter the VIF regressions; report and drop it
non_numeric_cols = X_train_with_constant.select_dtypes(exclude=['number']).columns
print("Non-numeric columns:", non_numeric_cols)

if len(non_numeric_cols) > 0:
    X_train_with_constant = X_train_with_constant.drop(columns=non_numeric_cols)

# Add a constant column so each auxiliary VIF regression includes an intercept
X_train_with_constant['Intercept'] = 1

# Convert to a float array once instead of once per feature
design_matrix = X_train_with_constant.to_numpy(dtype=float)

# One VIF per column of the design matrix (the Intercept row is included here)
vif_data = pd.DataFrame({
    "Feature": X_train_with_constant.columns,
    "VIF": [variance_inflation_factor(design_matrix, i) for i in range(design_matrix.shape[1])],
})

# Display the VIF values
print(vif_data)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score

# Re-express the scaled train/test frames in the column order of X_train / X_test
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# The Intercept row is bookkeeping from the VIF step, not a candidate feature
vif_data = vif_data.loc[vif_data['Feature'].ne('Intercept')]

# Features whose VIF exceeds 5 (an infinite VIF satisfies this as well)
high_vif_vars = vif_data.loc[vif_data['VIF'].gt(5), 'Feature'].tolist()

# Remove the high-VIF features from both the training and the test design
X_train_reduced = X_train_scaled_df.drop(high_vif_vars, axis=1)
X_test_reduced = X_test_scaled_df.drop(high_vif_vars, axis=1)

# Plain (unpenalized) logistic regression
logistic_model = LogisticRegression(solver='lbfgs', penalty=None, max_iter=1000)
logistic_model.fit(X_train_reduced, y_train)

# Coefficient table: one row per retained feature (coef_[0] is the single
# coefficient row of a binary model)
coefficients = pd.DataFrame(
    {
        'Feature': X_train_reduced.columns,
        'Coefficient': logistic_model.coef_[0],
    }
)

print(coefficients)

# Hard class predictions on the held-out rows
y_pred_logistic = logistic_model.predict(X_test_reduced)

# Log loss on each split, computed from the predicted class probabilities
train_probabilities = logistic_model.predict_proba(X_train_reduced)
test_probabilities = logistic_model.predict_proba(X_test_reduced)
log_loss_train = log_loss(y_train, train_probabilities)
log_loss_test = log_loss(y_test, test_probabilities)

# Share of correctly classified test rows
accuracy = accuracy_score(y_test, y_pred_logistic)

print("Training Log Loss (Deviance):", 2 * log_loss_train)
print("Test Log Loss (Deviance):", 2 * log_loss_test)
print("Accuracy:", accuracy)

import numpy as np

# Compute null deviance
def null_deviance(y):
    """Return the total null deviance of a binary 0/1 target.

    The null model predicts the same probability ``p = mean(y)`` for every
    observation. Its deviance is ``-2 * N * (p*log(p) + (1-p)*log(1-p))``,
    i.e. a sum over all N observations, not a per-observation average.

    Parameters
    ----------
    y : array-like of 0/1 values
        Observed labels (list, NumPy array, pandas Series, ...).

    Returns
    -------
    float
        Total null deviance. 0.0 is returned for an empty input and for a
        target that contains a single class, because the constant model then
        fits perfectly; the raw formula would evaluate ``0 * log(0)`` and
        produce NaN.
    """
    labels = np.asarray(y, dtype=float).ravel()
    N = labels.size
    if N == 0:
        return 0.0

    p = labels.mean()
    # Degenerate target (all zeros or all ones): log(0) would yield NaN below
    if p <= 0.0 or p >= 1.0:
        return 0.0

    null_log_likelihood = N * (np.log(p) * p + np.log(1 - p) * (1 - p))
    return -2 * null_log_likelihood  # Null Deviance

# Deviance per observation: log_loss averages over the samples by default,
# so 2 * log_loss is the MEAN deviance, not the total.
deviance_train = 2 * log_loss_train
deviance_test = 2 * log_loss_test

# Null deviance for training and test sets.
# null_deviance() multiplies by N, i.e. it returns the TOTAL null deviance.
null_deviance_train = null_deviance(y_train)
null_deviance_test = null_deviance(y_test)

# Compute R^2 = 1 - deviance / null deviance for training and test sets.
# Both terms must be on the same scale, so the mean model deviance is
# multiplied by the number of observations before forming the ratio.
# Without this the ratio is too small by a factor of N and R^2 sits
# near 1 regardless of model quality.
r2_train = 1 - (deviance_train * len(y_train)) / null_deviance_train
r2_test = 1 - (deviance_test * len(y_test)) / null_deviance_test

print("Training R²:", r2_train)
print("Test R²:", r2_test)



In [None]:
from sklearn.linear_model import LogisticRegressionCV

# Lasso Logistic Regression (L1), penalty strength C chosen by 10-fold CV.
# The 'saga' solver shuffles the data, so random_state is fixed to make the
# fit reproducible (same seed as the train/test split).
lasso_model = LogisticRegressionCV(
    cv=10, penalty='l1', solver='saga', max_iter=10000, random_state=42
).fit(X_train_reduced, y_train)
coefficient_lasso = pd.DataFrame({
    'Feature': X_train_reduced.columns,  # Feature names
    'Coefficient': lasso_model.coef_[0]  # Coefficients (assuming binary classification)
})
# Label corrected: this table holds the LASSO coefficients, not the Ridge ones
print("LASSO Coefficients:", coefficient_lasso)
print("Best C for LASSO:", lasso_model.C_)

# Ridge Logistic Regression (L2), same CV setup and seed as the LASSO fit
ridge_model = LogisticRegressionCV(
    cv=10, penalty='l2', solver='saga', max_iter=10000, random_state=42
).fit(X_train_reduced, y_train)
coefficient_ridge = pd.DataFrame({
    'Feature': X_train_reduced.columns,  # Feature names
    'Coefficient': ridge_model.coef_[0]  # Coefficients (assuming binary classification)
})
print("Ridge Coefficients:", coefficient_ridge)
print("Best C for Ridge:", ridge_model.C_)



from sklearn.metrics import accuracy_score, precision_score, log_loss, r2_score

# Predicted probability of the positive class on the test data
y_test_pred_probs_lasso = lasso_model.predict_proba(X_test_reduced)[:, 1]
y_test_pred_probs_ridge = ridge_model.predict_proba(X_test_reduced)[:, 1]

# Deviance per observation: 2 x the mean negative log-likelihood
# (log_loss averages over the samples by default)
deviance_lasso = 2 * log_loss(y_test, y_test_pred_probs_lasso)
deviance_ridge = 2 * log_loss(y_test, y_test_pred_probs_ridge)

# Predict class labels for accuracy and precision calculation
y_test_pred_lasso = lasso_model.predict(X_test_reduced)
y_test_pred_ridge = ridge_model.predict(X_test_reduced)

# Calculate accuracy and precision for LASSO and Ridge
accuracy_lasso = accuracy_score(y_test, y_test_pred_lasso)
accuracy_ridge = accuracy_score(y_test, y_test_pred_ridge)
precision_lasso = precision_score(y_test, y_test_pred_lasso)
precision_ridge = precision_score(y_test, y_test_pred_ridge)

# Out-of-sample R^2 for LASSO and Ridge, defined as 1 - deviance / null deviance.
# r2_score on probabilities would give a squared-error R^2, which is not
# comparable to a deviance-based R^2. The null model predicts the test-set
# share of positives for every row; its deviance is computed with the same
# log_loss call so both terms are per-observation means.
null_probs_test = [float(y_test.mean())] * len(y_test)
null_mean_deviance_test = 2 * log_loss(y_test, null_probs_test)
r2_lasso = 1 - deviance_lasso / null_mean_deviance_test
r2_ridge = 1 - deviance_ridge / null_mean_deviance_test

performance_metrics = {
    'Deviance LASSO': deviance_lasso,
    'Deviance Ridge': deviance_ridge,
    'Accuracy LASSO': accuracy_lasso,
    'Accuracy Ridge': accuracy_ridge,
    'Precision LASSO': precision_lasso,
    'Precision Ridge': precision_ridge,
    'Out-of-sample R^2 LASSO': r2_lasso,
    'Out-of-sample R^2 Ridge': r2_ridge
}

print(performance_metrics)


In [None]:
# Plot learning curves for scikit-learn models (Logistic Regression, Lasso, Ridge)
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.metrics import log_loss

# Function to plot learning curves
def plot_learning_curve(estimator, X, y, title, cv = 10, scoring = 'neg_log_loss'):
    """Plot cross-validated training and validation curves against training size.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Model that learning_curve re-fits on each training subset.
    X, y : array-like
        Features and target handed to learning_curve.
    title : str
        Figure title.
    cv : int or CV splitter, default 10
        Cross-validation strategy forwarded to learning_curve.
    scoring : str or callable, default 'neg_log_loss'
        Scorer forwarded to learning_curve. For scorer names that start with
        'neg_' the sign is flipped so the plot shows the positive loss;
        any other scorer is plotted as returned.

    Returns
    -------
    None
        The figure is displayed with plt.show().
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, scoring=scoring, n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10)
    )

    # Only "neg_*" scorers are negated losses; flipping any other scorer
    # would plot it upside down.
    is_negated_loss = isinstance(scoring, str) and scoring.startswith('neg_')
    sign = -1.0 if is_negated_loss else 1.0

    if is_negated_loss:
        curve_kind = "Loss"
        y_label = scoring[len('neg_'):].replace('_', ' ').title()  # 'neg_log_loss' -> 'Log Loss'
    else:
        curve_kind = "Score"
        y_label = scoring.replace('_', ' ').title() if isinstance(scoring, str) else "Score"

    # Mean and spread across the CV folds (the spread is unaffected by the sign flip)
    train_mean = sign * train_scores.mean(axis=1)
    train_std = train_scores.std(axis=1)
    test_mean = sign * test_scores.mean(axis=1)
    test_std = test_scores.std(axis=1)

    fig, ax = plt.subplots()
    ax.plot(train_sizes, train_mean, 'o-', color="r", label="Training " + curve_kind)
    ax.plot(train_sizes, test_mean, 'o-', color="g", label="Validation " + curve_kind)

    # Shaded bands: +/- one standard deviation across folds
    ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="r", alpha=0.1)
    ax.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="g", alpha=0.1)

    ax.set_title(title)
    ax.set_xlabel("Training Size")
    ax.set_ylabel(y_label)
    ax.legend(loc="best")
    ax.grid(True)
    plt.show()
    
# One learning curve per fitted model, drawn in this order:
# unregularized logistic regression, LASSO (L1), Ridge (L2)
learning_curve_specs = [
    (logistic_model, "Logistic Regression Learning Curve"),
    (lasso_model, "LASSO Learning Curve"),
    (ridge_model, "Ridge Learning Curve"),
]

for fitted_model, curve_title in learning_curve_specs:
    plot_learning_curve(fitted_model, X_train_reduced, y_train, title=curve_title)

