# Logistic Regression Analysis

This notebook demonstrates how to analyze a fitted logistic regression model: it computes Wald-test p-values for the coefficients, and a likelihood-based (McFadden-style) pseudo R-squared together with its adjusted variant.


In [None]:
# Logistic Regression Analysis Notebook

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy.stats import chi2
from sklearn.utils.class_weight import compute_class_weight


### Defining the Logistic Regression Analyzer Class


In [None]:
class LogisticRegressionAnalyzer:
    """Fit a binary logistic regression and summarise its coefficients.

    The analyzer wraps a scikit-learn style classifier (anything exposing
    ``fit``, ``predict_proba``, ``intercept_``, ``coef_`` and ``classes_``)
    and derives Wald-test p-values plus likelihood-based pseudo R-squared
    statistics from the fitted model.

    Notes:
        The standard errors come from the inverse Fisher information evaluated
        at the fitted probabilities. That is exact only for an unpenalised
        maximum-likelihood fit; with a penalised model (scikit-learn's
        ``LogisticRegression`` penalises by default) the p-values should be
        read as approximations.
    """

    def __init__(self, model=None):
        """Store the classifier to analyse.

        Args:
            model: A classifier instance to use. When ``None``, a
                ``LogisticRegression()`` with default settings is created.
        """
        self.model = LogisticRegression() if model is None else model

    def fit(self, X, y):
        """Fit the wrapped model and remember the training data.

        Args:
            X: Feature matrix (DataFrame or 2-D array-like), one row per
                observation.
            y: Binary target labels, one per row of ``X``.
        """
        self.X = X
        self.y = y
        self.model.fit(X, y)

    def calculate_p_values(self):
        """Compute coefficient p-values and pseudo R-squared statistics.

        Returns:
            A ``(coef_df, r_squared, adj_r_squared)`` tuple. ``coef_df`` is a
            DataFrame with columns ``'Feature'``, ``'Coefficient'`` and
            ``'P-Value'`` whose first row is the intercept. If the
            computation fails (for example because the information matrix is
            singular or the model has not been fitted), the error is printed
            and ``(None, None, None)`` is returned instead.
        """
        try:
            return self._compute_statistics()
        except Exception as e:
            # Deliberate best-effort contract: report the problem and hand
            # back a sentinel triple rather than propagating the exception.
            print(f"An error occurred during p-value calculation: {e}")
            return None, None, None

    def _feature_names(self, n_features):
        """Return column labels from ``self.X``, or ``x0..xN`` for plain arrays."""
        columns = getattr(self.X, 'columns', None)
        if columns is not None:
            return list(columns)
        return [f"x{i}" for i in range(n_features)]

    def _compute_statistics(self):
        """Do the actual work of ``calculate_p_values``; raises on failure."""
        X = np.asarray(self.X, dtype=np.float64)
        if X.ndim != 2:
            raise ValueError("X must be two-dimensional (samples x features)")
        n, p = X.shape  # observations, predictors

        coefficients = np.hstack([
            np.ravel(self.model.intercept_),
            np.ravel(self.model.coef_),
        ])
        if coefficients.shape[0] != p + 1:
            raise ValueError(
                "expected one intercept and one coefficient per feature; "
                "only binary classification is supported"
            )

        # Probability of the second entry in classes_, i.e. the class that
        # the coefficients are expressed for.
        pred_probs = np.asarray(self.model.predict_proba(self.X))[:, 1]

        # Fisher information X' W X with W = diag(p * (1 - p)). Scaling the
        # columns of X' by the weights avoids building the dense n x n W.
        X_design = np.hstack([np.ones((n, 1)), X])
        weights = pred_probs * (1 - pred_probs)
        cov_matrix = np.linalg.inv((X_design.T * weights) @ X_design)
        standard_errors = np.sqrt(np.diag(cov_matrix))

        # Wald test: (coefficient / SE)^2 follows chi-square with 1 df.
        wald_stats = coefficients / standard_errors
        p_values = chi2.sf(wald_stats ** 2, df=1)

        coef_df = pd.DataFrame({
            'Feature': ['Intercept'] + self._feature_names(p),
            'Coefficient': coefficients,
            'P-Value': p_values,
        })

        y = np.ravel(np.asarray(self.y))
        if y.shape[0] != n:
            raise ValueError("X and y have different numbers of samples")
        is_positive = y == self.model.classes_[1]
        n_positive = int(np.count_nonzero(is_positive))
        if n_positive == 0 or n_positive == n:
            raise ValueError(
                "pseudo R-squared is undefined when y contains a single class"
            )

        # Log-likelihood of the observed labels: log(p) where the positive
        # class occurred, log(1 - p) elsewhere. Clip to keep the logs finite.
        eps = np.finfo(np.float64).eps
        clipped = np.clip(pred_probs, eps, 1 - eps)
        log_likelihood_fitted = np.sum(
            np.where(is_positive, np.log(clipped), np.log1p(-clipped))
        )

        # The intercept-only model predicts the positive-class proportion for
        # every row, so its log-likelihood has a closed form.
        base_rate = n_positive / n
        log_likelihood_null = (
            n_positive * np.log(base_rate)
            + (n - n_positive) * np.log(1 - base_rate)
        )

        r_squared = float(1 - log_likelihood_fitted / log_likelihood_null)
        residual_dof = n - p - 1
        if residual_dof > 0:
            adj_r_squared = 1 - (1 - r_squared) * (n - 1) / residual_dof
        else:
            # Not enough observations to adjust for the number of predictors.
            adj_r_squared = float('nan')

        return coef_df, r_squared, adj_r_squared


### Example Usage


In [None]:
# Load your data here. The CSV is expected to contain the feature columns
# plus a 'target' column holding the class labels.
data = pd.read_csv('../data/your_data.csv')
X = data.drop('target', axis=1)
y = data['target']

# Scale the features. fit_transform returns a plain NumPy array, so wrap the
# result back into a DataFrame to keep the column names (the analyzer uses
# them to label each coefficient) and the original row index.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

# Initialize the analyzer
analyzer = LogisticRegressionAnalyzer()
analyzer.fit(X_scaled, y)

# Calculate p-values and R-squared
p_values_df, r_squared, adj_r_squared = analyzer.calculate_p_values()

# calculate_p_values returns (None, None, None) after printing the error when
# the computation fails, so only report results when they exist.
if p_values_df is not None:
    print(p_values_df)
    print(f"R-squared: {r_squared}")
    print(f"Adjusted R-squared: {adj_r_squared}")
