In [1]:
# Import Libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.metrics import log_loss
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Load the exam cases from the CSV file (path is relative to the notebook)
df = pd.read_csv(filepath_or_buffer='./data/Exam Cases.csv')

# Summarise column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ID              10000 non-null  int64 
 1   Antidepressant  10000 non-null  object
 2   Female          10000 non-null  object
 3   Age>30          10000 non-null  object
 4   Weight Gain     10000 non-null  object
 5   Remission       10000 non-null  object
 6   Psychosis       10000 non-null  object
dtypes: int64(1), object(6)
memory usage: 547.0+ KB


# Data Preparation

In [3]:
# Drop ID column.
# errors='ignore' keeps this cell idempotent: `df` is reassigned in place, so
# re-running the cell after ID has already been removed would otherwise raise
# a KeyError.
df = df.drop(columns=['ID'], errors='ignore')

In [4]:
# Recode Yes to 1 and No to 0
def recode(value):
    """Return 1 when ``value`` equals the string 'Yes', otherwise 0.

    Any value other than 'Yes' (not only 'No') is mapped to 0.
    """
    return 1 if value == 'Yes' else 0

# Apply the function to every cell of the DataFrame.
# DataFrame.applymap is deprecated since pandas 2.1, so each column is mapped
# element-wise with Series.map instead; this works on old and new pandas.
# NOTE(review): recode sends every value other than 'Yes' to 0, so typos or
# missing values would silently become 0 -- confirm the raw data only holds
# 'Yes'/'No'.
df = df.apply(lambda column: column.map(recode))

# Convert columns to numeric
df = df.apply(pd.to_numeric)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Antidepressant  10000 non-null  int64
 1   Female          10000 non-null  int64
 2   Age>30          10000 non-null  int64
 3   Weight Gain     10000 non-null  int64
 4   Remission       10000 non-null  int64
 5   Psychosis       10000 non-null  int64
dtypes: int64(6)
memory usage: 468.9 KB


In [5]:
# Put the columns in the order used by the rest of the analysis
# (the later cells only use columns listed before each target as predictors)
cols_in_order = [
    'Age>30', 'Female', 'Psychosis',
    'Antidepressant', 'Weight Gain', 'Remission',
]

df = df.loc[:, cols_in_order]
df.head()

Unnamed: 0,Age>30,Female,Psychosis,Antidepressant,Weight Gain,Remission
0,1,1,0,1,1,1
1,1,1,0,1,0,1
2,1,1,0,1,0,0
3,1,0,1,1,0,1
4,0,1,0,1,1,1


# Data Analysis for the Exam

## Predictors of Remission

In [6]:
# First we need to see if Antidepressant is a predictor of Remission

# Dependent variable: the Remission column
y = df.loc[:, 'Remission']

# Independent variables: every column except Remission
X = df.drop(columns='Remission')

In [7]:
# Lasso regression whose penalty strength is chosen by 5-fold cross-validation
lasso_cv = LassoCV(cv=5)

# Fit on all predictors; the fitted estimator is the cell's displayed output
lasso_cv.fit(X=X, y=y)

In [8]:
# Print the intercept
print("Intercept:", lasso_cv.intercept_)

# Keep features whose absolute coefficient is greater than 0.001
# (i.e. drop the terms Lasso shrank to essentially zero)
selected_features = [(feature, coef) for feature, coef in zip(X.columns, lasso_cv.coef_) if abs(coef) > 0.001]

# Print selected feature names and coefficients
print("\nSelected features with coefficients:")
for feature, coef in selected_features:
    print(f"{feature}: {coef}")

# Print the penalty strength (alpha, a.k.a. lambda) selected by LassoCV
print("\nMinimum lambda (selected alpha):", lasso_cv.alpha_)

# Calculate McFadden's R-squared
# LassoCV is a linear regression, so predict() returns a linear score that is
# not guaranteed to lie in [0, 1]. Clip it into the open unit interval so that
# log_loss always receives valid probabilities instead of relying on
# scikit-learn's version-dependent handling of out-of-range values.
eps = np.finfo(float).eps
y_pred = np.clip(lasso_cv.predict(X), eps, 1 - eps)

# Log-likelihood of the model. log_loss is the mean negative log-likelihood;
# the 1/n factor cancels in the ratio below.
log_likelihood_model = -log_loss(y, y_pred)

# Log-likelihood of the null model (every case gets the observed base rate)
null_model_probs = np.full_like(y_pred, y.mean())
log_likelihood_null_model = -log_loss(y, null_model_probs)

# McFadden's R-squared = 1 - LL(model) / LL(null)
mcfadden_r2 = 1 - (log_likelihood_model / log_likelihood_null_model)

print("\nMcFadden's R-squared:", mcfadden_r2)

Intercept: 0.5953912252475335

Selected features with coefficients:
Age>30: 0.02083675185567725
Female: 0.020536570962658613
Antidepressant: 0.0944106610848428
Weight Gain: 0.03679032005612803

Minimum lambda (selected alpha): 0.0009883722112735917

McFadden's R-squared: 0.012614535876050659


## Predictors for Age

In [None]:
# Since Age happened first, there is no regression for this. This will be a root node

## Predictors for Gender

In [None]:
# y is Female
y = df['Female']

# Limit columns to those that precede y
X_cols = ['Age>30'
          ]
X = df[X_cols]

# Create LassoCV model (penalty strength chosen by 5-fold cross-validation)
lasso_cv = LassoCV(cv=5)

# Fit the cross-validated model
lasso_cv.fit(X, y)

# Print the intercept
print("Intercept:", lasso_cv.intercept_)

# Keep features whose absolute coefficient is greater than 0.01
selected_features = [(feature, coef) for feature, coef in zip(X.columns, lasso_cv.coef_) if abs(coef) > 0.01]

# Print selected feature names and coefficients
print("\nSelected features with coefficients:")
for feature, coef in selected_features:
    print(f"{feature}: {coef}")

# Print the penalty strength (alpha, a.k.a. lambda) selected by LassoCV
print("\nMinimum lambda (selected alpha):", lasso_cv.alpha_)

# Calculate McFadden's R-squared
# LassoCV is a linear regression, so predict() returns a linear score that is
# not guaranteed to lie in [0, 1]. Clip it into the open unit interval so that
# log_loss always receives valid probabilities instead of relying on
# scikit-learn's version-dependent handling of out-of-range values.
eps = np.finfo(float).eps
y_pred = np.clip(lasso_cv.predict(X), eps, 1 - eps)

# Log-likelihood of the model. log_loss is the mean negative log-likelihood;
# the 1/n factor cancels in the ratio below.
log_likelihood_model = -log_loss(y, y_pred)

# Log-likelihood of the null model (every case gets the observed base rate)
null_model_probs = np.full_like(y_pred, y.mean())
log_likelihood_null_model = -log_loss(y, null_model_probs)

# McFadden's R-squared = 1 - LL(model) / LL(null)
mcfadden_r2 = 1 - (log_likelihood_model / log_likelihood_null_model)

print("\nMcFadden's R-squared:", mcfadden_r2)

## Predictors for Antidepressant

In [None]:
# y is Antidepressant
y = df['Antidepressant']

# Limit columns to those that precede y
X_cols = ['Age>30',
          'Female',
          'Psychosis'
          ]
X = df[X_cols]

# Create LassoCV model (penalty strength chosen by 5-fold cross-validation)
lasso_cv = LassoCV(cv=5)

# Fit the cross-validated model
lasso_cv.fit(X, y)

# Print the intercept
print("Intercept:", lasso_cv.intercept_)

# Keep features whose absolute coefficient is greater than 0.01
selected_features = [(feature, coef) for feature, coef in zip(X.columns, lasso_cv.coef_) if abs(coef) > 0.01]

# Print selected feature names and coefficients
print("\nSelected features with coefficients:")
for feature, coef in selected_features:
    print(f"{feature}: {coef}")

# Print the penalty strength (alpha, a.k.a. lambda) selected by LassoCV
print("\nMinimum lambda (selected alpha):", lasso_cv.alpha_)

# Calculate McFadden's R-squared
# LassoCV is a linear regression, so predict() returns a linear score that is
# not guaranteed to lie in [0, 1]. Clip it into the open unit interval so that
# log_loss always receives valid probabilities instead of relying on
# scikit-learn's version-dependent handling of out-of-range values.
eps = np.finfo(float).eps
y_pred = np.clip(lasso_cv.predict(X), eps, 1 - eps)

# Log-likelihood of the model. log_loss is the mean negative log-likelihood;
# the 1/n factor cancels in the ratio below.
log_likelihood_model = -log_loss(y, y_pred)

# Log-likelihood of the null model (every case gets the observed base rate)
null_model_probs = np.full_like(y_pred, y.mean())
log_likelihood_null_model = -log_loss(y, null_model_probs)

# McFadden's R-squared = 1 - LL(model) / LL(null)
mcfadden_r2 = 1 - (log_likelihood_model / log_likelihood_null_model)

print("\nMcFadden's R-squared:", mcfadden_r2)

## Predictors for Weight Gain

In [None]:
# y is Weight Gain
y = df['Weight Gain']

# Limit columns to those that precede y
X_cols = ['Age>30',
          'Female',
          'Psychosis',
          'Antidepressant'
          ]
X = df[X_cols]

# Create LassoCV model (penalty strength chosen by 5-fold cross-validation)
lasso_cv = LassoCV(cv=5)

# Fit the cross-validated model
lasso_cv.fit(X, y)

# Print the intercept
print("Intercept:", lasso_cv.intercept_)

# Keep features whose absolute coefficient is greater than 0.01
selected_features = [(feature, coef) for feature, coef in zip(X.columns, lasso_cv.coef_) if abs(coef) > 0.01]

# Print selected feature names and coefficients
print("\nSelected features with coefficients:")
for feature, coef in selected_features:
    print(f"{feature}: {coef}")

# Print the penalty strength (alpha, a.k.a. lambda) selected by LassoCV
print("\nMinimum lambda (selected alpha):", lasso_cv.alpha_)

# Calculate McFadden's R-squared
# LassoCV is a linear regression, so predict() returns a linear score that is
# not guaranteed to lie in [0, 1]. Clip it into the open unit interval so that
# log_loss always receives valid probabilities instead of relying on
# scikit-learn's version-dependent handling of out-of-range values.
eps = np.finfo(float).eps
y_pred = np.clip(lasso_cv.predict(X), eps, 1 - eps)

# Log-likelihood of the model. log_loss is the mean negative log-likelihood;
# the 1/n factor cancels in the ratio below.
log_likelihood_model = -log_loss(y, y_pred)

# Log-likelihood of the null model (every case gets the observed base rate)
null_model_probs = np.full_like(y_pred, y.mean())
log_likelihood_null_model = -log_loss(y, null_model_probs)

# McFadden's R-squared = 1 - LL(model) / LL(null)
mcfadden_r2 = 1 - (log_likelihood_model / log_likelihood_null_model)

print("\nMcFadden's R-squared:", mcfadden_r2)

# Marginal Probabilities

In [None]:
# Marginal shares are reported as whole-number percentages of all rows
n_rows = len(df)

# Age: percentage of cases with Age>30 equal to 1
p_age = round(df['Age>30'].sum() / n_rows * 100)
print(f"p_age: {p_age}")

# Gender: percentage of cases with Female equal to 1
p_gender = round(df['Female'].sum() / n_rows * 100)
print(f"p_gender: {p_gender}")

# Counterfactual Model

In [None]:
# y is Weight Gain
y = df['Weight Gain']

# Limit columns to those that precede y.
# 'Antidepressant' is deliberately left out here: this is the counterfactual
# model without the treatment variable.
X_cols = ['Age>30',
          'Female',
          'Psychosis'
          ]
X = df[X_cols]

# Create LassoCV model (penalty strength chosen by 5-fold cross-validation)
lasso_cv = LassoCV(cv=5)

# Fit the cross-validated model
lasso_cv.fit(X, y)

# Print the intercept
print("Intercept:", lasso_cv.intercept_)

# Keep features whose absolute coefficient is greater than 0.01
selected_features = [(feature, coef) for feature, coef in zip(X.columns, lasso_cv.coef_) if abs(coef) > 0.01]

# Print selected feature names and coefficients
print("\nSelected features with coefficients:")
for feature, coef in selected_features:
    print(f"{feature}: {coef}")

# Print the penalty strength (alpha, a.k.a. lambda) selected by LassoCV
print("\nMinimum lambda (selected alpha):", lasso_cv.alpha_)

# Calculate McFadden's R-squared
# LassoCV is a linear regression, so predict() returns a linear score that is
# not guaranteed to lie in [0, 1]. Clip it into the open unit interval so that
# log_loss always receives valid probabilities instead of relying on
# scikit-learn's version-dependent handling of out-of-range values.
eps = np.finfo(float).eps
y_pred = np.clip(lasso_cv.predict(X), eps, 1 - eps)

# Log-likelihood of the model. log_loss is the mean negative log-likelihood;
# the 1/n factor cancels in the ratio below.
log_likelihood_model = -log_loss(y, y_pred)

# Log-likelihood of the null model (every case gets the observed base rate)
null_model_probs = np.full_like(y_pred, y.mean())
log_likelihood_null_model = -log_loss(y, null_model_probs)

# McFadden's R-squared = 1 - LL(model) / LL(null)
mcfadden_r2 = 1 - (log_likelihood_model / log_likelihood_null_model)

print("\nMcFadden's R-squared:", mcfadden_r2)

# Try interaction pairs

In [None]:
# Dependent variable: the Remission column
y = df.loc[:, 'Remission']

# Independent variables: every column except Remission
X = df.drop(columns='Remission')

In [None]:
# Pairwise interaction terms: performance and coefficients.
# degree=2 expands X into the original columns, their squares and all pairwise
# products (no three-way terms).
# NOTE(review): the columns are 0/1 indicators, so each squared term equals
# its original column; PolynomialFeatures(interaction_only=True) would avoid
# those duplicates -- confirm before changing the model.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Names of the expanded columns, in the same order as the columns of X_poly
# (and therefore as lasso_cv.coef_)
poly_feature_names = poly.get_feature_names_out(X.columns)

# Create LassoCV model (penalty strength chosen by 5-fold cross-validation)
lasso_cv = LassoCV(cv=5)

# Fit the cross-validated model on the expanded features
lasso_cv.fit(X_poly, y)

# Print the intercept
print("Intercept:", lasso_cv.intercept_)

# Keep features whose absolute coefficient is greater than 0.01.
# Pair coefficients with the expanded feature names: zipping with X.columns
# would stop after the original columns and silently drop every squared and
# interaction term from the report.
selected_features = [(feature, coef) for feature, coef in zip(poly_feature_names, lasso_cv.coef_) if abs(coef) > 0.01]

# Print selected feature names and coefficients
print("\nSelected features with coefficients:")
for feature, coef in selected_features:
    print(f"{feature}: {coef}")

# Print the penalty strength (alpha, a.k.a. lambda) selected by LassoCV
print("\nMinimum lambda (selected alpha):", lasso_cv.alpha_)

# Calculate McFadden's R-squared
# LassoCV is a linear regression, so predict() returns a linear score that is
# not guaranteed to lie in [0, 1]. Clip it into the open unit interval so that
# log_loss always receives valid probabilities instead of relying on
# scikit-learn's version-dependent handling of out-of-range values.
eps = np.finfo(float).eps
y_pred = np.clip(lasso_cv.predict(X_poly), eps, 1 - eps)

# Log-likelihood of the model. log_loss is the mean negative log-likelihood;
# the 1/n factor cancels in the ratio below.
log_likelihood_model = -log_loss(y, y_pred)

# Log-likelihood of the null model (every case gets the observed base rate)
null_model_probs = np.full_like(y_pred, y.mean())
log_likelihood_null_model = -log_loss(y, null_model_probs)

# McFadden's R-squared = 1 - LL(model) / LL(null)
mcfadden_r2 = 1 - (log_likelihood_model / log_likelihood_null_model)

print("\nMcFadden's R-squared:", mcfadden_r2)

In [None]:
# Note that adding interaction terms made the McFadden R2 lower