In [43]:
pip install statsmodels


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [41]:
import scipy.stats
import statsmodels.api as sm
import os
import pandas as pd
import numpy as np
import stats

In [28]:
# Set random seed.
np.random.seed(42)

# Read the processed CSV file.
df = pd.read_csv("ms_data.csv")

# Convert visit_date to datetime.
df['visit_date'] = pd.to_datetime(df['visit_date'])

#  Sort by patient_id and visit_date.
df = df.sort_values(by = ['patient_id', 'visit_date'])

# Read insurance types from `insurance.lst`.
with open('insurance.lst', 'r') as f:
    insurance_types = [line.strip() for line in f.readlines()]

# Randomly assign (but keep consistent per patient_id).
unique_patients = df['patient_id'].unique()
patient_insurance_map = {patient_id: np.random.choice(insurance_types) for patient_id in unique_patients}
df['insurance_type'] = df['patient_id'].map(patient_insurance_map)

# Generate visit costs based on insurance type. Different plans have different effects on cost.
base_costs = {'Medicare': 100,
    'Medicaid': 200,
    'Private': 50,
    'Other': 500
}

# Add random variation.
variation_factor = 0.2 # 20% variation
df['visit_cost'] = df['insurance_type'].map(base_costs) * (1 + np.random.uniform(-variation_factor, variation_factor, len(df))).round(3)

# Set appropriate data types.
df['patient_id'] = df['patient_id'].astype(str)
df['education_level'] = df['education_level'].astype(str)
df['insurance_type'] = df['insurance_type'].astype(str)

df.head()

Unnamed: 0,patient_id,visit_date,age,education_level,walking_speed,insurance_type,visit_cost
0,P0001,2020-01-23,34.28,Bachelors,4.4,Private,53.95
1,P0001,2020-04-16,34.51,Bachelors,4.19,Private,50.7
2,P0001,2020-07-03,34.72,Bachelors,4.71,Private,46.2
3,P0001,2020-10-15,35.0,Bachelors,4.86,Private,56.3
4,P0001,2020-12-29,35.21,Bachelors,4.5,Private,53.7


In [32]:
df['visit_date'] = pd.to_datetime(df['visit_date'])
df['patient_id'] = df['patient_id'].astype(str)
df['education_level'] = df['education_level'].astype(str)
df['age'] = df['age'].astype(float)
df['walking_speed'] = df['walking_speed'].astype(float)
df['visit_cost'] = df['visit_cost'].astype(float)

print(df.dtypes)

print(f"Number of missing values in each column: {df.isnull().sum()}")
print(f"Number of rows with at least one missing value: {df.isnull().any(axis=1).sum()}")

patient_id                 object
visit_date         datetime64[ns]
age                       float64
education_level            object
walking_speed             float64
insurance_type             object
visit_cost                float64
dtype: object
Number of missing values in each column: patient_id         0
visit_date         0
age                0
education_level    0
walking_speed      0
insurance_type     0
visit_cost         0
dtype: int64
Number of rows with at least one missing value: 0


In [33]:
# 1. Analyze walking speed:
#    - Multiple regression with education and age (report coeffcients and confidence intervals)
#    - Account for repeated measures
#    - Test for significant trends

# Prepare the data for a simple multiple regression with age and education_level as predictors
X = df[['age']]
X = pd.get_dummies(df['education_level'], drop_first=True).join(X)  # Convert education_level to dummy variables
X = sm.add_constant(X)  # Add intercept

### print(X.dtypes)
### print(X.head())

y = df['walking_speed']

# Ensure that X and y are purely numeric arrays
X = X.astype(float) 
y = y.astype(float)

# Fit an OLS model
est = sm.OLS(y, X).fit()

# Display the summary of the model
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:          walking_speed   R-squared:                       0.807
Model:                            OLS   Adj. R-squared:                  0.807
Method:                 Least Squares   F-statistic:                 1.613e+04
Date:                Tue, 12 Nov 2024   Prob (F-statistic):               0.00
Time:                        20:54:58   Log-Likelihood:                -5411.7
No. Observations:               15431   AIC:                         1.083e+04
Df Residuals:                   15426   BIC:                         1.087e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            5.5992      0.009    601.362   

In [35]:
print(est.params) # Coefficients 
print(est.rsquared) # R-squared 
print(est.pvalues) # P-values
print(est.conf_int()) # Confidence Intervals

const           5.599189
Graduate        0.415243
High School    -0.792317
Some College   -0.390325
age            -0.030138
dtype: float64
0.8069989763375707
const           0.0
Graduate        0.0
High School     0.0
Some College    0.0
age             0.0
dtype: float64
                     0         1
const         5.580939  5.617440
Graduate      0.400066  0.430419
High School  -0.807571 -0.777063
Some College -0.405367 -0.375283
age          -0.030441 -0.029836


In [44]:
# Test for significant correlation between age and walking speed
correlation, p_value = scipy.stats.pearsonr(df['age'], df['walking_speed'])
print(f"\nCorrelation between age and walking speed: {correlation:.3f}")
print(f"P-value: {p_value:.3g}")

if p_value < 0.05:
    print("The correlation is statistically significant.")
else:
    print("The correlation is not statistically significant.")


Correlation between age and walking speed: -0.700
P-value: 0
The correlation is statistically significant.


In [None]:
# 2. Analyze costs:
#    - Simple analysis of insurance type effect
#    - Box plots and basic statistics (report coeffcients and confidence intervals)
#    - Calculate effect sizes

In [None]:
# 3. Advanced analysis:
#    - Education age interaction effects on walking speed
#    - Control for relevant confounders
#    - Report key statistics and p-values (report coeffcients and confidence intervals)