In [1]:
import scipy.stats
import statsmodels.api as sm
import os
import pandas as pd
import numpy as np

In [3]:
# Seed NumPy's global RNG so later random draws are reproducible.
np.random.seed(42)

# Load the processed CSV, parse visit_date as datetime, and order the rows
# by patient_id and then by visit_date.
df = (
    pd.read_csv("ms_data.csv")
    .assign(visit_date=lambda visits: pd.to_datetime(visits['visit_date']))
    .sort_values(by=['patient_id', 'visit_date'])
)

# Read insurance types from `insurance.lst`, one per line. Blank lines are
# skipped so an empty string can never be drawn as an insurance type.
with open('insurance.lst', 'r') as f:
    insurance_types = [line.strip() for line in f if line.strip()]

if not insurance_types:
    raise ValueError("insurance.lst contains no insurance types")

# Randomly assign one insurance type per patient (not per visit), so every
# row with the same patient_id carries the same type.
unique_patients = df['patient_id'].unique()
patient_insurance_map = {patient_id: np.random.choice(insurance_types) for patient_id in unique_patients}
df['insurance_type'] = df['patient_id'].map(patient_insurance_map)

# Generate visit costs based on insurance type: each plan has its own base cost.
base_costs = {'Medicare': 100,
    'Medicaid': 200,
    'Private': 50,
    'Other': 500
}

# Look up the base cost for every visit. `.map` yields NaN for any insurance
# type missing from `base_costs`, so fail loudly instead of silently
# producing NaN visit costs.
base_cost_per_visit = df['insurance_type'].map(base_costs)
if base_cost_per_visit.isna().any():
    unpriced_types = df.loc[base_cost_per_visit.isna(), 'insurance_type'].unique().tolist()
    raise ValueError(f"No base cost defined for insurance type(s): {unpriced_types}")

# Add random variation: one multiplier per visit, drawn uniformly from
# [1 - variation_factor, 1 + variation_factor) and rounded to 3 decimals.
variation_factor = 0.2 # 20% variation
cost_multiplier = (1 + np.random.uniform(-variation_factor, variation_factor, len(df))).round(3)

# Round the final cost to 2 decimals. With the base costs above and a
# 3-decimal multiplier the exact product has at most 2 decimals, so this only
# removes floating-point noise from the multiplication.
df['visit_cost'] = (base_cost_per_visit * cost_multiplier).round(2)

# Store patient_id, education_level and insurance_type as strings.
for text_column in ('patient_id', 'education_level', 'insurance_type'):
    df[text_column] = df[text_column].astype(str)

# Preview the first rows of the prepared frame.
df.head()

Unnamed: 0,patient_id,visit_date,age,education_level,walking_speed,insurance_type,visit_cost
0,P0001,2020-01-23,34.28,Bachelors,4.4,Private,53.95
1,P0001,2020-04-16,34.51,Bachelors,4.19,Private,50.7
2,P0001,2020-07-03,34.72,Bachelors,4.71,Private,46.2
3,P0001,2020-10-15,35.0,Bachelors,4.86,Private,56.3
4,P0001,2020-12-29,35.21,Bachelors,4.5,Private,53.7


In [None]:
# 1. Analyze walking speed:
#    - Multiple regression with education and age (report coefficients and confidence intervals)
#    - Account for repeated measures
#    - Test for significant trends

In [None]:
# 2. Analyze costs:
#    - Simple analysis of insurance type effect
#    - Box plots and basic statistics (report coefficients and confidence intervals)
#    - Calculate effect sizes

In [None]:
# 3. Advanced analysis:
#    - Education age interaction effects on walking speed
#    - Control for relevant confounders
#    - Report key statistics and p-values (report coefficients and confidence intervals)