In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Whole-sample training panel; its first column is discarded right after loading.
data = pd.read_csv('ss_train_shock.csv')
data = data.drop(columns=data.columns[0])

# Cluster-specific training files, summarised in the cells below.
High = pd.read_csv('High_train.csv')
Low = pd.read_csv('Low_train.csv')

In [None]:
# Summary statistics of the whole-sample training frame.
data.describe()

In [None]:
# Summary statistics of the frame read from High_train.csv.
High.describe()

In [None]:
# Summary statistics of the frame read from Low_train.csv.
Low.describe()

Whole Sample

In [None]:
# Column names available in the whole-sample frame.
data.columns

In [None]:
from fitter import Fitter, get_common_distributions, get_distributions
def find_best_distribution(data, variable_name):
    """Fit the common candidate distributions to one column of ``data``.

    Missing values in ``data[variable_name]`` are dropped before fitting.
    ``Fitter.summary()`` is called for its side effect; its return value is
    not used here.

    Returns the fitted ``Fitter`` object so the caller can query it
    (for example with ``get_best``).
    """
    observed = data[variable_name].dropna()
    fitter = Fitter(observed, distributions=get_common_distributions())
    fitter.fit()
    fitter.summary()
    return fitter

# Columns whose marginal distributions are fitted one at a time.
variable_names = ['HEALTH_CHANGE', 'HBP', 'DIABETES', 'CANCER',
       'LUNGS', 'HEART_ATTACK', 'STROKE', 'PSYCH', 'ARTHRITIS', 'OUT_PT',
       'DRUGS', 'HOME_CARE', 'SPECIAL_FAC', 'HOSPITAL', 'DOCTOR',
       'NURSING_HOME']

# Maps each column name to whatever Fitter.get_best returns for the
# 'sumsquare_error' criterion; the simulation step later reads this dict.
distribution_params = {}
for variable in variable_names:
    print(f"Analyzing {variable}")
    dist = find_best_distribution(data, variable)
    # Query the best fit once and reuse it for both storage and display.
    best_fit = dist.get_best(method='sumsquare_error')
    distribution_params[variable] = best_fit
    print(best_fit)

In [None]:
# Split the frame on the SHOCK2 indicator and fit SPEND_SS separately in each part.
shock_data = data.loc[data['SHOCK2'] == 1]
no_shock_data = data.loc[data['SHOCK2'] == 0]
variable_names = ['SPEND_SS']
spending = [shock_data, no_shock_data]

# Key 0 holds the fit for the shock rows, key 1 the fit for the no-shock rows.
spending_results = {}
for i, subset in enumerate(spending):
    fitted = find_best_distribution(subset, 'SPEND_SS')
    spending_results[i] = fitted
    print(fitted.get_best(method='sumsquare_error'))

In [None]:
from sklearn.linear_model import LogisticRegression
def modelling(data, predictors, response):
    """Fit a class-weighted logistic regression for the next-period ``response``.

    The target is the value of ``response`` on the following row of the same
    ``HHIDPN`` group, so the last row of every group has no target. Rows with
    a missing predictor or a missing target are excluded from the fit.

    Parameters
    ----------
    data : DataFrame containing ``HHIDPN``, ``response`` and the predictors.
        It is not modified.
    predictors : list of column names used as features.
    response : name of the column whose next-row value is predicted.

    Returns
    -------
    The fitted ``LogisticRegression`` estimator.
    """
    # assign() builds a new frame, so the helper column never leaks onto the
    # caller's DataFrame.
    with_target = data.assign(
        response_next=data.groupby('HHIDPN')[response].shift(-1)
    )

    data_clean = with_target.dropna(subset=predictors + ['response_next'])

    X = data_clean[predictors]
    y = data_clean['response_next']
    #class_weights = {0: 1, 1: 13}
    model = LogisticRegression(max_iter=1000, class_weight='balanced')
    model.fit(X, y)

    return model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Train / test panels passed to modelling_linear for the spending regressions.
train, test = (
    pd.read_csv(path) for path in ('ss_train_shock.csv', 'ss_test_shock.csv')
)
def modelling_linear(data, data_test, predictors, response):
    """Fit a linear regression for next-period ``response`` and summarise test residuals.

    Both frames are restricted to rows with ``AGE <= 90``. The target is the
    value of ``response`` on the following row of the same ``HHIDPN`` group.
    Rows with a missing predictor or a missing target are dropped from the
    training AND the test frame, so prediction never receives NaN features.
    Neither input frame is modified.

    Parameters
    ----------
    data : training DataFrame (needs ``AGE``, ``HHIDPN``, ``response``, predictors).
    data_test : test DataFrame with the same columns.
    predictors : list of feature column names.
    response : name of the column whose next-row value is predicted.

    Returns
    -------
    (model, mean_residuals, std_residuals) where residuals are
    ``observed - predicted`` on the cleaned test rows and the standard
    deviation uses NumPy's default ``ddof=0``.
    """
    required = predictors + ['response_next']

    def _prepare(frame):
        # Copy the filtered rows so that adding the target column neither
        # touches the caller's frame nor raises SettingWithCopyWarning.
        frame = frame[frame['AGE'] <= 90].copy()
        frame['response_next'] = frame.groupby('HHIDPN')[response].shift(-1)
        return frame.dropna(subset=required)

    train_clean = _prepare(data)
    test_clean = _prepare(data_test)

    model = LinearRegression()
    model.fit(train_clean[predictors], train_clean['response_next'])

    # Predict on the cleaned test set
    y_pred = model.predict(test_clean[predictors])

    # Residuals are observed minus predicted
    residuals = test_clean['response_next'] - y_pred
    mean_residuals = np.mean(residuals)
    std_residuals = np.std(residuals)

    return model, mean_residuals, std_residuals

In [None]:
# def modelling_linear(data, predictors, response):
#     data['response_next'] = data.groupby('HHIDPN')[response].shift(-1)
#     data_clean = data.dropna(subset=predictors + ['response_next'])
    
#     X = data_clean[predictors]
#     y = data_clean['response_next']
    
#     pipeline = Pipeline([
#         ('scaler', StandardScaler()),
#         ('model', LinearRegression())
#     ])
    
#     pipeline.fit(X, y)
    
#     return pipeline

In [None]:
# Features and target for the shock classifier, fit on the frame bound to `data`.
response = 'SHOCK2'
predictors = [
    'AGE', 'HEALTH_CHANGE', 'HBP', 'STROKE', 'ARTHRITIS',
    'DRUGS', 'SPECIAL_FAC', 'HOSPITAL', 'DOCTOR', 'NURSING_HOME',
]
shock_model = modelling(data=data, predictors=predictors, response=response)

In [None]:
# Spending regression later applied to simulated rows with SHOCK2 == 1.
# yes_mean / yes_resid receive the mean and standard deviation of its test residuals.
# NOTE(review): the model is fit on every row of `train`, not only on shock rows - confirm this is intended.
spend_response = 'SPEND_SS'
yes_pred = [
    'AGE', 'PSYCH', 'DRUGS', 'HOME_CARE', 'HOSPITAL', 'DOCTOR', 'NURSING_HOME',
]
yes_model, yes_mean, yes_resid = modelling_linear(
    data=train, data_test=test, predictors=yes_pred, response=spend_response
)

In [None]:
# Spending regression later applied to simulated rows with SHOCK2 == 0.
# no_mean / no_resid receive the mean and standard deviation of its test residuals.
# NOTE(review): the model is fit on every row of `train`, not only on no-shock rows - confirm this is intended.
no_pred = [
    'AGE', 'HEALTH_CHANGE', 'DRUGS', 'HOME_CARE', 'DOCTOR', 'NURSING_HOME',
]
no_model, no_mean, no_resid = modelling_linear(
    data=train, data_test=test, predictors=no_pred, response=spend_response
)

In [None]:
from scipy import stats

def simulate_individual_data(num_individuals, start_age, end_age, distribution_params):
    """Simulate one row per individual per age with independently drawn variables.

    Parameters
    ----------
    num_individuals : number of individuals; IDs run from 1 to this value.
    start_age, end_age : inclusive age range simulated for every individual.
    distribution_params : mapping ``variable -> {dist_name: params}`` where
        ``dist_name`` is an attribute of ``scipy.stats`` and ``params`` is a
        dict of keyword arguments for its ``rvs``. Only the first entry of
        each inner dict is used.

    Returns
    -------
    DataFrame with columns ``AGE``, ``HHIDPN`` and one column per variable,
    stacked over individuals with a fresh 0..n-1 index. An empty DataFrame is
    returned when ``num_individuals`` is 0.
    """
    ages = range(start_age, end_age + 1)
    n_ages = len(ages)

    # Resolve every variable's scipy distribution once, not once per person.
    samplers = []
    for variable, params_dict in distribution_params.items():
        dist_name, params = next(iter(params_dict.items()))
        samplers.append((variable, getattr(stats, dist_name), params))

    # Collect the per-person frames and concatenate a single time; growing a
    # DataFrame with concat inside the loop re-copies all earlier rows on
    # every iteration. Draws happen in the same person/variable order.
    frames = []
    for person_id in range(1, num_individuals + 1):
        columns = {'AGE': ages, 'HHIDPN': person_id}
        for variable, dist, params in samplers:
            columns[variable] = dist.rvs(size=n_ages, **params)
        frames.append(pd.DataFrame(columns))

    if not frames:
        # pd.concat rejects an empty list of frames
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Simulate 10,000 individuals, one row per age from 65 through 90.
num_individuals = 10_000

stacked_simulation = simulate_individual_data(
    num_individuals,
    start_age=65,
    end_age=90,
    distribution_params=distribution_params,
)
stacked_simulation

In [None]:
def add_shock_predictions(simulated_data, shock_model, predictors):
    """Attach the model's predicted shock indicator to ``simulated_data``.

    The predictions of ``shock_model`` on the ``predictors`` columns are
    written to a ``SHOCK2`` column on the frame that was passed in, and that
    same frame is returned.
    """
    features = simulated_data[predictors]
    simulated_data['SHOCK2'] = shock_model.predict(features)
    return simulated_data

# add_shock_predictions writes onto the frame it is given, so the returned
# frame is the same object as stacked_simulation.
simulated_data_with_shock = add_shock_predictions(
    simulated_data=stacked_simulation,
    shock_model=shock_model,
    predictors=predictors,
)

simulated_data_with_shock

In [None]:
def add_spend_ss_predictions(simulated_data, yes_model, no_model, yes_pred, no_pred):
    """Fill ``SPEND_SS`` on every simulated row with a noisy model prediction.

    Rows with ``SHOCK2 == 1`` are predicted by ``yes_model`` from the
    ``yes_pred`` columns, rows with ``SHOCK2 == 0`` by ``no_model`` from the
    ``no_pred`` columns; any other row keeps the initial value 0.0. Each
    prediction gets a normal draw added to it.

    The noise parameters are read from the module-level names ``yes_mean``,
    ``yes_resid``, ``no_mean`` and ``no_resid`` (the mean and standard
    deviation returned by ``modelling_linear``), which must exist when this
    function is called.

    The column is written onto the frame that was passed in, and that same
    frame is returned.
    """
    simulated_data['SPEND_SS'] = 0.0

    def _fill(row_mask, model, columns, resid_mean, resid_std):
        idx = simulated_data.index[row_mask]
        if len(idx) == 0:
            # Nothing to predict; estimators may reject an empty feature matrix.
            return
        predictions = model.predict(simulated_data.loc[idx, columns])
        noise = np.random.normal(loc=resid_mean, scale=resid_std, size=len(idx))
        # modelling_linear defines residuals as observed - predicted, so a
        # simulated observation is prediction PLUS a residual draw.
        simulated_data.loc[idx, 'SPEND_SS'] = predictions + noise

    is_shock = (simulated_data['SHOCK2'] == 1).to_numpy()
    is_no_shock = (simulated_data['SHOCK2'] == 0).to_numpy()
    _fill(is_shock, yes_model, yes_pred, yes_mean, yes_resid)
    _fill(is_no_shock, no_model, no_pred, no_mean, no_resid)

    return simulated_data

# Whole-sample result: simulated frame with SHOCK2 and SPEND_SS filled in.
simulated_data_final = add_spend_ss_predictions(
    simulated_data=simulated_data_with_shock,
    yes_model=yes_model,
    no_model=no_model,
    yes_pred=yes_pred,
    no_pred=no_pred,
)
simulated_data_final

High Cluster

In [None]:
# Rebind `data` to the High-cluster training file; the cells below repeat the
# whole-sample workflow on it.
# NOTE(review): the whole-sample load drops the first column, this load does
# not - confirm whether High_train.csv carries an extra index column.
data = pd.read_csv('High_train.csv')

In [None]:
from fitter import Fitter, get_common_distributions, get_distributions
def find_best_distribution(data, variable_name):
    """Fit the common candidate distributions to one column of ``data``.

    Missing values in ``data[variable_name]`` are dropped before fitting.
    ``Fitter.summary()`` is called for its side effect; its return value is
    not used here.

    Returns the fitted ``Fitter`` object so the caller can query it
    (for example with ``get_best``).
    """
    observed = data[variable_name].dropna()
    fitter = Fitter(observed, distributions=get_common_distributions())
    fitter.fit()
    fitter.summary()
    return fitter

# Columns whose marginal distributions are fitted one at a time.
variable_names = ['HEALTH_CHANGE', 'HBP', 'DIABETES', 'CANCER',
       'LUNGS', 'HEART_ATTACK', 'STROKE', 'PSYCH', 'ARTHRITIS', 'OUT_PT',
       'DRUGS', 'HOME_CARE', 'SPECIAL_FAC', 'HOSPITAL', 'DOCTOR',
       'NURSING_HOME']

# Maps each column name to whatever Fitter.get_best returns for the
# 'sumsquare_error' criterion; the simulation step later reads this dict.
distribution_params = {}
for variable in variable_names:
    print(f"Analyzing {variable}")
    dist = find_best_distribution(data, variable)
    # Query the best fit once and reuse it for both storage and display.
    best_fit = dist.get_best(method='sumsquare_error')
    distribution_params[variable] = best_fit
    print(best_fit)

In [None]:
# Split the frame on the SHOCK2 indicator and fit SPEND_SS separately in each part.
shock_data = data.loc[data['SHOCK2'] == 1]
no_shock_data = data.loc[data['SHOCK2'] == 0]
variable_names = ['SPEND_SS']
spending = [shock_data, no_shock_data]

# Key 0 holds the fit for the shock rows, key 1 the fit for the no-shock rows.
spending_results = {}
for i, subset in enumerate(spending):
    fitted = find_best_distribution(subset, 'SPEND_SS')
    spending_results[i] = fitted
    print(fitted.get_best(method='sumsquare_error'))

In [None]:
from sklearn.linear_model import LogisticRegression
def modelling(data, predictors, response):
    """Fit a class-weighted logistic regression for the next-period ``response``.

    The target is the value of ``response`` on the following row of the same
    ``HHIDPN`` group, so the last row of every group has no target. Rows with
    a missing predictor or a missing target are excluded from the fit.

    Parameters
    ----------
    data : DataFrame containing ``HHIDPN``, ``response`` and the predictors.
        It is not modified.
    predictors : list of column names used as features.
    response : name of the column whose next-row value is predicted.

    Returns
    -------
    The fitted ``LogisticRegression`` estimator.
    """
    # assign() builds a new frame, so the helper column never leaks onto the
    # caller's DataFrame.
    with_target = data.assign(
        response_next=data.groupby('HHIDPN')[response].shift(-1)
    )

    data_clean = with_target.dropna(subset=predictors + ['response_next'])

    X = data_clean[predictors]
    y = data_clean['response_next']
    #class_weights = {0: 1, 1: 13}
    model = LogisticRegression(max_iter=1000, class_weight='balanced')
    model.fit(X, y)

    return model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Train / test panels passed to modelling_linear for the spending regressions.
# NOTE(review): these are the ss_*_shock.csv files, not the High-cluster file - confirm this is intended.
train, test = (
    pd.read_csv(path) for path in ('ss_train_shock.csv', 'ss_test_shock.csv')
)
def modelling_linear(data, data_test, predictors, response):
    """Fit a linear regression for next-period ``response`` and summarise test residuals.

    Both frames are restricted to rows with ``AGE <= 90``. The target is the
    value of ``response`` on the following row of the same ``HHIDPN`` group.
    Rows with a missing predictor or a missing target are dropped from the
    training AND the test frame, so prediction never receives NaN features.
    Neither input frame is modified.

    Parameters
    ----------
    data : training DataFrame (needs ``AGE``, ``HHIDPN``, ``response``, predictors).
    data_test : test DataFrame with the same columns.
    predictors : list of feature column names.
    response : name of the column whose next-row value is predicted.

    Returns
    -------
    (model, mean_residuals, std_residuals) where residuals are
    ``observed - predicted`` on the cleaned test rows and the standard
    deviation uses NumPy's default ``ddof=0``.
    """
    required = predictors + ['response_next']

    def _prepare(frame):
        # Copy the filtered rows so that adding the target column neither
        # touches the caller's frame nor raises SettingWithCopyWarning.
        frame = frame[frame['AGE'] <= 90].copy()
        frame['response_next'] = frame.groupby('HHIDPN')[response].shift(-1)
        return frame.dropna(subset=required)

    train_clean = _prepare(data)
    test_clean = _prepare(data_test)

    model = LinearRegression()
    model.fit(train_clean[predictors], train_clean['response_next'])

    # Predict on the cleaned test set
    y_pred = model.predict(test_clean[predictors])

    # Residuals are observed minus predicted
    residuals = test_clean['response_next'] - y_pred
    mean_residuals = np.mean(residuals)
    std_residuals = np.std(residuals)

    return model, mean_residuals, std_residuals

In [None]:
# Shock classifier, fit on the frame currently bound to `data`.
response = 'SHOCK2'
predictors = [
    'AGE', 'HEALTH_CHANGE', 'HBP', 'STROKE', 'ARTHRITIS',
    'DRUGS', 'SPECIAL_FAC', 'HOSPITAL', 'DOCTOR', 'NURSING_HOME',
]
shock_model = modelling(data=data, predictors=predictors, response=response)

# Spending regressions for simulated shock (yes_*) and no-shock (no_*) rows.
# NOTE(review): both are fit on every row of `train`, which is not split by
# SHOCK2 and is not the cluster frame in `data` - confirm this is intended.
spend_response = 'SPEND_SS'
yes_pred = [
    'AGE', 'PSYCH', 'DRUGS', 'HOME_CARE', 'HOSPITAL', 'DOCTOR', 'NURSING_HOME',
]
yes_model, yes_mean, yes_resid = modelling_linear(
    data=train, data_test=test, predictors=yes_pred, response=spend_response
)

no_pred = [
    'AGE', 'HEALTH_CHANGE', 'DRUGS', 'HOME_CARE', 'DOCTOR', 'NURSING_HOME',
]
no_model, no_mean, no_resid = modelling_linear(
    data=train, data_test=test, predictors=no_pred, response=spend_response
)

from scipy import stats

def simulate_individual_data(num_individuals, start_age, end_age, distribution_params):
    """Simulate one row per individual per age with independently drawn variables.

    Parameters
    ----------
    num_individuals : number of individuals; IDs run from 1 to this value.
    start_age, end_age : inclusive age range simulated for every individual.
    distribution_params : mapping ``variable -> {dist_name: params}`` where
        ``dist_name`` is an attribute of ``scipy.stats`` and ``params`` is a
        dict of keyword arguments for its ``rvs``. Only the first entry of
        each inner dict is used.

    Returns
    -------
    DataFrame with columns ``AGE``, ``HHIDPN`` and one column per variable,
    stacked over individuals with a fresh 0..n-1 index. An empty DataFrame is
    returned when ``num_individuals`` is 0.
    """
    ages = range(start_age, end_age + 1)
    n_ages = len(ages)

    # Resolve every variable's scipy distribution once, not once per person.
    samplers = []
    for variable, params_dict in distribution_params.items():
        dist_name, params = next(iter(params_dict.items()))
        samplers.append((variable, getattr(stats, dist_name), params))

    # Collect the per-person frames and concatenate a single time; growing a
    # DataFrame with concat inside the loop re-copies all earlier rows on
    # every iteration. Draws happen in the same person/variable order.
    frames = []
    for person_id in range(1, num_individuals + 1):
        columns = {'AGE': ages, 'HHIDPN': person_id}
        for variable, dist, params in samplers:
            columns[variable] = dist.rvs(size=n_ages, **params)
        frames.append(pd.DataFrame(columns))

    if not frames:
        # pd.concat rejects an empty list of frames
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Simulate 10,000 individuals, one row per age from 65 through 90.
num_individuals = 10_000

stacked_simulation = simulate_individual_data(
    num_individuals,
    start_age=65,
    end_age=90,
    distribution_params=distribution_params,
)
stacked_simulation

def add_shock_predictions(simulated_data, shock_model, predictors):
    """Attach the model's predicted shock indicator to ``simulated_data``.

    The predictions of ``shock_model`` on the ``predictors`` columns are
    written to a ``SHOCK2`` column on the frame that was passed in, and that
    same frame is returned.
    """
    features = simulated_data[predictors]
    simulated_data['SHOCK2'] = shock_model.predict(features)
    return simulated_data

# add_shock_predictions writes onto the frame it is given, so the returned
# frame is the same object as stacked_simulation.
simulated_data_with_shock = add_shock_predictions(
    simulated_data=stacked_simulation,
    shock_model=shock_model,
    predictors=predictors,
)

simulated_data_with_shock

def add_spend_ss_predictions(simulated_data, yes_model, no_model, yes_pred, no_pred):
    """Fill ``SPEND_SS`` on every simulated row with a noisy model prediction.

    Rows with ``SHOCK2 == 1`` are predicted by ``yes_model`` from the
    ``yes_pred`` columns, rows with ``SHOCK2 == 0`` by ``no_model`` from the
    ``no_pred`` columns; any other row keeps the initial value 0.0. Each
    prediction gets a normal draw added to it.

    The noise parameters are read from the module-level names ``yes_mean``,
    ``yes_resid``, ``no_mean`` and ``no_resid`` (the mean and standard
    deviation returned by ``modelling_linear``), which must exist when this
    function is called.

    The column is written onto the frame that was passed in, and that same
    frame is returned.
    """
    simulated_data['SPEND_SS'] = 0.0

    def _fill(row_mask, model, columns, resid_mean, resid_std):
        idx = simulated_data.index[row_mask]
        if len(idx) == 0:
            # Nothing to predict; estimators may reject an empty feature matrix.
            return
        predictions = model.predict(simulated_data.loc[idx, columns])
        noise = np.random.normal(loc=resid_mean, scale=resid_std, size=len(idx))
        # modelling_linear defines residuals as observed - predicted, so a
        # simulated observation is prediction PLUS a residual draw.
        simulated_data.loc[idx, 'SPEND_SS'] = predictions + noise

    is_shock = (simulated_data['SHOCK2'] == 1).to_numpy()
    is_no_shock = (simulated_data['SHOCK2'] == 0).to_numpy()
    _fill(is_shock, yes_model, yes_pred, yes_mean, yes_resid)
    _fill(is_no_shock, no_model, no_pred, no_mean, no_resid)

    return simulated_data

# High-cluster result: simulated frame with SHOCK2 and SPEND_SS filled in.
simulated_data_final_high = add_spend_ss_predictions(simulated_data_with_shock, yes_model, no_model, yes_pred, no_pred)
# Show the High-cluster frame built on the line above, not the whole-sample
# `simulated_data_final`.
simulated_data_final_high

Low Cluster

In [None]:
# Rebind `data` to the Low-cluster training file; the code below repeats the
# whole-sample workflow on it.
# NOTE(review): the whole-sample load drops the first column, this load does
# not - confirm whether Low_train.csv carries an extra index column.
data = pd.read_csv('Low_train.csv')

from fitter import Fitter, get_common_distributions, get_distributions
def find_best_distribution(data, variable_name):
    """Fit the common candidate distributions to one column of ``data``.

    Missing values in ``data[variable_name]`` are dropped before fitting.
    ``Fitter.summary()`` is called for its side effect; its return value is
    not used here.

    Returns the fitted ``Fitter`` object so the caller can query it
    (for example with ``get_best``).
    """
    observed = data[variable_name].dropna()
    fitter = Fitter(observed, distributions=get_common_distributions())
    fitter.fit()
    fitter.summary()
    return fitter

# Columns whose marginal distributions are fitted one at a time.
variable_names = ['HEALTH_CHANGE', 'HBP', 'DIABETES', 'CANCER',
       'LUNGS', 'HEART_ATTACK', 'STROKE', 'PSYCH', 'ARTHRITIS', 'OUT_PT',
       'DRUGS', 'HOME_CARE', 'SPECIAL_FAC', 'HOSPITAL', 'DOCTOR',
       'NURSING_HOME']

# Maps each column name to whatever Fitter.get_best returns for the
# 'sumsquare_error' criterion; the simulation step later reads this dict.
distribution_params = {}
for variable in variable_names:
    print(f"Analyzing {variable}")
    dist = find_best_distribution(data, variable)
    # Query the best fit once and reuse it for both storage and display.
    best_fit = dist.get_best(method='sumsquare_error')
    distribution_params[variable] = best_fit
    print(best_fit)

# Split the frame on the SHOCK2 indicator and fit SPEND_SS separately in each part.
shock_data = data.loc[data['SHOCK2'] == 1]
no_shock_data = data.loc[data['SHOCK2'] == 0]
variable_names = ['SPEND_SS']
spending = [shock_data, no_shock_data]

# Key 0 holds the fit for the shock rows, key 1 the fit for the no-shock rows.
spending_results = {}
for i, subset in enumerate(spending):
    fitted = find_best_distribution(subset, 'SPEND_SS')
    spending_results[i] = fitted
    print(fitted.get_best(method='sumsquare_error'))

from sklearn.linear_model import LogisticRegression
def modelling(data, predictors, response):
    """Fit a class-weighted logistic regression for the next-period ``response``.

    The target is the value of ``response`` on the following row of the same
    ``HHIDPN`` group, so the last row of every group has no target. Rows with
    a missing predictor or a missing target are excluded from the fit.

    Parameters
    ----------
    data : DataFrame containing ``HHIDPN``, ``response`` and the predictors.
        It is not modified.
    predictors : list of column names used as features.
    response : name of the column whose next-row value is predicted.

    Returns
    -------
    The fitted ``LogisticRegression`` estimator.
    """
    # assign() builds a new frame, so the helper column never leaks onto the
    # caller's DataFrame.
    with_target = data.assign(
        response_next=data.groupby('HHIDPN')[response].shift(-1)
    )

    data_clean = with_target.dropna(subset=predictors + ['response_next'])

    X = data_clean[predictors]
    y = data_clean['response_next']
    #class_weights = {0: 1, 1: 13}
    model = LogisticRegression(max_iter=1000, class_weight='balanced')
    model.fit(X, y)

    return model

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Train / test panels passed to modelling_linear for the spending regressions.
# NOTE(review): these are the ss_*_shock.csv files, not the Low-cluster file - confirm this is intended.
train, test = (
    pd.read_csv(path) for path in ('ss_train_shock.csv', 'ss_test_shock.csv')
)
def modelling_linear(data, data_test, predictors, response):
    """Fit a linear regression for next-period ``response`` and summarise test residuals.

    Both frames are restricted to rows with ``AGE <= 90``. The target is the
    value of ``response`` on the following row of the same ``HHIDPN`` group.
    Rows with a missing predictor or a missing target are dropped from the
    training AND the test frame, so prediction never receives NaN features.
    Neither input frame is modified.

    Parameters
    ----------
    data : training DataFrame (needs ``AGE``, ``HHIDPN``, ``response``, predictors).
    data_test : test DataFrame with the same columns.
    predictors : list of feature column names.
    response : name of the column whose next-row value is predicted.

    Returns
    -------
    (model, mean_residuals, std_residuals) where residuals are
    ``observed - predicted`` on the cleaned test rows and the standard
    deviation uses NumPy's default ``ddof=0``.
    """
    required = predictors + ['response_next']

    def _prepare(frame):
        # Copy the filtered rows so that adding the target column neither
        # touches the caller's frame nor raises SettingWithCopyWarning.
        frame = frame[frame['AGE'] <= 90].copy()
        frame['response_next'] = frame.groupby('HHIDPN')[response].shift(-1)
        return frame.dropna(subset=required)

    train_clean = _prepare(data)
    test_clean = _prepare(data_test)

    model = LinearRegression()
    model.fit(train_clean[predictors], train_clean['response_next'])

    # Predict on the cleaned test set
    y_pred = model.predict(test_clean[predictors])

    # Residuals are observed minus predicted
    residuals = test_clean['response_next'] - y_pred
    mean_residuals = np.mean(residuals)
    std_residuals = np.std(residuals)

    return model, mean_residuals, std_residuals

# Shock classifier, fit on the frame currently bound to `data`.
response = 'SHOCK2'
predictors = [
    'AGE', 'HEALTH_CHANGE', 'HBP', 'STROKE', 'ARTHRITIS',
    'DRUGS', 'SPECIAL_FAC', 'HOSPITAL', 'DOCTOR', 'NURSING_HOME',
]
shock_model = modelling(data=data, predictors=predictors, response=response)

# Spending regressions for simulated shock (yes_*) and no-shock (no_*) rows.
# NOTE(review): both are fit on every row of `train`, which is not split by
# SHOCK2 and is not the cluster frame in `data` - confirm this is intended.
spend_response = 'SPEND_SS'
yes_pred = [
    'AGE', 'PSYCH', 'DRUGS', 'HOME_CARE', 'HOSPITAL', 'DOCTOR', 'NURSING_HOME',
    'CANCER',
]
yes_model, yes_mean, yes_resid = modelling_linear(
    data=train, data_test=test, predictors=yes_pred, response=spend_response
)

no_pred = [
    'AGE', 'HEALTH_CHANGE', 'DRUGS', 'HOME_CARE', 'DOCTOR', 'NURSING_HOME',
    'CANCER',
]
no_model, no_mean, no_resid = modelling_linear(
    data=train, data_test=test, predictors=no_pred, response=spend_response
)

from scipy import stats

def simulate_individual_data(num_individuals, start_age, end_age, distribution_params):
    """Simulate one row per individual per age with independently drawn variables.

    Parameters
    ----------
    num_individuals : number of individuals; IDs run from 1 to this value.
    start_age, end_age : inclusive age range simulated for every individual.
    distribution_params : mapping ``variable -> {dist_name: params}`` where
        ``dist_name`` is an attribute of ``scipy.stats`` and ``params`` is a
        dict of keyword arguments for its ``rvs``. Only the first entry of
        each inner dict is used.

    Returns
    -------
    DataFrame with columns ``AGE``, ``HHIDPN`` and one column per variable,
    stacked over individuals with a fresh 0..n-1 index. An empty DataFrame is
    returned when ``num_individuals`` is 0.
    """
    ages = range(start_age, end_age + 1)
    n_ages = len(ages)

    # Resolve every variable's scipy distribution once, not once per person.
    samplers = []
    for variable, params_dict in distribution_params.items():
        dist_name, params = next(iter(params_dict.items()))
        samplers.append((variable, getattr(stats, dist_name), params))

    # Collect the per-person frames and concatenate a single time; growing a
    # DataFrame with concat inside the loop re-copies all earlier rows on
    # every iteration. Draws happen in the same person/variable order.
    frames = []
    for person_id in range(1, num_individuals + 1):
        columns = {'AGE': ages, 'HHIDPN': person_id}
        for variable, dist, params in samplers:
            columns[variable] = dist.rvs(size=n_ages, **params)
        frames.append(pd.DataFrame(columns))

    if not frames:
        # pd.concat rejects an empty list of frames
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Simulate 10,000 individuals, one row per age from 65 through 90.
num_individuals = 10_000

stacked_simulation = simulate_individual_data(
    num_individuals,
    start_age=65,
    end_age=90,
    distribution_params=distribution_params,
)
stacked_simulation

def add_shock_predictions(simulated_data, shock_model, predictors):
    """Attach the model's predicted shock indicator to ``simulated_data``.

    The predictions of ``shock_model`` on the ``predictors`` columns are
    written to a ``SHOCK2`` column on the frame that was passed in, and that
    same frame is returned.
    """
    features = simulated_data[predictors]
    simulated_data['SHOCK2'] = shock_model.predict(features)
    return simulated_data

# add_shock_predictions writes onto the frame it is given, so the returned
# frame is the same object as stacked_simulation.
simulated_data_with_shock = add_shock_predictions(
    simulated_data=stacked_simulation,
    shock_model=shock_model,
    predictors=predictors,
)

simulated_data_with_shock

def add_spend_ss_predictions(simulated_data, yes_model, no_model, yes_pred, no_pred):
    """Fill ``SPEND_SS`` on every simulated row with a noisy model prediction.

    Rows with ``SHOCK2 == 1`` are predicted by ``yes_model`` from the
    ``yes_pred`` columns, rows with ``SHOCK2 == 0`` by ``no_model`` from the
    ``no_pred`` columns; any other row keeps the initial value 0.0. Each
    prediction gets a normal draw added to it.

    The noise parameters are read from the module-level names ``yes_mean``,
    ``yes_resid``, ``no_mean`` and ``no_resid`` (the mean and standard
    deviation returned by ``modelling_linear``), which must exist when this
    function is called.

    The column is written onto the frame that was passed in, and that same
    frame is returned.
    """
    simulated_data['SPEND_SS'] = 0.0

    def _fill(row_mask, model, columns, resid_mean, resid_std):
        idx = simulated_data.index[row_mask]
        if len(idx) == 0:
            # Nothing to predict; estimators may reject an empty feature matrix.
            return
        predictions = model.predict(simulated_data.loc[idx, columns])
        noise = np.random.normal(loc=resid_mean, scale=resid_std, size=len(idx))
        # modelling_linear defines residuals as observed - predicted, so a
        # simulated observation is prediction PLUS a residual draw.
        simulated_data.loc[idx, 'SPEND_SS'] = predictions + noise

    is_shock = (simulated_data['SHOCK2'] == 1).to_numpy()
    is_no_shock = (simulated_data['SHOCK2'] == 0).to_numpy()
    _fill(is_shock, yes_model, yes_pred, yes_mean, yes_resid)
    _fill(is_no_shock, no_model, no_pred, no_mean, no_resid)

    return simulated_data

# Low-cluster result: simulated frame with SHOCK2 and SPEND_SS filled in.
simulated_data_final_low = add_spend_ss_predictions(simulated_data_with_shock, yes_model, no_model, yes_pred, no_pred)
# Show the Low-cluster frame built on the line above, not the whole-sample
# `simulated_data_final`.
simulated_data_final_low

In [None]:
# Rebind `data` to the whole-sample training file; the plotting functions
# below read this module-level frame for the observed "Sample Spending" series.
data = pd.read_csv('ss_train_shock.csv')

In [None]:
def plot_average_spending_by_age(simulated_data, simulated_data_low, simulated_data_high):
    """Plot mean ``SPEND_SS`` by ``AGE`` for three simulated frames and the observed sample.

    The observed series is computed from the module-level ``data`` frame,
    restricted to rows with ``AGE <= 90``. All four lines are drawn on one
    10x6 figure, which is then shown.
    """
    def mean_by_age(frame):
        return frame.groupby('AGE')['SPEND_SS'].mean()

    # (series, legend label, line colour), in drawing order
    lines = [
        (mean_by_age(simulated_data), 'General Simulated Spending', 'blue'),
        (mean_by_age(simulated_data_low), 'Simulated Spending - Low Cluster', 'red'),
        (mean_by_age(simulated_data_high), 'Simulated Spending - High Cluster', 'green'),
        (mean_by_age(data[data['AGE']<=90]), 'Sample Spending', 'black'),
    ]

    plt.figure(figsize=(10, 6))
    for series, label, color in lines:
        series.plot(kind='line', marker='x', label=label, color=color)
    plt.title('Average Spending by Age')
    plt.xlabel('Age')
    plt.ylabel('Average Spending')
    plt.grid(True)
    plt.legend()
    plt.show()


# Compare mean spending by age across the three simulations and the sample.
plot_average_spending_by_age(
    simulated_data=simulated_data_final,
    simulated_data_low=simulated_data_final_low,
    simulated_data_high=simulated_data_final_high,
)

In [None]:
def plot_median_spending_by_age(simulated_data, simulated_data_low, simulated_data_high):
    """Plot median ``SPEND_SS`` by ``AGE`` for three simulated frames and the observed sample.

    The observed series is computed from the module-level ``data`` frame,
    restricted to rows with ``AGE <= 90``. All four lines are drawn on one
    10x6 figure, which is then shown.
    """
    def median_by_age(frame):
        return frame.groupby('AGE')['SPEND_SS'].median()

    # (series, legend label, line colour), in drawing order
    lines = [
        (median_by_age(simulated_data), 'General Simulated Spending', 'blue'),
        (median_by_age(simulated_data_low), 'Simulated Spending - Low Cluster', 'red'),
        (median_by_age(simulated_data_high), 'Simulated Spending - High Cluster', 'green'),
        (median_by_age(data[data['AGE']<=90]), 'Sample Spending', 'black'),
    ]

    plt.figure(figsize=(10, 6))
    for series, label, color in lines:
        series.plot(kind='line', marker='x', label=label, color=color)
    plt.title('Median Spending by Age')
    plt.xlabel('Age')
    plt.ylabel('Median Spending')
    plt.grid(True)
    plt.legend()
    plt.show()


# Compare median spending by age across the three simulations and the sample.
plot_median_spending_by_age(
    simulated_data=simulated_data_final,
    simulated_data_low=simulated_data_final_low,
    simulated_data_high=simulated_data_final_high,
)