# ASCVD

In [None]:
from datetime import datetime
import os
import pandas as pd
import gzip
import subprocess

In [None]:
# Load the table that already contains the computed 'ascvd_risk' column.
# NOTE(review): 'APOE_file_ASCVD.csv' is written by the export cell further
# down in this notebook, so that cell must have been run at least once before.
df = pd.read_csv('APOE_file_ASCVD.csv')

In [None]:
import pandas as pd

# How many rows each person_id contributes to the table.
value_counts = df['person_id'].value_counts()

# Typical number of rows per person: arithmetic mean and median.
mean_counts, median_counts = value_counts.mean(), value_counts.median()

print(f"Mean of counts: {mean_counts}")
print(f"Median of counts: {median_counts}")


In [None]:
#pip install pandas matplotlib seaborn scipy statannot

# HDL and Total Cholesterol Changes

In [None]:
# Load the table used by the HDL plots in the following cells
# (they read its 'Age', 'HDL' and 'APOE_group' columns).
df = pd.read_csv('APOE_file_Final_df_CVD.csv')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Create age groups in 10-year intervals.
# right=False makes every bin half-open: [0, 10), [10, 20), ..., [90, 100).
# An Age of exactly 100 or more therefore falls in no bin and becomes NaN.
df['age_group'] = pd.cut(df['Age'], bins=range(0, 101, 10), right=False)

def remove_outliers(df, column, factor=1.5):
    """Return the rows of *df* whose *column* value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), with the quartiles computed from *df* itself.

    Args:
        df: DataFrame to filter; it is not modified.
        column: Name of the numeric column to screen.
        factor: Fence distance as a multiple of the IQR. Defaults to 1.5.

    Returns:
        The rows of *df* inside the fences, with their original index.
        Rows where *column* is NaN are dropped, because NaN never satisfies
        the range check.
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    # between() is inclusive on both ends, i.e. ">= lower and <= upper".
    return df[values.between(q1 - factor * iqr, q3 + factor * iqr)]

# Drop HDL outliers separately inside every (age group, APOE group) cell,
# then flatten the result back to a plain 0..n-1 index.
hdl_cells = df.groupby(['age_group', 'APOE_group'])
df_no_outliers = hdl_cells.apply(lambda cell: remove_outliers(cell, 'HDL'))
df_no_outliers = df_no_outliers.reset_index(drop=True)

# One box per APOE group inside each age group.
plt.figure(figsize=(12, 6))
hdl_ax = sns.boxplot(data=df_no_outliers, x='age_group', y='HDL', hue='APOE_group', palette="Set3")

# Titles, axis labels and a legend placed outside the plotting area.
hdl_ax.set_title('HDL Levels for Different APOE Groups Across Age Groups')
hdl_ax.set_xlabel('Age Group')
hdl_ax.set_ylabel('HDL Level')
hdl_ax.legend(title='APOE Group', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
from statannot import add_stat_annotation

# Create age groups in 10-year intervals.
# right=False makes every bin half-open: [0, 10), [10, 20), ..., [90, 100).
# An Age of exactly 100 or more therefore falls in no bin and becomes NaN.
df['age_group'] = pd.cut(df['Age'], bins=range(0, 101, 10), right=False)

# Function to remove outliers based on IQR
def remove_outliers(df, column, factor=1.5):
    """Return the rows of *df* whose *column* value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), with the quartiles computed from *df* itself.

    Args:
        df: DataFrame to filter; it is not modified.
        column: Name of the numeric column to screen.
        factor: Fence distance as a multiple of the IQR. Defaults to 1.5.

    Returns:
        The rows of *df* inside the fences, with their original index.
        Rows where *column* is NaN are dropped, because NaN never satisfies
        the range check.
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    # between() is inclusive on both ends, i.e. ">= lower and <= upper".
    return df[values.between(q1 - factor * iqr, q3 + factor * iqr)]

# Remove outliers from HDL levels, separately within every (age group, APOE group) cell
df_no_outliers = df.groupby(['age_group', 'APOE_group']).apply(lambda x: remove_outliers(x, 'HDL')).reset_index(drop=True)

# Plotting
plt.figure(figsize=(20, 50))
ax = sns.boxplot(x='age_group', y='HDL', hue='APOE_group', data=df_no_outliers, palette="Set3")

# Customizing the plot
plt.title('HDL Levels for Different APOE Groups Across Age Groups (Outliers Removed)')
plt.xlabel('Age Group')
plt.ylabel('HDL Level')
plt.legend(title='APOE Group', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()

# Pairwise t-tests between age groups *within* each APOE group; only the
# significant comparisons are annotated.
age_groups = df_no_outliers['age_group'].cat.categories.tolist()
pairs = [(first, second) for i, first in enumerate(age_groups) for second in age_groups[i + 1:]]

# The boxes are dodged by APOE group (hue), so every annotated box has to be
# addressed as (age_group, APOE_group). Without the hue the brackets would be
# drawn on the age-group centres and the per-APOE calls would overlap.
significant_pairs = []
for apoe_group in df_no_outliers['APOE_group'].unique():
    subset = df_no_outliers[df_no_outliers['APOE_group'] == apoe_group]
    hdl_by_age = {age: subset.loc[subset['age_group'] == age, 'HDL'] for age in age_groups}
    for first, second in pairs:
        data1, data2 = hdl_by_age[first], hdl_by_age[second]
        # A t-test needs at least two observations on each side; smaller
        # samples only produce a NaN p-value.
        if len(data1) < 2 or len(data2) < 2:
            continue
        t_stat, p_val = ttest_ind(data1, data2)
        if p_val < 0.05:
            significant_pairs.append(((first, apoe_group), (second, apoe_group)))

# A single call lets statannot stack all brackets without collisions.
# NOTE(review): statannot re-runs the test itself and, by default, applies a
# multiple-comparison correction over all pairs passed in this one call, so a
# pre-selected pair can end up without a star - confirm this is wanted.
if significant_pairs:
    add_stat_annotation(
        ax, data=df_no_outliers, x='age_group', y='HDL', hue='APOE_group',
        box_pairs=significant_pairs, test='t-test_ind', text_format='star', loc='inside', verbose=2, pvalue_thresholds=[(0.05, '*')]
    )

# Show plot
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
from statannot import add_stat_annotation

# Create age groups in 10-year intervals.
# right=False makes every bin half-open: [0, 10), [10, 20), ..., [90, 100).
# An Age of exactly 100 or more therefore falls in no bin and becomes NaN.
df['age_group'] = pd.cut(df['Age'], bins=range(0, 101, 10), right=False)

# Function to remove outliers based on IQR
def remove_outliers(df, column, factor=1.5):
    """Return the rows of *df* whose *column* value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), with the quartiles computed from *df* itself.

    Args:
        df: DataFrame to filter; it is not modified.
        column: Name of the numeric column to screen.
        factor: Fence distance as a multiple of the IQR. Defaults to 1.5.

    Returns:
        The rows of *df* inside the fences, with their original index.
        Rows where *column* is NaN are dropped, because NaN never satisfies
        the range check.
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    # between() is inclusive on both ends, i.e. ">= lower and <= upper".
    return df[values.between(q1 - factor * iqr, q3 + factor * iqr)]

# IQR-based outlier removal, applied per (age group, APOE group) cell.
df_no_outliers = (
    df.groupby(['age_group', 'APOE_group'])
    .apply(lambda cell: remove_outliers(cell, 'HDL'))
    .reset_index(drop=True)
)

# Box plot: age groups on the x axis, one coloured box per APOE group.
plt.figure(figsize=(15, 20))
ax = sns.boxplot(data=df_no_outliers, x='age_group', y='HDL', hue='APOE_group', palette="Set3")

ax.set_title('HDL Levels for Different APOE Groups Across Age Groups (Outliers Removed)')
ax.set_xlabel('Age Group')
ax.set_ylabel('HDL Level')
ax.legend(title='APOE Group', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()

# Every unordered pair of age groups; HDL is pooled over all APOE groups here.
age_groups = df_no_outliers['age_group'].cat.categories.tolist()
pairs = [(left, right) for idx, left in enumerate(age_groups) for right in age_groups[idx + 1:]]

# Keep only the pairs whose t-test p-value is below 0.05.
hdl_values = df_no_outliers['HDL']
age_of_row = df_no_outliers['age_group']
significant_pairs = []
for pair in pairs:
    data1 = hdl_values[age_of_row == pair[0]]
    data2 = hdl_values[age_of_row == pair[1]]
    t_stat, p_val = ttest_ind(data1, data2)
    if p_val < 0.05:
        significant_pairs.append(pair)

# Draw the brackets only when something was significant.
if significant_pairs:
    add_stat_annotation(
        ax,
        data=df_no_outliers,
        x='age_group',
        y='HDL',
        box_pairs=significant_pairs,
        test='t-test_ind',
        text_format='star',
        loc='inside',
        verbose=2,
        pvalue_thresholds=[(0.05, '*')],
    )

plt.show()


# ASCVD calculation

In [None]:
df = pd.read_csv('APOE_file_Final_df_CVD.csv')

# Keep rows with CVD == 0, plus rows where both CVD and CVD_at_timeLab equal 1.
no_cvd = df["CVD"] == 0
cvd_at_lab = (df["CVD"] == 1) & (df["CVD_at_timeLab"] == 1)
df = df[cvd_at_lab | no_cvd]

In [None]:
import math

# Pooled Cohort Equations parameters, keyed by (Gender_modified, Race_modified).
# Each entry is (coefficients, baseline survival, mean score).
_PCE_MODELS = {
    ('Male', 'White'): (
        {
            'ln_age': 12.344,
            'ln_ageS': 0,
            'ln_total_chol': 11.853,
            'ln_age_ln_total_chol': -2.664,
            'ln_hdl': -7.990,
            'ln_age_ln_hdl': 1.769,
            'ln_sbp_treated': 1.797,
            'ln_age_sbp_treated': 0,
            'ln_sbp_untreated': 1.764,
            'ln_age_sbp_untreated': 0,
            'smoker': 7.837,
            'ln_age_smoker': -1.795,
            'diabetes': 0.658,
        },
        0.9144,
        61.18,
    ),
    ('Male', 'Black or African American'): (
        {
            'ln_age': 2.469,
            'ln_ageS': 0,
            'ln_total_chol': 0.302,
            'ln_age_ln_total_chol': 0,
            'ln_hdl': -0.307,
            'ln_age_ln_hdl': 0,
            'ln_sbp_treated': 1.916,
            'ln_age_sbp_treated': 0,
            'ln_sbp_untreated': 1.809,
            'ln_age_sbp_untreated': 0,
            'smoker': 0.549,
            'ln_age_smoker': 0,
            'diabetes': 0.645,
        },
        0.8954,
        19.54,
    ),
    ('Female', 'White'): (
        {
            'ln_age': -29.799,
            'ln_ageS': 4.884,
            'ln_total_chol': 13.540,
            'ln_age_ln_total_chol': -3.114,
            'ln_hdl': -13.578,
            'ln_age_ln_hdl': 3.149,
            'ln_sbp_treated': 2.019,
            'ln_age_sbp_treated': 0,
            'ln_sbp_untreated': 1.957,
            'ln_age_sbp_untreated': 0,
            'smoker': 7.574,
            'ln_age_smoker': -1.665,
            'diabetes': 0.661,
        },
        0.9665,
        -29.18,
    ),
    ('Female', 'Black or African American'): (
        {
            'ln_age': 17.114,
            'ln_ageS': 0,
            'ln_total_chol': 0.940,
            'ln_age_ln_total_chol': 0,
            'ln_hdl': -18.920,
            'ln_age_ln_hdl': 4.475,
            'ln_sbp_treated': 29.291,
            'ln_age_sbp_treated': -6.432,
            'ln_sbp_untreated': 27.820,
            'ln_age_sbp_untreated': -6.087,
            'smoker': 0.691,
            'ln_age_smoker': 0,
            'diabetes': 0.874,
        },
        0.9533,
        86.61,
    ),
}


def calculate_ascvd_risk(Age, Gender_modified, Race_modified, Total_Cholesterol, HDL, SBP, Hypertension_Med_Used, Diabetes_at_timeLab, smoker):
    """Return the Pooled Cohort Equations ASCVD risk as a percentage.

    Args:
        Age, Total_Cholesterol, HDL, SBP: Positive numbers; their natural
            logarithms enter the score.
        Gender_modified: 'Male' or 'Female'.
        Race_modified: 'White' or 'Black or African American'.
        Hypertension_Med_Used: Truthy selects the "treated" SBP coefficients,
            falsy the "untreated" ones.
        Diabetes_at_timeLab, smoker: Numeric indicators multiplied directly
            into the score.

    Returns:
        ``(1 - baseline_survival ** exp(score - mean)) * 100``.

    Raises:
        ValueError: Unsupported gender or race, or a non-positive value among
            Age, Total_Cholesterol, HDL and SBP. Gender/race are checked first.
    """
    if Gender_modified not in ('Male', 'Female'):
        raise ValueError("Gender must be 'Male' or 'Female'")
    model = _PCE_MODELS.get((Gender_modified, Race_modified))
    if model is None:
        raise ValueError("Race must be 'White' or 'Black or African American'")
    coeff, baseline_survival, mean = model

    # Logarithms are only defined for strictly positive inputs.
    log_inputs = (Age, Total_Cholesterol, HDL, SBP)
    if any(value <= 0 for value in log_inputs):
        raise ValueError("Age, Total Cholesterol, HDL, and SBP must be greater than zero")
    ln_age, ln_total_chol, ln_hdl, ln_sbp = map(math.log, log_inputs)

    # Blood-pressure coefficients depend on whether the person is on medication.
    if Hypertension_Med_Used:
        sbp_key, age_sbp_key = 'ln_sbp_treated', 'ln_age_sbp_treated'
    else:
        sbp_key, age_sbp_key = 'ln_sbp_untreated', 'ln_age_sbp_untreated'

    # Accumulate the terms one by one, in a fixed left-to-right order.
    sum_components = coeff['ln_age'] * ln_age
    sum_components += coeff['ln_ageS'] * (ln_age**2)
    sum_components += coeff['ln_total_chol'] * ln_total_chol
    sum_components += coeff['ln_age_ln_total_chol'] * ln_age * ln_total_chol
    sum_components += coeff['ln_hdl'] * ln_hdl
    sum_components += coeff['ln_age_ln_hdl'] * ln_age * ln_hdl
    sum_components += coeff[sbp_key] * ln_sbp
    sum_components += coeff[age_sbp_key] * ln_age * ln_sbp
    sum_components += coeff['smoker'] * smoker
    sum_components += coeff['ln_age_smoker'] * ln_age * smoker
    sum_components += coeff['diabetes'] * Diabetes_at_timeLab

    # Survival probability -> risk -> percentage.
    survival = baseline_survival ** math.exp(sum_components - mean)
    return (1 - survival) * 100

In [None]:
# Columns fed to calculate_ascvd_risk, listed in the order of its positional
# parameters so each row can simply be unpacked into the call.
risk_inputs = [
    'Age',
    'Gender_modified',
    'Race_modified',
    'Total_Cholesterol',
    'HDL',
    'SBP',
    'Hypertension_Med_Used',
    'Diabetes_at_timeLab',
    'smoker',
]
df['ascvd_risk'] = df.apply(lambda row: calculate_ascvd_risk(*row[risk_inputs]), axis=1)

In [None]:
my_dataframe = df

# Name of the CSV, both locally and inside the bucket's data/ folder.
destination_filename = 'APOE_file_ASCVD.csv'

# Save the dataframe as a CSV file in the same workspace as the notebook.
my_dataframe.to_csv(destination_filename, index=False)

# The bucket name comes from the environment.
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    # Without this guard the copy target would silently be "None/data/".
    raise EnvironmentError("WORKSPACE_BUCKET is not set; cannot copy the CSV to the bucket")

# Copy the CSV file to the bucket.
args = ["gsutil", "cp", f"./{destination_filename}", f"{my_bucket}/data/"]
output = subprocess.run(args, capture_output=True)
if output.returncode != 0:
    # Make a failed copy visible even when this cell's output is not inspected.
    print(f"gsutil cp failed with exit code {output.returncode}")

# Show what gsutil wrote to stderr as the cell output.
output.stderr

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_csv('APOE_file_ASCVD.csv')

# Distribution of the 'ascvd_risk' column, one box per APOE_group value.
plt.figure(figsize=(10, 6))
risk_ax = sns.boxplot(data=df, x='APOE_group', y='ascvd_risk')

# Axis labels (no title on this figure).
risk_ax.set_xlabel('APOE group')
risk_ax.set_ylabel('ASCVD')

plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from statannot import add_stat_annotation

# Load the table that carries the computed 'ascvd_risk' column.
df = pd.read_csv('APOE_file_ASCVD.csv')

# Genotype groups to show, in left-to-right plotting order; rows belonging
# to any other group are dropped.
apoe_order = ['e2e2', 'e2e3', 'e3e4', 'e4e4']
df = df[df['APOE_group'].isin(apoe_order)]

plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=df, x='APOE_group', y='ascvd_risk', order=apoe_order)

# Larger fonts for labels, title and tick marks.
ax.set_xlabel('APOE group', fontsize=14)
ax.set_ylabel('ASCVD', fontsize=14)
ax.set_title('Box Plot of ASCVD Risk by APOE Group', fontsize=16)
ax.tick_params(axis='both', which='major', labelsize=12)

# Every unordered pair of groups, enumerated following apoe_order.
pairs = [(first, second) for pos, first in enumerate(apoe_order) for second in apoe_order[pos + 1:]]

# t-test for each pair, drawn as star annotations inside the axes.
add_stat_annotation(
    ax,
    data=df,
    x='APOE_group',
    y='ascvd_risk',
    box_pairs=pairs,
    order=apoe_order,
    test='t-test_ind',
    text_format='star',
    loc='inside',
    verbose=2,
)

plt.show()


# Matching

In [None]:
import pandas as pd
import numpy as np

# Read the CSV file
df = pd.read_csv('APOE_file_ASCVD.csv')

# Collapse the genotypes into two carrier groups; every other genotype is
# tagged "Remove".
df["New_Apoe_Group"] = np.where(df["APOE_group"].isin(["e2e2", "e2e3"]), "e2carrier", 
                       np.where(df["APOE_group"].isin(["e3e4", "e4e4"]), "e4carrier", "Remove"))

# Drop the rows tagged "Remove". copy() makes df an independent frame, so the
# column assignment below does not operate on a slice of the unfiltered table
# (which triggers pandas' SettingWithCopyWarning).
df = df[df["New_Apoe_Group"] != "Remove"].copy()

# Define age groups. right=False gives half-open bins [0, 40), [40, 60),
# [60, 80), [80, 100).
# NOTE(review): an Age of 100 or more gets NaN even though the last label
# reads '80+' - confirm whether the top bin should be open-ended.
age_bins = [0, 40, 60, 80, 100]
age_labels = ['<40', '40-60', '60-80', '80+']
df['Age_Group'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from statannot import add_stat_annotation

# The two carrier groups, in the left-to-right order used on the x axis.
apoe_order = ['e2carrier', 'e4carrier']

# Keep only rows that belong to one of those groups.
df = df[df['New_Apoe_Group'].isin(apoe_order)]

plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=df, x='New_Apoe_Group', y='ascvd_risk', order=apoe_order)

# Larger fonts for labels, title and tick marks.
ax.set_xlabel('APOE group', fontsize=14)
ax.set_ylabel('ASCVD', fontsize=14)
ax.set_title('Box Plot of ASCVD Risk by APOE Group', fontsize=16)
ax.tick_params(axis='both', which='major', labelsize=12)

# Every unordered pair of groups (a single pair for two groups).
pairs = [(first, second) for pos, first in enumerate(apoe_order) for second in apoe_order[pos + 1:]]

# t-test for each pair, drawn as star annotations inside the axes.
add_stat_annotation(
    ax,
    data=df,
    x='New_Apoe_Group',
    y='ascvd_risk',
    box_pairs=pairs,
    order=apoe_order,
    test='t-test_ind',
    text_format='star',
    loc='inside',
    verbose=2,
)

plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from statannot import add_stat_annotation

# The two carrier groups, in the left-to-right order used on the x axis.
apoe_order = ['e2carrier', 'e4carrier']

# Keep only rows that belong to one of those groups.
df = df[df['New_Apoe_Group'].isin(apoe_order)]

# Carrier groups on the x axis, one coloured box per age group.
plt.figure(figsize=(14, 8))
ax = sns.boxplot(data=df, x='New_Apoe_Group', y='ascvd_risk', hue='Age_Group', order=apoe_order)

ax.set_xlabel('APOE group', fontsize=14)
ax.set_ylabel('ASCVD', fontsize=14)
ax.set_title('Box Plot of ASCVD Risk by APOE Group and Age Group', fontsize=16)
ax.tick_params(axis='both', which='major', labelsize=12)

# Boxes are addressed as (APOE group, age group); age_labels comes from the
# cell that created the 'Age_Group' column.
# Same age group, different carrier groups:
apoe_pairs = [
    ((first, age), (second, age))
    for pos, first in enumerate(apoe_order)
    for second in apoe_order[pos + 1:]
    for age in age_labels
]

# Same carrier group, different age groups:
age_pairs = [
    ((group, younger), (group, older))
    for group in apoe_order
    for pos, younger in enumerate(age_labels)
    for older in age_labels[pos + 1:]
]

pairs = apoe_pairs + age_pairs

# t-test for each pair, drawn as star annotations inside the axes.
add_stat_annotation(
    ax,
    data=df,
    x='New_Apoe_Group',
    y='ascvd_risk',
    hue='Age_Group',
    box_pairs=pairs,
    order=apoe_order,
    test='t-test_ind',
    text_format='star',
    loc='inside',
    verbose=2,
)

plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from statannot import add_stat_annotation

# Define the order of the APOE groups
apoe_order = ['e2carrier', 'e4carrier']

# Filter the DataFrame to include only the specified APOE groups
df = df[df['New_Apoe_Group'].isin(apoe_order)]

# Fix ONE hue order and hand it to every call below. Series.unique() returns
# levels in order of first appearance (and may include NaN), which need not be
# the order the box plot uses on its own; mixing the two puts mean markers
# and significance brackets on the wrong boxes.
age_values = df['Age_Group'].dropna()
if isinstance(age_values.dtype, pd.CategoricalDtype):
    present_levels = set(age_values.unique())
    hue_levels = [level for level in age_values.cat.categories if level in present_levels]
else:
    hue_levels = list(age_values.unique())

# Create the box plot with specified order and age groups
plt.figure(figsize=(14, 8))
ax = sns.boxplot(x='New_Apoe_Group', y='ascvd_risk', hue='Age_Group', data=df,
                 order=apoe_order, hue_order=hue_levels)

# Mean of every (APOE group, age group) cell that actually has rows;
# observed=True skips empty combinations of a categorical age column.
means = df.groupby(['New_Apoe_Group', 'Age_Group'], observed=True)['ascvd_risk'].mean().reset_index()

# The hue boxes share a total width of 0.8 (seaborn's default `width`),
# centred on the group's tick, so box k of n sits at (k - (n - 1) / 2) * 0.8 / n.
box_width = 0.8 / max(len(hue_levels), 1)

# Overlay mean values as diamond markers with a numeric label
for index, row in means.iterrows():
    group = row['New_Apoe_Group']
    age_group = row['Age_Group']
    mean_value = row['ascvd_risk']
    if pd.isna(mean_value):
        # Cell has rows but no usable risk values: nothing to draw.
        continue

    group_pos = apoe_order.index(group)
    hue_pos = hue_levels.index(age_group)
    pos = group_pos + (hue_pos - (len(hue_levels) - 1) / 2) * box_width

    ax.plot([pos], [mean_value], color='k', marker='d', markersize=8, linestyle='-', linewidth=2)
    ax.annotate(f'{mean_value:.2f}', xy=(pos, mean_value), xytext=(0, 10),
                textcoords='offset points', ha='center', fontsize=12, color='black')

# Add labels and title with increased font size
plt.xlabel('APOE group', fontsize=14)
plt.ylabel('ASCVD', fontsize=14)
plt.title('Box Plot of ASCVD Risk by APOE Group and Age Group', fontsize=16)

# Increase tick parameters font size
ax.tick_params(axis='both', which='major', labelsize=12)

# Boxes are addressed as (APOE group, age group). Only boxes with at least two
# risk values can take part in a t-test.
cell_sizes = df.dropna(subset=['ascvd_risk']).groupby(['New_Apoe_Group', 'Age_Group'], observed=True).size()
testable_boxes = {box for box, size in cell_sizes.items() if size >= 2}

age_labels = hue_levels
apoe_pairs = [
    ((first, age), (second, age))
    for pos_in_order, first in enumerate(apoe_order)
    for second in apoe_order[pos_in_order + 1:]
    for age in age_labels
]

age_pairs = [
    ((group, younger), (group, older))
    for group in apoe_order
    for pos_in_order, younger in enumerate(age_labels)
    for older in age_labels[pos_in_order + 1:]
]

# Combine all pairs, dropping those that involve an empty or one-row box
pairs = [
    pair for pair in apoe_pairs + age_pairs
    if pair[0] in testable_boxes and pair[1] in testable_boxes
]

# Perform pairwise t-tests and annotate all results
if pairs:
    add_stat_annotation(
        ax, data=df, x='New_Apoe_Group', y='ascvd_risk', hue='Age_Group',
        box_pairs=pairs, order=apoe_order, hue_order=hue_levels, test='t-test_ind', text_format='star', loc='inside', verbose=2
    )

# Show the plot
plt.show()


In [None]:
import pandas as pd

# Ordinal code for each statin intensity label.
statin_mapping = dict(Low=1, Moderate=2, High=3)

# Labels missing from the mapping (including NaN) become 0; the column ends up
# as integers.
statin_codes = df['statin_use'].map(statin_mapping)
df['statin_use'] = statin_codes.fillna(0).astype(int)

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

# Define the treatment and covariates.
# `treatment` names the column that decides the treatment arm (encoded further
# down as 1 for 'e4carrier', 0 otherwise); `covariates` are the predictors of
# the propensity-score logistic regression.
treatment = 'New_Apoe_Group'
covariates = ["Age", "Gender_modified", "Race_modified", "statin_use"]

# Helpers that turn a categorical covariate into a 0/1 indicator
def convert_gender(gender):
    """Return 1 when *gender* equals 'Male', otherwise 0."""
    return int(gender == 'Male')

def convert_race(race):
    """Return 1 when *race* equals 'White', otherwise 0."""
    return int(race == 'White')

# Work on an independent copy: df may be a filtered slice of another frame at
# this point, and assigning columns to a slice triggers pandas'
# SettingWithCopyWarning.
df = df.copy()

# Apply the conversion functions.
# NOTE: not idempotent - running this cell a second time feeds the already
# converted 0/1 values through the converters again, which turns every value
# into 0.
df['Gender_modified'] = df['Gender_modified'].apply(convert_gender)
df['Race_modified'] = df['Race_modified'].apply(convert_race)

# Encode the treatment variable: 1 for 'e4carrier', 0 for everything else
df['treatment'] = df[treatment].apply(lambda x: 1 if x == 'e4carrier' else 0)

# Drop rows with missing values in covariates (copy again so the new columns
# added below land on an independent frame)
df = df.dropna(subset=covariates + ['treatment']).copy()

# Fit a logistic regression (with intercept) to estimate propensity scores;
# the design matrix is built once and reused for the prediction
design_matrix = sm.add_constant(df[covariates])
model = sm.Logit(df['treatment'], design_matrix)
result = model.fit()

# Add propensity scores to the DataFrame
df['propensity_score'] = result.predict(design_matrix)

# Standardize the propensity scores
scaler = StandardScaler()
df['propensity_score_std'] = scaler.fit_transform(df[['propensity_score']])

# Perform nearest neighbor matching: every treated row gets the control row
# with the closest standardized score. Controls are not removed after being
# used, so the same control row can be matched to several treated rows.
treatment_indices = df[df['treatment'] == 1].index
control_indices = df[df['treatment'] == 0].index

X_treatment = df.loc[treatment_indices, 'propensity_score_std'].values.reshape(-1, 1)
X_control = df.loc[control_indices, 'propensity_score_std'].values.reshape(-1, 1)

nn = NearestNeighbors(n_neighbors=1)
nn.fit(X_control)
distances, indices = nn.kneighbors(X_treatment)

# Create a DataFrame with the matched pairs (positions returned by the
# neighbour search are translated back to df index labels)
matched_indices = control_indices[indices.flatten()]
matched_pairs = pd.DataFrame({
    'Treatment Index': treatment_indices,
    'Control Index': matched_indices
})

# Merge the matched pairs back with the original DataFrame
matched_df_treatment = df.loc[matched_pairs['Treatment Index']].reset_index(drop=True)
matched_df_control = df.loc[matched_pairs['Control Index']].reset_index(drop=True)

# Combine matched treatment and control groups into a single DataFrame
matched_df_treatment['group'] = 'e4carrier'
matched_df_control['group'] = 'e2carrier'

# ignore_index=True gives the combined frame unique row labels; both halves
# were reset to 0..n-1, so a plain concat would contain every label twice.
matched_df_combined = pd.concat([matched_df_treatment, matched_df_control], ignore_index=True)

# Bar plots to show the frequency of CVD column based on treatment group
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

# Before matching
cvd_counts_before = df.groupby('New_Apoe_Group')['CVD'].value_counts().unstack().fillna(0)
cvd_counts_before.plot(kind='bar', stacked=True, ax=ax[0])
ax[0].set_title('CVD Frequency Before Matching')
ax[0].set_xlabel('APOE Group')
ax[0].set_ylabel('Frequency')
ax[0].legend(title='CVD')

# After matching
cvd_counts_after = matched_df_combined.groupby('group')['CVD'].value_counts().unstack().fillna(0)
cvd_counts_after.plot(kind='bar', stacked=True, ax=ax[1])
ax[1].set_title('CVD Frequency After Matching')
ax[1].set_xlabel('Group')
ax[1].set_ylabel('Frequency')
ax[1].legend(title='CVD')

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from statannot import add_stat_annotation

# Define the order of the APOE groups
apoe_order = ['e2carrier', 'e4carrier']

# Filter the DataFrame to include only the specified APOE groups
matched_df_combined = matched_df_combined[matched_df_combined['New_Apoe_Group'].isin(apoe_order)]

# Fix ONE hue order and hand it to every call below. Series.unique() returns
# levels in order of first appearance (and may include NaN), which need not be
# the order the box plot uses on its own; mixing the two puts mean markers
# and significance brackets on the wrong boxes.
age_values = matched_df_combined['Age_Group'].dropna()
if isinstance(age_values.dtype, pd.CategoricalDtype):
    present_levels = set(age_values.unique())
    hue_levels = [level for level in age_values.cat.categories if level in present_levels]
else:
    hue_levels = list(age_values.unique())

# Create the box plot with specified order and age groups
plt.figure(figsize=(14, 8))
ax = sns.boxplot(x='New_Apoe_Group', y='ascvd_risk', hue='Age_Group', data=matched_df_combined,
                 order=apoe_order, hue_order=hue_levels)

# Mean of every (APOE group, age group) cell that actually has rows;
# observed=True skips empty combinations of a categorical age column.
means = matched_df_combined.groupby(['New_Apoe_Group', 'Age_Group'], observed=True)['ascvd_risk'].mean().reset_index()

# The hue boxes share a total width of 0.8 (seaborn's default `width`),
# centred on the group's tick, so box k of n sits at (k - (n - 1) / 2) * 0.8 / n.
box_width = 0.8 / max(len(hue_levels), 1)

# Overlay mean values as diamond markers with a numeric label
for index, row in means.iterrows():
    group = row['New_Apoe_Group']
    age_group = row['Age_Group']
    mean_value = row['ascvd_risk']
    if pd.isna(mean_value):
        # Cell has rows but no usable risk values: nothing to draw.
        continue

    group_pos = apoe_order.index(group)
    hue_pos = hue_levels.index(age_group)
    pos = group_pos + (hue_pos - (len(hue_levels) - 1) / 2) * box_width

    ax.plot([pos], [mean_value], color='k', marker='d', markersize=8, linestyle='-', linewidth=2)
    ax.annotate(f'{mean_value:.2f}', xy=(pos, mean_value), xytext=(0, 10),
                textcoords='offset points', ha='center', fontsize=12, color='black')

# Add labels and title with increased font size
plt.xlabel('APOE group', fontsize=14)
plt.ylabel('ASCVD', fontsize=14)
plt.title('Box Plot of ASCVD Risk by APOE Group and Age Group', fontsize=16)

# Increase tick parameters font size
ax.tick_params(axis='both', which='major', labelsize=12)

# Boxes are addressed as (APOE group, age group). Only boxes with at least two
# risk values can take part in a t-test.
cell_sizes = matched_df_combined.dropna(subset=['ascvd_risk']).groupby(['New_Apoe_Group', 'Age_Group'], observed=True).size()
testable_boxes = {box for box, size in cell_sizes.items() if size >= 2}

age_labels = hue_levels
apoe_pairs = [
    ((first, age), (second, age))
    for pos_in_order, first in enumerate(apoe_order)
    for second in apoe_order[pos_in_order + 1:]
    for age in age_labels
]

age_pairs = [
    ((group, younger), (group, older))
    for group in apoe_order
    for pos_in_order, younger in enumerate(age_labels)
    for older in age_labels[pos_in_order + 1:]
]

# Combine all pairs, dropping those that involve an empty or one-row box
pairs = [
    pair for pair in apoe_pairs + age_pairs
    if pair[0] in testable_boxes and pair[1] in testable_boxes
]

# Perform pairwise t-tests and annotate all results
if pairs:
    add_stat_annotation(
        ax, data=matched_df_combined, x='New_Apoe_Group', y='ascvd_risk', hue='Age_Group',
        box_pairs=pairs, order=apoe_order, hue_order=hue_levels, test='t-test_ind', text_format='star', loc='inside', verbose=2
    )

# Show the plot
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, mannwhitneyu

# Box plot of the matched sample: 'ascvd_risk' per carrier group.
plt.figure(figsize=(10, 6))
matched_ax = sns.boxplot(data=matched_df_combined, x='New_Apoe_Group', y='ascvd_risk')

matched_ax.set_xlabel('APOE group')
matched_ax.set_ylabel('ASCVD')
matched_ax.set_title('Box Plot of ASCVD Risk by APOE Group')

plt.show()

# Risk values of each carrier group in the matched sample.
group_of_row = matched_df_combined['New_Apoe_Group']
matched_risk = matched_df_combined['ascvd_risk']
group1 = matched_risk[group_of_row == 'e2carrier']
group2 = matched_risk[group_of_row == 'e4carrier']

# Parametric comparison of the two groups.
t_stat, p_val_ttest = ttest_ind(group1, group2)
print(f"T-test: t-statistic = {t_stat}, p-value = {p_val_ttest}")

# Rank-based comparison of the same two groups.
u_stat, p_val_mannwhitney = mannwhitneyu(group1, group2)
print(f"Mann-Whitney U test: U-statistic = {u_stat}, p-value = {p_val_mannwhitney}")


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from statannot import add_stat_annotation

# The two carrier groups, in the left-to-right order used on the x axis.
apoe_order = ['e2carrier', 'e4carrier']

# (Re)build the age-group column of the matched sample; bins are half-open
# on the right, e.g. [40, 60).
age_bins = [0, 40, 60, 80, 100]
age_labels = ['<40', '40-60', '60-80', '80+']
matched_df_combined['Age_Group'] = pd.cut(
    matched_df_combined['Age'], bins=age_bins, labels=age_labels, right=False
)

# Keep only rows that belong to one of the two carrier groups.
matched_df_combined = matched_df_combined[matched_df_combined['New_Apoe_Group'].isin(apoe_order)]

plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=matched_df_combined, x='New_Apoe_Group', y='ascvd_risk', order=apoe_order)

# Larger fonts for labels, title and tick marks.
ax.set_xlabel('APOE group', fontsize=14)
ax.set_ylabel('ASCVD', fontsize=14)
ax.set_title('Box Plot of ASCVD Risk by APOE Group', fontsize=16)
ax.tick_params(axis='both', which='major', labelsize=12)

# Every unordered pair of groups (a single pair for two groups).
pairs = [(first, second) for pos, first in enumerate(apoe_order) for second in apoe_order[pos + 1:]]

# t-test for each pair, drawn as star annotations inside the axes.
add_stat_annotation(
    ax,
    data=matched_df_combined,
    x='New_Apoe_Group',
    y='ascvd_risk',
    box_pairs=pairs,
    order=apoe_order,
    test='t-test_ind',
    text_format='star',
    loc='inside',
    verbose=2,
)

plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from statannot import add_stat_annotation

# The two carrier groups, in the left-to-right order used on the x axis.
apoe_order = ['e2carrier', 'e4carrier']

# Keep only rows that belong to one of those groups.
matched_df_combined = matched_df_combined[matched_df_combined['New_Apoe_Group'].isin(apoe_order)]

# (Re)build the age-group column; bins are half-open on the right, e.g. [40, 60).
age_bins = [0, 40, 60, 80, 100]
age_labels = ['<40', '40-60', '60-80', '80+']
matched_df_combined['Age_Group'] = pd.cut(
    matched_df_combined['Age'], bins=age_bins, labels=age_labels, right=False
)

# Carrier groups on the x axis, one coloured box per age group.
plt.figure(figsize=(14, 8))
ax = sns.boxplot(data=matched_df_combined, x='New_Apoe_Group', y='ascvd_risk', hue='Age_Group', order=apoe_order)

ax.set_xlabel('APOE group', fontsize=14)
ax.set_ylabel('ASCVD', fontsize=14)
ax.set_title('Box Plot of ASCVD Risk by APOE Group and Age Group', fontsize=16)
ax.tick_params(axis='both', which='major', labelsize=12)

# Boxes are addressed as (APOE group, age group).
# Same age group, different carrier groups:
apoe_pairs = [
    ((first, age), (second, age))
    for pos, first in enumerate(apoe_order)
    for second in apoe_order[pos + 1:]
    for age in age_labels
]

# Same carrier group, different age groups:
age_pairs = [
    ((group, younger), (group, older))
    for group in apoe_order
    for pos, younger in enumerate(age_labels)
    for older in age_labels[pos + 1:]
]

pairs = apoe_pairs + age_pairs

# t-test for each pair, drawn as star annotations inside the axes.
add_stat_annotation(
    ax,
    data=matched_df_combined,
    x='New_Apoe_Group',
    y='ascvd_risk',
    hue='Age_Group',
    box_pairs=pairs,
    order=apoe_order,
    test='t-test_ind',
    text_format='star',
    loc='inside',
    verbose=2,
)

plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, mannwhitneyu

# Variable shown in the plot AND compared by the tests below. Previously the
# tests were run on 'ascvd_risk' of the matched sample, i.e. on a different
# variable and a different table than the plot.
value_column = 'Total_Cholesterol'

# Create the box plot
plt.figure(figsize=(10, 6))
sns.boxplot(x='New_Apoe_Group', y=value_column, data=df)

# Add labels and title
plt.xlabel('APOE group')
plt.ylabel('Total_Cholesterol')
plt.title('Box Plot of Total Cholesterol by APOE Group')

# Show the plot
plt.show()

# Separate the plotted data into the two groups; missing values are dropped
# because they would turn the test statistics into NaN.
# NOTE(review): the plot uses `df`, so the tests now do too - switch both to
# `matched_df_combined` if the matched sample was intended.
group1 = df[df['New_Apoe_Group'] == 'e2carrier'][value_column].dropna()
group2 = df[df['New_Apoe_Group'] == 'e4carrier'][value_column].dropna()

# Perform t-test
t_stat, p_val_ttest = ttest_ind(group1, group2)
print(f"T-test: t-statistic = {t_stat}, p-value = {p_val_ttest}")

# Perform Mann-Whitney U test
u_stat, p_val_mannwhitney = mannwhitneyu(group1, group2)
print(f"Mann-Whitney U test: U-statistic = {u_stat}, p-value = {p_val_mannwhitney}")


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, mannwhitneyu

# Variable shown in the plot AND compared by the tests below. Previously the
# tests were run on 'ascvd_risk' of the matched sample, i.e. on a different
# variable and a different table than the plot.
value_column = 'HDL'

# Create the box plot
plt.figure(figsize=(10, 6))
sns.boxplot(x='New_Apoe_Group', y=value_column, data=df)

# Add labels and title
plt.xlabel('APOE group')
plt.ylabel('HDL')
plt.title('Box Plot of HDL by APOE Group')

# Show the plot
plt.show()

# Separate the plotted data into the two groups; missing values are dropped
# because they would turn the test statistics into NaN.
# NOTE(review): the plot uses `df`, so the tests now do too - switch both to
# `matched_df_combined` if the matched sample was intended.
group1 = df[df['New_Apoe_Group'] == 'e2carrier'][value_column].dropna()
group2 = df[df['New_Apoe_Group'] == 'e4carrier'][value_column].dropna()

# Perform t-test
t_stat, p_val_ttest = ttest_ind(group1, group2)
print(f"T-test: t-statistic = {t_stat}, p-value = {p_val_ttest}")

# Perform Mann-Whitney U test
u_stat, p_val_mannwhitney = mannwhitneyu(group1, group2)
print(f"Mann-Whitney U test: U-statistic = {u_stat}, p-value = {p_val_mannwhitney}")


# Cardiovascular disease 

In [None]:
import pandas as pd
import numpy as np

# Load the ASCVD table from disk
df = pd.read_csv('APOE_file_ASCVD.csv')

# Row masks for the genotypes that make up each of the two groups
e2_genotype_mask = df["APOE_group"].isin(["e2e2", "e2e3"])
e4_genotype_mask = df["APOE_group"].isin(["e3e4", "e4e4"])

# e2e2/e2e3 -> "APOE2", e3e4/e4e4 -> "APOE4", every other genotype -> "Remove"
df["New_Apoe_Group"] = np.where(
    e2_genotype_mask,
    "APOE2",
    np.where(e4_genotype_mask, "APOE4", "Remove"),
)

# Drop the rows that were labelled "Remove" (i.e. neither of the two groups)
df = df.loc[df["New_Apoe_Group"].ne("Remove")]

In [None]:
# Sum the CVD_at_timeLab column within each New_Apoe_Group
# NOTE(review): presumably CVD_at_timeLab is a 0/1 flag, so each sum counts
# flagged rows (not necessarily distinct people) -- confirm against the data.
grouped_data = df.groupby('New_Apoe_Group')['CVD_at_timeLab'].sum()

# Create bar plot
plt.figure(figsize=(10, 6))
grouped_data.plot(kind='bar')
plt.title('Number of CVD')
plt.xlabel('New Apoe Group')
plt.xticks(rotation=0)  # keep the group names horizontal
plt.show()

In [None]:
# Sum the CVD_at_timeLab column within each APOE_group (individual genotype)
# NOTE(review): presumably CVD_at_timeLab is a 0/1 flag, so each sum counts
# flagged rows (not necessarily distinct people) -- confirm against the data.
grouped_data = df.groupby('APOE_group')['CVD_at_timeLab'].sum()

# Create bar plot
plt.figure(figsize=(10, 6))
grouped_data.plot(kind='bar')
plt.title('Number of CVD')
# The bars are indexed by APOE_group, so label the axis accordingly
plt.xlabel('APOE Group')
plt.xticks(rotation=0)  # keep the group names horizontal
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# CVD_at_timeLab split by New_Apoe_Group; both aggregations below reuse it
cvd_by_new_group = df.groupby('New_Apoe_Group')['CVD_at_timeLab']

# Denominator: non-null CVD_at_timeLab rows per group
# NOTE(review): this counts rows; if a person can appear on several rows the
# figure is per record rather than per patient -- confirm.
total_patients = cvd_by_new_group.count()

# Numerator: per-group sum of CVD_at_timeLab
cvd_patients = cvd_by_new_group.sum()

# Share of the group's rows with CVD, expressed in percent
cvd_percentage = cvd_patients.div(total_patients).mul(100)

# Bar chart with horizontal group names
plt.figure(figsize=(10, 6))
cvd_percentage.plot.bar(color='skyblue', rot=0)
plt.gca().set(
    title='Percentage of Patients with CVD in Each APOE Group',
    xlabel='APOE Group',
    ylabel='Percentage of Patients with CVD',
)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# CVD_at_timeLab split by APOE_group (genotype); both aggregations below reuse it
cvd_by_genotype = df.groupby('APOE_group')['CVD_at_timeLab']

# Denominator: non-null CVD_at_timeLab rows per genotype
# NOTE(review): this counts rows; if a person can appear on several rows the
# figure is per record rather than per patient -- confirm.
total_patients = cvd_by_genotype.count()

# Numerator: per-genotype sum of CVD_at_timeLab
cvd_patients = cvd_by_genotype.sum()

# Share of the genotype's rows with CVD, expressed in percent
cvd_percentage = cvd_patients.div(total_patients).mul(100)

# Bar chart with horizontal genotype names
plt.figure(figsize=(10, 6))
cvd_percentage.plot.bar(color='skyblue', rot=0)
plt.gca().set(
    title='Percentage of Patients with CVD in Each APOE Group',
    xlabel='APOE Group',
    ylabel='Percentage of Patients with CVD',
)
plt.show()


# Time series analysis

In [None]:
# Descriptive statistics: HDL by time and APOE group

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the full ASCVD table (all APOE groups)
df = pd.read_csv('APOE_file_ASCVD.csv')

# Mean, standard deviation and non-null count of HDL for every
# (Total_Cholesterol_time, APOE_group) combination
descriptive_stats = (
    df.groupby(['Total_Cholesterol_time', 'APOE_group'])['HDL']
    .agg(['mean', 'std', 'count'])
)

# One HDL line per APOE group; ci='sd' asks seaborn for a standard-deviation band
# NOTE(review): newer seaborn releases replace `ci` with `errorbar` -- check the installed version.
plt.figure(figsize=(10, 6))
sns.lineplot(x='Total_Cholesterol_time', y='HDL', hue='APOE_group', ci='sd', data=df)
plt.gca().set(
    title='HDL Levels Over Time by APOE Group',
    xlabel='Time',
    ylabel='HDL Level',
)
plt.legend(title='APOE Group')
plt.show()


In [None]:
# Descriptive statistics: HDL by time and APOE group (selected genotypes only)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the ASCVD table
df = pd.read_csv('APOE_file_ASCVD.csv')

# Row masks for the genotypes that make up each of the two groups
e2_genotype_mask = df["APOE_group"].isin(["e2e2", "e2e3"])
e4_genotype_mask = df["APOE_group"].isin(["e3e4", "e4e4"])

# e2e2/e2e3 -> "APOE2", e3e4/e4e4 -> "APOE4", every other genotype -> "Remove"
df["New_Apoe_Group"] = np.where(
    e2_genotype_mask,
    "APOE2",
    np.where(e4_genotype_mask, "APOE4", "Remove"),
)

# Drop the rows that were labelled "Remove" (i.e. neither of the two groups)
df = df.loc[df["New_Apoe_Group"].ne("Remove")]

# Mean, standard deviation and non-null count of HDL for every
# (Total_Cholesterol_time, APOE_group) combination
# NOTE(review): grouping and hue use APOE_group, not the New_Apoe_Group column
# created above -- confirm that per-genotype lines are intended.
descriptive_stats = (
    df.groupby(['Total_Cholesterol_time', 'APOE_group'])['HDL']
    .agg(['mean', 'std', 'count'])
)

# One HDL line per remaining genotype; ci='sd' asks seaborn for a standard-deviation band
plt.figure(figsize=(10, 6))
sns.lineplot(x='Total_Cholesterol_time', y='HDL', hue='APOE_group', ci='sd', data=df)
plt.gca().set(
    title='HDL Levels Over Time by APOE Group',
    xlabel='Time',
    ylabel='HDL Level',
)
plt.legend(title='APOE Group')
plt.show()


In [None]:
# HDL over Total_Cholesterol_time, one line per APOE_group, drawn with ci=95
# (a 95% confidence band instead of the standard-deviation band)
plt.figure(figsize=(10, 6))
sns.lineplot(x='Total_Cholesterol_time', y='HDL', hue='APOE_group', ci=95, data=df)
plt.gca().set(
    title='HDL Levels Over Time by APOE Group',
    xlabel='Time',
    ylabel='HDL Level',
)
plt.legend(title='APOE Group')
plt.show()

In [None]:
import statsmodels.api as sm
# `mixedlm` is called through the formula interface `smf`, which this cell
# must import itself; `statsmodels.api as sm` does not define the name `smf`.
import statsmodels.formula.api as smf

# Fit a mixed-effects model: HDL explained by time, APOE group and their
# interaction, with rows grouped by person_id
# NOTE(review): if Total_Cholesterol_time is stored as text it will be treated
# as categorical by the formula -- confirm its dtype.
model_mixed = smf.mixedlm('HDL ~ Total_Cholesterol_time * APOE_group', data=df, groups=df['person_id']).fit()
print(model_mixed.summary())
