#### Load required packages

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

#### Setup Cells

In [None]:
%matplotlib inline

In [None]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

#### Upload Data

In [None]:
input_data = '/Users/anna.maguza/Desktop/Data/COPD_project/Marburg_All_ctl230321_leiden_states.raw.h5ad'
output_data = '/Users/anna.maguza/Desktop/Data/COPD_project/Marburg_All_ctl230321_leiden_states_output.h5ad'

In [None]:
adata = sc.read_h5ad(input_data)

In [None]:
adata.obs

In [None]:
adata.obs['leiden_states'].value_counts()

In [None]:
adata.obs['group'].value_counts()

In [None]:
# Count the number of cells of each cell type in each group
cell_type_counts = adata.obs.groupby(['group', 'leiden_states'])['group'].count().unstack()

In [None]:
# Calculate the total number of cells in each group
total_cell_counts = cell_type_counts.sum(axis=1)

In [None]:
# Calculate the relative abundance of each cell type in each group
relative_abundances = (cell_type_counts.T / total_cell_counts).T

### COPD_IAV

In [None]:
# Calculate the percentage increase/decrease of each cell type in copd_iav relative to each other group separately
other_groups = ['healthy_ctrl', 'healthy_iav', 'copd_ctrl']
reference_abundances = relative_abundances.loc[other_groups].mean(axis=0)
percentage_changes = {}
for group in other_groups:
    group_abundances = relative_abundances.loc[group]
    percentage_changes[group] = ((group_abundances / reference_abundances) - 1) * 100

# Save the percentage changes as a CSV file
percentage_changes_df = pd.DataFrame(percentage_changes).round(2)
percentage_changes_df.index.name = 'Cell Type'
#percentage_changes_df.to_csv('/lustre/groups/talaveralopez/workspace/anna.maguza/COPD/Percentage_change/copd_iav_percentage_changes.csv')

# Print the percentage changes
print(percentage_changes_df)

### copd_ctrl

In [None]:
# Calculate the percentage increase/decrease of each cell type in copd_iav relative to each other group separately
other_groups = ['healthy_ctrl', 'healthy_iav', 'copd_iav']
reference_abundances = relative_abundances.loc[other_groups].mean(axis=0)
percentage_changes = {}
for group in other_groups:
    group_abundances = relative_abundances.loc[group]
    percentage_changes[group] = ((group_abundances / reference_abundances) - 1) * 100

# Save the percentage changes as a CSV file
percentage_changes_df = pd.DataFrame(percentage_changes).round(2)
percentage_changes_df.index.name = 'Cell Type'
#percentage_changes_df.to_csv('/lustre/groups/talaveralopez/workspace/anna.maguza/COPD/Percentage_change/copd_control_percentage_changes.csv')

# Print the percentage changes
print(percentage_changes_df)

### healthy_ctrl

In [None]:
# Calculate the percentage increase/decrease of each cell type in copd_iav relative to each other group separately
other_groups = ['copd_ctrl', 'healthy_iav', 'copd_iav']
reference_abundances = relative_abundances.loc[other_groups].mean(axis=0)
percentage_changes = {}
for group in other_groups:
    group_abundances = relative_abundances.loc[group]
    percentage_changes[group] = ((group_abundances / reference_abundances) - 1) * 100

# Save the percentage changes as a CSV file
percentage_changes_df = pd.DataFrame(percentage_changes).round(2)
percentage_changes_df.index.name = 'Cell Type'
#percentage_changes_df.to_csv('/lustre/groups/talaveralopez/workspace/anna.maguza/COPD/Percentage_change/healthy_control_percentage_changes.csv')

# Print the percentage changes
print(percentage_changes_df)

### healthy_iav

In [None]:
# Calculate the percentage increase/decrease of each cell type in copd_iav relative to each other group separately
other_groups = ['copd_ctrl', 'healthy_ctrl', 'copd_iav']
reference_abundances = relative_abundances.loc[other_groups].mean(axis=0)
percentage_changes = {}
for group in other_groups:
    group_abundances = relative_abundances.loc[group]
    percentage_changes[group] = ((group_abundances / reference_abundances) - 1) * 100

# Save the percentage changes as a CSV file
percentage_changes_df = pd.DataFrame(percentage_changes).round(2)
percentage_changes_df.index.name = 'Cell Type'
#percentage_changes_df.to_csv('/lustre/groups/talaveralopez/workspace/anna.maguza/COPD/Percentage_change/healthy_iav_percentage_changes.csv')

# Print the percentage changes
print(percentage_changes_df)

In [None]:
# Calculate the percentage increase/decrease of each cell type in copd_iav relative to each other group separately
other_groups = ['healthy_ctrl', 'healthy_iav', 'copd_ctrl']
reference_abundances = relative_abundances.loc[other_groups].mean(axis=0)
percentage_changes = {}
for group in other_groups:
    group_abundances = relative_abundances.loc[group]
    percentage_changes[group] = ((group_abundances / reference_abundances) - 1) * 100

# Print the percentage increase/decrease of each cell type in copd_iav relative to each other group separately
for group, changes in percentage_changes.items():
    print(f"Copd_iav has {changes.loc[changes.index!='nan'].mean().round(2)}% more {group} cells than other groups on average")
    print(changes.loc[changes.index!='nan'].round(2))
    print()

# ANOVA TEST

In [None]:
import statsmodels as stats
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [None]:
adata.obs

In [None]:
# Get the cell type and condition information
obs_df = adata.obs[['group', 'leiden_states']]

In [None]:
# Calculate the proprtions of each cell type in each condition
cell_type_proportions = obs_df.groupby('group')['leiden_states'].value_counts(normalize=True).unstack().fillna(0)

In [None]:
# Calculate the proprtions of each cell type in each condition
cell_type_proportions = obs_df.groupby('group')['leiden_states'].value_counts().unstack().fillna(0)

In [None]:
# Perform one-way ANOVA to test for differences in cell type proportions between conditions
f_stats, p_value = f_oneway(*[cell_type_proportions.loc[group] for group in cell_type_proportions.index])
print('ANOVA F-Statisic:', f_stats)
print('ANOVA p-value:', p_value)

In [None]:
# Perform Tukey's HSD test to test for differences in cell type proportions between conditions
cell_type_proportions_flat = cell_type_proportions.stack().reset_index()
cell_type_proportions_flat.columns = ['group', 'leiden_states', 'proportion']

tukey_results = pairwise_tukeyhsd(cell_type_proportions_flat['proportion'], cell_type_proportions_flat['group'])


In [None]:
# perform Tukey's HSD test for each cell type separately
cell_types = cell_type_proportions_flat['leiden_states'].unique()

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import MultiComparison, pairwise_tukeyhsd


In [None]:
# perform Tukey's HSD test for each cell type separately
cell_types = cell_type_proportions_flat['leiden_states'].unique()
for cell_type in cell_types:
    df = cell_type_proportions_flat[cell_type_proportions_flat['leiden_states'] == cell_type]
    mc = MultiComparison(df['leiden_states'], df['group'])
    result = mc.tukeyhsd()
    print(f"Tukey's HSD test for {cell_type}:")
    print(result)
    print()

In [None]:
print(tukey_results)

# New trial

In [30]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import MultiComparison, pairwise_tukeyhsd


In [31]:
# create a new DataFrame with the relevant data
data = pd.DataFrame({'group': adata.obs['group'], 'cell_type': adata.obs['leiden_states']})

In [32]:
# convert the group and cell_type columns to numerical labels
data['group_id'] = data['group'].astype('category').cat.codes
data['cell_type'] = data['cell_type'].astype('category')
data['cell_type_id'] = data['cell_type'].cat.codes

In [33]:
# perform two-way ANOVA
formula = 'cell_type_id ~ group_id + cell_type_id + group_id:cell_type_id'
model = ols(formula, data).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

In [34]:
# perform Tukey's HSD test for each cell type separately
cell_types = data['cell_type'].unique()
for cell_type in cell_types:
    df = data[data['cell_type'] == cell_type]
    mc = MultiComparison(df['cell_type_id'], df['group'])
    result = mc.tukeyhsd()
    print(f"Tukey's HSD test for {cell_type}:")
    print(result)
    print()

  st_range = np.abs(meandiffs) / std_pairs #studentized range statistic


Tukey's HSD test for SupraB_14:
    Multiple Comparison of Means - Tukey HSD, FWER=0.05    
   group1       group2    meandiff p-adj lower upper reject
-----------------------------------------------------------
   copd_ctrl     copd_iav      0.0   nan   0.0   0.0  False
   copd_ctrl healthy_ctrl      0.0   nan   0.0   0.0  False
   copd_ctrl  healthy_iav      0.0   nan   0.0   0.0  False
    copd_iav healthy_ctrl      0.0   nan   0.0   0.0  False
    copd_iav  healthy_iav      0.0   nan   0.0   0.0  False
healthy_ctrl  healthy_iav      0.0   nan   0.0   0.0  False
-----------------------------------------------------------

Tukey's HSD test for Club_1:
    Multiple Comparison of Means - Tukey HSD, FWER=0.05    
   group1       group2    meandiff p-adj lower upper reject
-----------------------------------------------------------
   copd_ctrl     copd_iav      0.0   nan   0.0   0.0  False
   copd_ctrl healthy_ctrl      0.0   nan   0.0   0.0  False
   copd_ctrl  healthy_iav      0.0   n

# Trial 3

In [None]:
# Get the cell type and condition information
obs_df = adata.obs[['group', 'leiden_states']]

# Calculate the proprtions of each cell type in each condition
cell_types_number = obs_df.groupby('group')['leiden_states'].value_counts().unstack().fillna(0)

# Perform Tukey's HSD test to test for differences in cell type numbers between conditions
cell_types_number_flat = cell_types_number.stack().reset_index()
cell_types_number_flat.columns = ['index', 'leiden_states', 'value']