# Notebook for the statistical analysis of cell type abundance in the COPD project

- **Developed by:** Anna Maguza
- **Institute of Computational Biology - Computational Health Department - Helmholtz Munich**
- 27th February 2022

#### Load required packages

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [2]:
import statsmodels as stats
from scipy.stats import f_oneway
import statsmodels.api as sm
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import MultiComparison, pairwise_tukeyhsd

#### Setup Cells

In [3]:
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline

In [4]:
# Scanpy session setup: choose the logging level, print the versions of the
# core packages (so the run is traceable) and apply scanpy's figure defaults.
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()  # emits the package-version line shown in the cell output
sc.settings.set_figure_params(dpi=80, facecolor='white')  # 80 dpi figures on a white background

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.3 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.10.1 pandas==1.5.3 scikit-learn==1.2.2 statsmodels==0.13.5 pynndescent==0.5.8


#### Load Data

In [5]:
# Single place to edit when the data lives somewhere else; both paths below are
# derived from it, so they can never point at different directories.
DATA_DIR = '/Users/anna.maguza/Desktop/Data/COPD_project'

# .h5ad file that is read into `adata`.
input_data = f'{DATA_DIR}/Marburg_All_ctl230321_leiden_states.raw.h5ad'
# Presumably the destination for a processed copy; it is not written by any
# cell visible in this notebook -- TODO confirm it is still needed.
output_data = f'{DATA_DIR}/Marburg_All_ctl230321_leiden_states_output.h5ad'

In [6]:
# Read the .h5ad file; `adata.obs` supplies the per-observation annotations
# ('group', 'leiden_states') used by the analysis below.
adata = sc.read_h5ad(input_data)

## One-way ANOVA statistical analysis

In [7]:
# Keep only the two annotation columns the abundance analysis needs:
# the condition label ('group') and the cell type label ('leiden_states').
obs_df = adata.obs.loc[:, ['group', 'leiden_states']]

In [8]:
# Count the cells of each cell type within each condition.
# Rows are conditions ('group'), columns are cell types ('leiden_states');
# missing combinations are filled with 0. These are raw counts, not proportions.
counts_per_group = obs_df.groupby('group')['leiden_states'].value_counts()
cell_types_number = counts_per_group.unstack().fillna(0)

### ANOVA test for only one cell type (SupraB_0)

In [17]:
# Isolate the SupraB_0 counts and flip the table so that every condition
# becomes a column (leaving a single row labelled SupraB_0).
filtered = cell_types_number.loc[:, ['SupraB_0']].transpose()

In [16]:
# One-way ANOVA across the four conditions for SupraB_0.
# `f_oneway` comes from the import cell at the top of the notebook. Importing
# `scipy.stats as stats` here would silently rebind `stats`, which the import
# cell already uses as the alias for `statsmodels`.
#
# NOTE: `filtered` holds a single count per condition, so every group handed to
# the test contains exactly one observation. With no within-group variance the
# F-statistic is undefined and the result is nan; counts per replicate/sample
# are required for a meaningful test.
conditions = ['healthy_ctrl', 'healthy_iav', 'copd_ctrl', 'copd_iav']
fvalue, pvalue = f_oneway(*(filtered[condition] for condition in conditions))
print(fvalue, pvalue)

nan nan




### ANOVA test for each cell type separately (in a loop)

In [18]:
# Convert the wide count table to long format: one row per
# (condition, cell type) pair with its count in 'value'.
# NOTE: the column labelled 'index' holds the condition ('group') label.
cell_types_number_flat = (
    cell_types_number
    .stack()
    .reset_index()
    .set_axis(['index', 'leiden_states', 'value'], axis=1)
)

In [19]:
# Distinct cell type labels and distinct condition labels found in the long
# table (the 'index' column stores the condition label).
cell_types = cell_types_number_flat.loc[:, 'leiden_states'].unique()
patient_groups = cell_types_number_flat.loc[:, 'index'].unique()


In [20]:
# One-way ANOVA per cell type, comparing its counts across the patient groups.
# Results are stored as {cell type: (F-statistic, p-value)}.
# NOTE: the long table holds one count per (group, cell type) pair, so each
# sample handed to f_oneway has a single observation and the test yields nan
# (as the cell output shows); replicate-level counts are needed for real statistics.
anova_results = {}

for cell_type in cell_types:
    # Boolean mask selecting the rows of the current cell type
    is_cell_type = cell_types_number_flat['leiden_states'] == cell_type

    # One array of values per patient group, in `patient_groups` order
    samples = [
        cell_types_number_flat.loc[
            is_cell_type & (cell_types_number_flat['index'] == group), 'value'
        ].values
        for group in patient_groups
    ]

    # Run the test and keep the (F, p) pair under the cell type's name
    f_stat, p_value = f_oneway(*samples)
    anova_results[cell_type] = (f_stat, p_value)

# Report one line per cell type
for cell_type, (f_stat, p_value) in anova_results.items():
    print(f"{cell_type}: F-statistic = {f_stat:.4f}, p-value = {p_value:.4e}")


SupraB_0: F-statistic = nan, p-value = nan
Club_1: F-statistic = nan, p-value = nan
Basal_2: F-statistic = nan, p-value = nan
Goblet_3: F-statistic = nan, p-value = nan
Goblet_4: F-statistic = nan, p-value = nan
SupraB_5: F-statistic = nan, p-value = nan
SupraB_6: F-statistic = nan, p-value = nan
SupraB_7: F-statistic = nan, p-value = nan
Goblet_8: F-statistic = nan, p-value = nan
Club_9: F-statistic = nan, p-value = nan
Club_10: F-statistic = nan, p-value = nan
Goblet_11: F-statistic = nan, p-value = nan
Goblet_12: F-statistic = nan, p-value = nan
Goblet_13: F-statistic = nan, p-value = nan
SupraB_14: F-statistic = nan, p-value = nan
SupraB_15: F-statistic = nan, p-value = nan
Basal_16: F-statistic = nan, p-value = nan
MultiC_17: F-statistic = nan, p-value = nan
MultiC_18: F-statistic = nan, p-value = nan
MultiC_19: F-statistic = nan, p-value = nan
Club_20: F-statistic = nan, p-value = nan
Goblet_21: F-statistic = nan, p-value = nan
Basal_22: F-statistic = nan, p-value = nan
Club_23: 



In [24]:
from statsmodels.stats.multicomp import MultiComparison

# Tukey HSD post-hoc comparison of the patient groups, run once per cell type.
# Results are stored as {cell type: Tukey HSD result object}.
# NOTE: with a single count per group the adjusted p-values and confidence
# bounds come out as nan (see the cell output); only the mean differences are
# populated.
cell_types = cell_types_number_flat['leiden_states'].unique()

tukey_hsd_results = {}
for cell_type in cell_types:
    # Rows of the long table that belong to this cell type
    per_type = cell_types_number_flat.loc[cell_types_number_flat['leiden_states'] == cell_type]

    # Observations are the counts; group labels are the conditions ('index' column)
    comparison = MultiComparison(per_type['value'], per_type['index'])
    tukey_hsd_results[cell_type] = comparison.tukeyhsd()

# Show every summary table, separated by blank lines
for cell_type, result in tukey_hsd_results.items():
    print(f"Tukey HSD test results for cell type: {cell_type}")
    print(result)
    print('\n')


Tukey HSD test results for cell type: SupraB_0
    Multiple Comparison of Means - Tukey HSD, FWER=0.05    
   group1       group2    meandiff p-adj lower upper reject
-----------------------------------------------------------
   copd_ctrl     copd_iav  -1406.0   nan   nan   nan  False
   copd_ctrl healthy_ctrl  -1575.0   nan   nan   nan  False
   copd_ctrl  healthy_iav  -1893.0   nan   nan   nan  False
    copd_iav healthy_ctrl   -169.0   nan   nan   nan  False
    copd_iav  healthy_iav   -487.0   nan   nan   nan  False
healthy_ctrl  healthy_iav   -318.0   nan   nan   nan  False
-----------------------------------------------------------


Tukey HSD test results for cell type: Club_1
    Multiple Comparison of Means - Tukey HSD, FWER=0.05    
   group1       group2    meandiff p-adj lower upper reject
-----------------------------------------------------------
   copd_ctrl     copd_iav   -584.0   nan   nan   nan  False
   copd_ctrl healthy_ctrl    105.0   nan   nan   nan  False
   cop

  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


In [27]:
# Save the Tukey HSD summaries to a plain-text report, one section per cell type.
# '/' is used as the only path separator: on POSIX systems a '\' is an ordinary
# file-name character, so mixing it in would write the report into the parent
# directory under a name containing a backslash instead of into COPD_project/.
# Mode "w" is sufficient because the file is only written, never read back.
with open("/Users/anna.maguza/Desktop/Data/COPD_project/Tukey_HSD_test_results.txt", "w") as f:
    for cell_type, result in tukey_hsd_results.items():
        f.write(f"Tukey HSD test results for cell type: {cell_type} \n")
        f.write(f"{result} \n")
        f.write('\n')
        