#### Load required packages

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

#### Setup Cells

In [2]:
%matplotlib inline

In [3]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.21.5 scipy==1.7.3 pandas==1.5.1 scikit-learn==1.1.3 statsmodels==0.13.2 pynndescent==0.5.8


#### Upload Data

In [30]:
input_data = '/lustre/groups/talaveralopez/datasets/single_cell/lung/influenza/working_objects/scanvi_annotated/BrEpit_ALL_ctl221212_leiden_states.raw.h5ad'
output_data = '/lustre/groups/talaveralopez/datasets/single_cell/lung/influenza/working_objects/scanvi_annotated/BrEpit_ALL_ctl221212_leiden_states_output.raw.h5ad'

In [31]:
adata = sc.read_h5ad(input_data)

In [32]:
adata.obs

Unnamed: 0,sex,age,condition,ethnicity,PaCO2,donor,infection,disease,SMK,illumina_stimunr,...,doublet_scores,predicted_doublets,sample_group,group,_scvi_batch,_scvi_labels,C_scANVI,cell_type,leiden,leiden_states
99-ST07_CSE_CTRL-V1,male,67,CTRL,caucasian,88.8,GNR-06,CTRL,healthy,7,21_0017,...,0.019385,False,V1,healthy_ctrl,9,4,Goblet,Goblet,14,Goblet_14
208-ST07_CSE_CTRL-V1,male,67,CTRL,caucasian,88.8,GNR-06,CTRL,healthy,7,21_0017,...,0.035455,False,V1,healthy_ctrl,9,8,Suprabasal,Suprabasal,5,SupraB_5
210-ST07_CSE_CTRL-V1,male,67,CTRL,caucasian,88.8,GNR-06,CTRL,healthy,7,21_0017,...,0.012496,False,V1,healthy_ctrl,9,4,Goblet,Goblet,14,Goblet_14
278-ST07_CSE_CTRL-V1,male,67,CTRL,caucasian,88.8,GNR-06,CTRL,healthy,7,21_0017,...,0.083081,False,V1,healthy_ctrl,9,8,Suprabasal,Suprabasal,5,SupraB_5
499-ST07_CSE_CTRL-V1,male,67,CTRL,caucasian,88.8,GNR-06,CTRL,healthy,7,21_0017,...,0.111111,False,V1,healthy_ctrl,9,3,Club,Club,2,Club_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
883705-ST08_IAV-V6,male,66,IAV,caucasian,76.8,CSE-03,IAV,COPD,8,21_0507,...,0.085450,False,V6,copd_iav,3,10,Basal resting,Basal resting,1,Stromal_1
883878-ST08_IAV-V6,male,66,IAV,caucasian,76.8,CSE-03,IAV,COPD,8,21_0507,...,0.134884,False,V6,copd_iav,3,10,Club,Club,9,Goblet_9
884137-ST08_IAV-V6,male,66,IAV,caucasian,76.8,CSE-03,IAV,COPD,8,21_0507,...,0.063788,False,V6,copd_iav,3,10,Goblet,Goblet,0,Goblet_0
884341-ST08_IAV-V6,male,66,IAV,caucasian,76.8,CSE-03,IAV,COPD,8,21_0507,...,0.084048,False,V6,copd_iav,3,10,Goblet,Goblet,9,Goblet_9


In [33]:
adata.obs['leiden_states'].value_counts()

Goblet_0         12525
Stromal_1        12517
Club_2           11778
BasalR_3         11228
Goblet_4         10138
SupraB_5          7879
SupraB_6          7318
Basal_7           7260
Goblet_8          6702
Goblet_9          6588
Club_10           6536
Club_11           5586
MultiC_12         4246
MultiC_13         3798
Goblet_14         3669
Club_15           3216
Goblet_16         3178
Stromal_17        2981
SupraB_18         2583
GobletClub_19      950
MultiC_20          791
Name: leiden_states, dtype: int64

In [34]:
adata.obs['group'].value_counts()

copd_ctrl       34412
copd_iav        33003
healthy_ctrl    32767
healthy_iav     31285
Name: group, dtype: int64

In [35]:
# Count the number of cells of each cell type in each group
cell_type_counts = adata.obs.groupby(['group', 'leiden_states'])['group'].count().unstack()

In [36]:
# Calculate the total number of cells in each group
total_cell_counts = cell_type_counts.sum(axis=1)

In [37]:
# Calculate the relative abundance of each cell type in each group
relative_abundances = (cell_type_counts.T / total_cell_counts).T

### COPD_IAV

In [42]:
# Calculate the percentage increase/decrease of each cell type in copd_iav relative to each other group separately
other_groups = ['healthy_ctrl', 'healthy_iav', 'copd_ctrl']
reference_abundances = relative_abundances.loc[other_groups].mean(axis=0)
percentage_changes = {}
for group in other_groups:
    group_abundances = relative_abundances.loc[group]
    percentage_changes[group] = ((group_abundances / reference_abundances) - 1) * 100

# Save the percentage changes as a CSV file
percentage_changes_df = pd.DataFrame(percentage_changes).round(2)
percentage_changes_df.index.name = 'Cell Type'
percentage_changes_df.to_csv('/lustre/groups/talaveralopez/workspace/anna.maguza/COPD/Percentage_change/copd_iav_percentage_changes.csv')

# Print the percentage changes
print(percentage_changes_df)

               healthy_ctrl  healthy_iav  copd_ctrl
Cell Type                                          
Goblet_0              26.01       -64.01      37.99
Stromal_1            -29.73        15.66      14.07
Club_2                19.94       -34.68      14.74
BasalR_3               2.47        11.99     -14.45
Goblet_4              17.14       -35.03      17.89
SupraB_5               2.51        14.43     -16.94
SupraB_6               1.89        33.80     -35.69
Basal_7              -16.00        19.02      -3.02
Goblet_8              -0.41        -2.70       3.10
Goblet_9             -66.65        29.91      36.73
Club_10               -3.17        24.41     -21.24
Club_11               -6.75       -10.24      17.00
MultiC_12             20.78        24.48     -45.26
MultiC_13              2.56        31.54     -34.10
Goblet_14             -2.69        24.38     -21.69
Club_15               19.40       -44.40      25.00
Goblet_16            -96.58       174.81     -78.23
Stromal_17  

### copd_ctrl

In [43]:
# Calculate the percentage increase/decrease of each cell type in copd_iav relative to each other group separately
other_groups = ['healthy_ctrl', 'healthy_iav', 'copd_iav']
reference_abundances = relative_abundances.loc[other_groups].mean(axis=0)
percentage_changes = {}
for group in other_groups:
    group_abundances = relative_abundances.loc[group]
    percentage_changes[group] = ((group_abundances / reference_abundances) - 1) * 100

# Save the percentage changes as a CSV file
percentage_changes_df = pd.DataFrame(percentage_changes).round(2)
percentage_changes_df.index.name = 'Cell Type'
percentage_changes_df.to_csv('/lustre/groups/talaveralopez/workspace/anna.maguza/COPD/Percentage_change/copd_control_percentage_changes.csv')

# Print the percentage changes
print(percentage_changes_df)

               healthy_ctrl  healthy_iav  copd_iav
Cell Type                                         
Goblet_0              89.31       -45.92    -43.38
Stromal_1            -29.84        15.49     14.35
Club_2                34.24       -26.89     -7.35
BasalR_3              -6.18         2.53      3.65
Goblet_4              36.14       -24.49    -11.66
SupraB_5              -1.01        10.49     -9.48
SupraB_6              -5.66        23.88    -18.22
Basal_7              -19.80        13.64      6.16
Goblet_8               3.46         1.08     -4.54
Goblet_9             -77.96       -14.16     92.13
Club_10               -9.15        16.73     -7.59
Club_11                1.90        -1.92      0.02
MultiC_12             13.13        16.59    -29.72
MultiC_13             -2.80        24.67    -21.87
Goblet_14            -10.47        14.44     -3.97
Club_15               33.42       -37.88      4.46
Goblet_16            -98.46        23.82     74.63
Stromal_17           -44.05    

### healthy_ctrl

In [45]:
# Calculate the percentage increase/decrease of each cell type in copd_iav relative to each other group separately
other_groups = ['copd_ctrl', 'healthy_iav', 'copd_iav']
reference_abundances = relative_abundances.loc[other_groups].mean(axis=0)
percentage_changes = {}
for group in other_groups:
    group_abundances = relative_abundances.loc[group]
    percentage_changes[group] = ((group_abundances / reference_abundances) - 1) * 100

# Save the percentage changes as a CSV file
percentage_changes_df = pd.DataFrame(percentage_changes).round(2)
percentage_changes_df.index.name = 'Cell Type'
percentage_changes_df.to_csv('/lustre/groups/talaveralopez/workspace/anna.maguza/COPD/Percentage_change/healthy_control_percentage_changes.csv')

# Print the percentage changes
print(percentage_changes_df)

               copd_ctrl  healthy_iav  copd_iav
Cell Type                                      
Goblet_0           95.57       -48.99    -46.59
Stromal_1          -0.59         0.79     -0.20
Club_2             30.96       -25.45     -5.51
BasalR_3          -17.41         8.11      9.30
Goblet_4           36.62       -24.71    -11.91
SupraB_5          -14.44        17.87     -3.43
SupraB_6          -32.64        40.13     -7.49
Basal_7           -11.08         9.13      1.95
Goblet_8            5.82        -0.13     -5.69
Goblet_9          -26.41       -30.08     56.49
Club_10           -21.67        23.72     -2.05
Club_11            17.67        -9.73     -7.94
MultiC_12         -35.41        46.87    -11.47
MultiC_13         -29.36        41.00    -11.64
Goblet_14         -23.49        21.52      1.97
Club_15            36.82       -39.15      2.33
Goblet_16         -90.45        20.50     69.95
Stromal_17        -30.82        12.72     18.11
SupraB_18         -23.61        37.65   

### healthy_iav

In [46]:
# Calculate the percentage increase/decrease of each cell type in copd_iav relative to each other group separately
other_groups = ['copd_ctrl', 'healthy_ctrl', 'copd_iav']
reference_abundances = relative_abundances.loc[other_groups].mean(axis=0)
percentage_changes = {}
for group in other_groups:
    group_abundances = relative_abundances.loc[group]
    percentage_changes[group] = ((group_abundances / reference_abundances) - 1) * 100

# Save the percentage changes as a CSV file
percentage_changes_df = pd.DataFrame(percentage_changes).round(2)
percentage_changes_df.index.name = 'Cell Type'
percentage_changes_df.to_csv('/lustre/groups/talaveralopez/workspace/anna.maguza/COPD/Percentage_change/healthy_iav_percentage_changes.csv')

# Print the percentage changes
print(percentage_changes_df)

               copd_ctrl  healthy_ctrl  copd_iav
Cell Type                                       
Goblet_0           37.22         25.31    -62.53
Stromal_1          14.51        -29.47     14.96
Club_2              8.43         13.34    -21.77
BasalR_3          -14.80          2.05     12.75
Goblet_4           13.70         12.98    -26.69
SupraB_5          -10.79         10.10      0.69
SupraB_6          -24.20         20.09      4.11
Basal_7            -0.42        -13.75     14.17
Goblet_8            5.00          1.42     -6.42
Goblet_9          -10.99        -78.29     89.28
Club_10           -13.79          5.98      7.80
Club_11            16.31         -7.30     -9.01
MultiC_12         -34.45         44.61    -10.16
MultiC_13         -21.20         22.63     -1.43
Goblet_14         -16.09          4.26     11.82
Club_15            10.99          6.01    -17.00
Goblet_16         -84.18        -97.52    181.69
Stromal_17        -13.60        -33.90     47.51
SupraB_18         -1

In [41]:
# Calculate the percentage increase/decrease of each cell type in copd_iav relative to each other group separately
other_groups = ['healthy_ctrl', 'healthy_iav', 'copd_ctrl']
reference_abundances = relative_abundances.loc[other_groups].mean(axis=0)
percentage_changes = {}
for group in other_groups:
    group_abundances = relative_abundances.loc[group]
    percentage_changes[group] = ((group_abundances / reference_abundances) - 1) * 100

# Print the percentage increase/decrease of each cell type in copd_iav relative to each other group separately
for group, changes in percentage_changes.items():
    print(f"Copd_iav has {changes.loc[changes.index!='nan'].mean().round(2)}% more {group} cells than other groups on average")
    print(changes.loc[changes.index!='nan'].round(2))
    print()

Copd_iav has -3.98% more healthy_ctrl cells than other groups on average
leiden_states
Goblet_0         26.01
Stromal_1       -29.73
Club_2           19.94
BasalR_3          2.47
Goblet_4         17.14
SupraB_5          2.51
SupraB_6          1.89
Basal_7         -16.00
Goblet_8         -0.41
Goblet_9        -66.65
Club_10          -3.17
Club_11          -6.75
MultiC_12        20.78
MultiC_13         2.56
Goblet_14        -2.69
Club_15          19.40
Goblet_16       -96.58
Stromal_17      -32.39
SupraB_18         9.45
GobletClub_19    -3.80
MultiC_20        52.36
dtype: float64

Copd_iav has 13.39% more healthy_iav cells than other groups on average
leiden_states
Goblet_0         -64.01
Stromal_1         15.66
Club_2           -34.68
BasalR_3          11.99
Goblet_4         -35.03
SupraB_5          14.43
SupraB_6          33.80
Basal_7           19.02
Goblet_8          -2.70
Goblet_9          29.91
Club_10           24.41
Club_11          -10.24
MultiC_12         24.48
MultiC_13       