# Baseline model

In [1]:
# Optional setup: uncomment on a runtime where scanpy is not already installed.
# !pip install scanpy

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
from sklearn.linear_model import LogisticRegression

In [3]:
# Colab-only step: mount Google Drive at /content/gdrive, the location the
# data-loading cells below read from.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
# Read the MERFISH AnnData file from the mounted Drive folder, then display
# its summary as the cell output.
merfish_ad = sc.read_h5ad(
    '/content/gdrive/MyDrive/CS273 Group Project/SEAAD_MTG_MERFISH.2024-02-13.h5ad'
)
merfish_ad

AnnData object with n_obs × n_vars = 366272 × 180
    obs: 'Donor ID', 'Sex', 'Gender', 'Age at Death', 'Race (choice=White)', 'Race (choice=Black/ African American)', 'Race (choice=Asian)', 'Race (choice=American Indian/ Alaska Native)', 'Race (choice=Native Hawaiian or Pacific Islander)', 'Race (choice=Unknown or unreported)', 'Race (choice=Other)', 'specify other race', 'Hispanic/Latino', 'Highest level of education', 'Years of education', 'PMI', 'Fresh Brain Weight', 'Brain pH', 'Overall AD neuropathological Change', 'Thal', 'Braak', 'CERAD score', 'Overall CAA Score', 'Highest Lewy Body Disease', 'Total Microinfarcts (not observed grossly)', 'Total microinfarcts in screening sections', 'Atherosclerosis', 'Arteriolosclerosis', 'LATE', 'Cognitive Status', 'Last CASI Score', 'Interval from last CASI in months', 'Last MMSE Score', 'Interval from last MMSE in months', 'Last MOCA Score', 'Interval from last MOCA in months', 'APOE4 Status', 'Primary Study Name', 'Secondary Study Name', '

In [5]:
# Preview the obs columns used in this notebook: donor, cell subclass and the
# two prediction targets.
merfish_ad.obs.loc[
    :,
    ['Donor ID', 'Subclass', 'Cognitive Status', 'LATE'],
]

Unnamed: 0,Donor ID,Subclass,Cognitive Status,LATE
358139,H21.33.006,Lamp5,No dementia,Not Identified
360182,H20.33.001,L6b,No dementia,LATE Stage 2
87010,H20.33.012,Endothelial,No dementia,Not Identified
158386,H20.33.044,Astrocyte,No dementia,LATE Stage 1
79159,H21.33.011,OPC,No dementia,LATE Stage 2
...,...,...,...,...
52408,H21.33.001,Oligodendrocyte,Dementia,Not Identified
23676,H20.33.035,Microglia-PVM,No dementia,LATE Stage 2
361755,H20.33.001,Astrocyte,No dementia,LATE Stage 2
235090,H21.33.011,L2/3 IT,No dementia,LATE Stage 2


In [6]:
# Per-donor cell-type composition:
#   1. one-hot encode the Subclass of every obs row,
#   2. sum the indicator columns per donor to get counts,
#   3. divide each donor's row by its total so the proportions sum to 1.
donor_id_cell_type_one_hot = pd.get_dummies(merfish_ad.obs[['Donor ID', 'Subclass']],
                                            columns=['Subclass'])
# observed=True only matters when 'Donor ID' is categorical (not verified here):
# it keeps unused donor categories from appearing as all-zero rows, which would
# become 0/0 = NaN proportions below, and it pins the behaviour across pandas
# versions. For a non-categorical key it is a no-op.
donor_id_cell_type_counts = donor_id_cell_type_one_hot.groupby('Donor ID', observed=True).sum()
# Row-wise normalisation; axis=0 aligns the per-donor totals with the row index.
donor_id_cell_type_proportions = donor_id_cell_type_counts.div(
    donor_id_cell_type_counts.sum(axis=1), axis=0)
donor_id_cell_type_proportions.head()

Unnamed: 0_level_0,Subclass_Astrocyte,Subclass_Chandelier,Subclass_Endothelial,Subclass_L2/3 IT,Subclass_L4 IT,Subclass_L5 ET,Subclass_L5 IT,Subclass_L5/6 NP,Subclass_L6 CT,Subclass_L6 IT,...,Subclass_Microglia-PVM,Subclass_OPC,Subclass_Oligodendrocyte,Subclass_Pax6,Subclass_Pvalb,Subclass_Sncg,Subclass_Sst,Subclass_Sst Chodl,Subclass_VLMC,Subclass_Vip
Donor ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
H20.33.001,0.101699,0.002379,0.056161,0.072848,0.062695,0.005596,0.045438,0.007238,0.020239,0.05539,...,0.158362,0.043729,0.173307,0.001675,0.016185,0.002781,0.011259,0.001139,0.041718,0.01146
H20.33.004,0.093713,0.003637,0.052918,0.109931,0.06692,0.004389,0.057808,0.00882,0.009196,0.038915,...,0.230605,0.034651,0.137352,0.001547,0.019771,0.003887,0.016302,0.000669,0.043471,0.01743
H20.33.012,0.157184,0.00333,0.070885,0.150999,0.105519,0.003806,0.078211,0.010657,0.011893,0.052521,...,0.068126,0.044624,0.053378,0.001808,0.022455,0.004757,0.010086,0.001998,0.052331,0.021503
H20.33.015,0.122774,0.002849,0.043709,0.116752,0.072654,0.004144,0.044357,0.006605,0.010231,0.038464,...,0.110277,0.045976,0.231691,0.002266,0.012692,0.004727,0.008353,0.001295,0.041248,0.011202
H20.33.025,0.139682,0.002877,0.059771,0.15155,0.068978,0.003956,0.067252,0.010573,0.008128,0.026901,...,0.088326,0.032223,0.145652,0.002086,0.031936,0.007624,0.029274,0.001798,0.040711,0.026038


## Predicting cognitive status

In [7]:
# Reduce the row-level 'Cognitive Status' annotation to one label per donor by
# taking the most frequent value among that donor's rows.
# NOTE(review): pd.Series.mode keeps every tied value, so a donor with a tie
# would not reduce to a single label. Presumably each donor carries exactly
# one status; confirm.
donor_id_cognitive_status = (
    merfish_ad.obs
    .loc[:, ['Donor ID', 'Cognitive Status']]
    .groupby(['Donor ID'])['Cognitive Status']
    .agg(pd.Series.mode)
)
donor_id_cognitive_status

Donor ID
H20.33.001       Dementia
H20.33.004       Dementia
H20.33.012    No dementia
H20.33.015       Dementia
H20.33.025    No dementia
H20.33.035    No dementia
H20.33.040       Dementia
H20.33.044    No dementia
H21.33.001       Dementia
H21.33.005       Dementia
H21.33.006    No dementia
H21.33.011    No dementia
H21.33.012       Dementia
H21.33.013    No dementia
H21.33.014    No dementia
H21.33.015    No dementia
H21.33.016       Dementia
H21.33.019    No dementia
H21.33.021       Dementia
H21.33.022    No dementia
H21.33.023    No dementia
H21.33.025    No dementia
H21.33.028    No dementia
H21.33.031       Dementia
H21.33.032    No dementia
H21.33.038    No dementia
H21.33.040    No dementia
Name: Cognitive Status, dtype: object

In [8]:
# Donor-level class balance of the cognitive-status target (fractions of donors).
donor_id_cognitive_status.value_counts(normalize=True)

Cognitive Status
No dementia    0.62963
Dementia       0.37037
Name: proportion, dtype: float64

In [9]:
# Disabled: ad-hoc seeded 70/30 random split of donors. The next cell loads
# saved split indices from Drive instead, so this code is kept for reference only.
# donor_ids = merfish_ad.obs['Donor ID'].unique()

# np.random.seed(0)
# donor_ids_perm = np.random.permutation(donor_ids)
# donor_ids_train = donor_ids_perm[:round(0.7*len(donor_ids))]
# donor_ids_test = donor_ids_perm[round(0.7*len(donor_ids)):]

In [10]:
# Donor IDs in sorted order. The saved split files are used to index into this
# array (assumes they were generated against the same sorted ordering; TODO confirm).
donor_ids = np.array(sorted(merfish_ad.obs['Donor ID'].unique()))
# Keep the loaded index arrays under their own names so donor_ids_train /
# donor_ids_test only ever hold donor IDs.
train_idx = np.load('/content/gdrive/MyDrive/CS273 Group Project/data/train_id.npy')
test_idx = np.load('/content/gdrive/MyDrive/CS273 Group Project/data/test_id.npy')

donor_ids_train = donor_ids[train_idx]
donor_ids_test = donor_ids[test_idx]

# A donor present in both splits would leak test data into training, so fail
# loudly rather than report an optimistic test score.
split_overlap = np.intersect1d(donor_ids_train, donor_ids_test)
assert split_overlap.size == 0, f'Donors in both train and test: {split_overlap.tolist()}'

In [11]:
# Training split: look up feature rows and labels by donor ID, then report shapes.
cell_type_proportions_train = donor_id_cell_type_proportions.loc[donor_ids_train]
cognitive_status_train = donor_id_cognitive_status.loc[donor_ids_train]
print(cell_type_proportions_train.shape, cognitive_status_train.shape, sep='\n')

# Held-out split, selected the same way.
cell_type_proportions_test = donor_id_cell_type_proportions.loc[donor_ids_test]
cognitive_status_test = donor_id_cognitive_status.loc[donor_ids_test]
print(cell_type_proportions_test.shape, cognitive_status_test.shape, sep='\n')

(18, 24)
(18,)
(9, 24)
(9,)


In [12]:
# Baseline: default LogisticRegression on per-donor cell-type proportions.
# NOTE(review): the features are small-valued proportions passed unscaled to the
# default-regularised model. Check whether the fit does more than predict the
# majority class, and consider standardising the features; TODO confirm.
model = LogisticRegression()
model.fit(cell_type_proportions_train, cognitive_status_train)

# Mean accuracy on the donors the model was fitted on and on the held-out donors.
training_accuracy = model.score(cell_type_proportions_train, cognitive_status_train)
test_accuracy = model.score(cell_type_proportions_test, cognitive_status_test)

print(f'Training accuracy: {training_accuracy:.3f}')
print(f'Test accuracy: {test_accuracy:.3f}')

Training accuracy: 0.722
Test accuracy: 0.444


In [13]:
# Class balance among the held-out donors, for comparison with the test accuracy above.
cognitive_status_test.value_counts(normalize=True)

Cognitive Status
Dementia       0.555556
No dementia    0.444444
Name: proportion, dtype: float64

## Predicting LATE status

In [None]:
# LATE label frequencies over the rows of obs. These are row-level fractions,
# not per-donor; the donor-level balance is computed further down.
merfish_ad.obs['LATE'].value_counts(normalize=True)

LATE
LATE Stage 2      0.380373
Not Identified    0.324871
LATE Stage 1      0.236685
LATE Stage 3      0.058072
Name: proportion, dtype: float64

In [None]:
# Disabled: would keep only rows whose LATE value is not 'Not Identified'.
# NOTE(review): if re-enabled it rebinds merfish_ad to the filtered object, so
# the full dataset is no longer reachable under that name; prefer a new name.
# merfish_ad = merfish_ad[merfish_ad.obs['LATE'] != 'Not Identified']
# merfish_ad

In [None]:
# Recompute the per-donor cell-type composition from the current merfish_ad
# (same steps as in the cognitive-status section): one-hot encode Subclass per
# obs row, sum per donor, then normalise each donor's row to sum to 1.
donor_id_cell_type_one_hot = pd.get_dummies(merfish_ad.obs[['Donor ID', 'Subclass']],
                                            columns=['Subclass'])
# observed=True only matters when 'Donor ID' is categorical (not verified here):
# it keeps unused donor categories from appearing as all-zero rows, which would
# become 0/0 = NaN proportions below, and it pins the behaviour across pandas
# versions. For a non-categorical key it is a no-op.
donor_id_cell_type_counts = donor_id_cell_type_one_hot.groupby('Donor ID', observed=True).sum()
# Row-wise normalisation; axis=0 aligns the per-donor totals with the row index.
donor_id_cell_type_proportions = donor_id_cell_type_counts.div(
    donor_id_cell_type_counts.sum(axis=1), axis=0)
donor_id_cell_type_proportions.head()

Unnamed: 0_level_0,Subclass_Astrocyte,Subclass_Chandelier,Subclass_Endothelial,Subclass_L2/3 IT,Subclass_L4 IT,Subclass_L5 ET,Subclass_L5 IT,Subclass_L5/6 NP,Subclass_L6 CT,Subclass_L6 IT,...,Subclass_Microglia-PVM,Subclass_OPC,Subclass_Oligodendrocyte,Subclass_Pax6,Subclass_Pvalb,Subclass_Sncg,Subclass_Sst,Subclass_Sst Chodl,Subclass_VLMC,Subclass_Vip
Donor ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
H20.33.001,0.101699,0.002379,0.056161,0.072848,0.062695,0.005596,0.045438,0.007238,0.020239,0.05539,...,0.158362,0.043729,0.173307,0.001675,0.016185,0.002781,0.011259,0.001139,0.041718,0.01146
H20.33.004,0.093713,0.003637,0.052918,0.109931,0.06692,0.004389,0.057808,0.00882,0.009196,0.038915,...,0.230605,0.034651,0.137352,0.001547,0.019771,0.003887,0.016302,0.000669,0.043471,0.01743
H20.33.012,0.157184,0.00333,0.070885,0.150999,0.105519,0.003806,0.078211,0.010657,0.011893,0.052521,...,0.068126,0.044624,0.053378,0.001808,0.022455,0.004757,0.010086,0.001998,0.052331,0.021503
H20.33.015,0.122774,0.002849,0.043709,0.116752,0.072654,0.004144,0.044357,0.006605,0.010231,0.038464,...,0.110277,0.045976,0.231691,0.002266,0.012692,0.004727,0.008353,0.001295,0.041248,0.011202
H20.33.025,0.139682,0.002877,0.059771,0.15155,0.068978,0.003956,0.067252,0.010573,0.008128,0.026901,...,0.088326,0.032223,0.145652,0.002086,0.031936,0.007624,0.029274,0.001798,0.040711,0.026038


In [None]:
# Reduce the row-level 'LATE' annotation to one label per donor by taking the
# most frequent value among that donor's rows.
# NOTE(review): pd.Series.mode keeps every tied value, so a donor with a tie
# would not reduce to a single label. Presumably each donor carries exactly
# one LATE value; confirm.
donor_id_late_status = (
    merfish_ad.obs
    .loc[:, ['Donor ID', 'LATE']]
    .groupby(['Donor ID'])['LATE']
    .agg(pd.Series.mode)
)
donor_id_late_status

Donor ID
H20.33.001    Not Identified
H20.33.004      LATE Stage 1
H20.33.012    Not Identified
H20.33.015      LATE Stage 3
H20.33.025      LATE Stage 2
H20.33.035      LATE Stage 2
H20.33.040    Not Identified
H20.33.044      LATE Stage 1
H21.33.001    Not Identified
H21.33.005      LATE Stage 3
H21.33.006    Not Identified
H21.33.011      LATE Stage 2
H21.33.012      LATE Stage 2
H21.33.013    Not Identified
H21.33.014      LATE Stage 2
H21.33.015      LATE Stage 2
H21.33.016      LATE Stage 2
H21.33.019    Not Identified
H21.33.021    Not Identified
H21.33.022      LATE Stage 1
H21.33.023      LATE Stage 1
H21.33.025    Not Identified
H21.33.028    Not Identified
H21.33.031      LATE Stage 2
H21.33.032      LATE Stage 2
H21.33.038    Not Identified
H21.33.040      LATE Stage 1
Name: LATE, dtype: object

In [None]:
# Donor-level class balance of the LATE target (fractions of donors).
donor_id_late_status.value_counts(normalize=True)

LATE
Not Identified    0.407407
LATE Stage 2      0.333333
LATE Stage 1      0.185185
LATE Stage 3      0.074074
Name: proportion, dtype: float64

In [None]:
# Disabled: same seeded 70/30 random donor split as in the cognitive-status
# section. The cells below reuse donor_ids_train / donor_ids_test defined there.
# donor_ids = merfish_ad.obs['Donor ID'].unique()

# np.random.seed(0)
# donor_ids_perm = np.random.permutation(donor_ids)
# donor_ids_train = donor_ids_perm[:round(0.7*len(donor_ids))]
# donor_ids_test = donor_ids_perm[round(0.7*len(donor_ids)):]

In [None]:
# Training split for the LATE target, using the donor split loaded in the
# cognitive-status section: look up features and labels by donor ID.
cell_type_proportions_train = donor_id_cell_type_proportions.loc[donor_ids_train]
late_status_train = donor_id_late_status.loc[donor_ids_train]
print(cell_type_proportions_train.shape, late_status_train.shape, sep='\n')

# Held-out split, selected the same way.
cell_type_proportions_test = donor_id_cell_type_proportions.loc[donor_ids_test]
late_status_test = donor_id_late_status.loc[donor_ids_test]
print(cell_type_proportions_test.shape, late_status_test.shape, sep='\n')

(18, 24)
(18,)
(9, 24)
(9,)


In [None]:
# Baseline for the multi-class LATE target: default LogisticRegression on the
# per-donor cell-type proportions. This rebinds model, training_accuracy and
# test_accuracy from the cognitive-status section.
model = LogisticRegression()
model.fit(cell_type_proportions_train, late_status_train)

# Mean accuracy on the donors the model was fitted on and on the held-out donors.
training_accuracy = model.score(cell_type_proportions_train, late_status_train)
test_accuracy = model.score(cell_type_proportions_test, late_status_test)

print(f'Training accuracy: {training_accuracy:.3f}')
print(f'Test accuracy: {test_accuracy:.3f}')

Training accuracy: 0.611
Test accuracy: 0.667


In [25]:
# Class balance among the held-out donors, for comparison with the test accuracy above.
late_status_test.value_counts(normalize=True)

LATE
Not Identified    0.555556
LATE Stage 2      0.333333
LATE Stage 3      0.111111
Name: proportion, dtype: float64