In [1]:
import pandas as pd
import pickle
import numpy as np
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split


In [2]:
!pwd

/12tb_dsk1/danish/Pytorch_Biologically_Informed_Neural_Network


# loading Ensemble_file

In [3]:
df_ensemble = pd.read_csv('df_ensemble.csv')
df_ensemble

Unnamed: 0,0,1
0,ENSG00000210049,MT-TF
1,ENSG00000211459,MT-RNR1
2,ENSG00000210077,MT-TV
3,ENSG00000210082,MT-RNR2
4,ENSG00000209082,MT-TL1
...,...,...
70606,ENSG00000232679,LINC01705
70607,ENSG00000200033,RNU6-403P
70608,ENSG00000228437,LINC02474
70609,ENSG00000229463,LYST-AS1


In [4]:

def return_ensemble_id(x):
    try:
        return df_ensemble[df_ensemble['1']==x]['0'].values[0]
    except: 
        return np.nan

# CSV reading

In [5]:
def save_train_test(subtype_name):

    print(subtype_name)

    data = pd.read_csv(f'../../usman/Single_Cell_Microglia_Project/preprocessed_data/inhibitory_neuron/subtype_inhibitory/{subtype_name}/count_matrix.csv',index_col = 0)
    x = data.reset_index()['index'].apply(return_ensemble_id)
    data['ensemble_gene_name'] = x.tolist()
    data = data.dropna()
    data.set_index('ensemble_gene_name', inplace = True)
    data = data.T
    clinical_data = pd.read_csv('../../danish/preprocessed_data/clinical/clinical_single_cell.csv')
    
    meta_data = pd.read_csv(f'../../usman/Single_Cell_Microglia_Project/preprocessed_data/inhibitory_neuron/subtype_inhibitory/{subtype_name}/metadata.csv',index_col = 0)
    meta_data = meta_data.sort_values(by = ['cell_id'])
    meta_data = pd.merge(meta_data,clinical_data[['subject','clinical_pathological_AD']], on = 'subject', how = 'left')
    meta_data.drop_duplicates(inplace=True)
    meta_data.set_index('cell_id',inplace = True)
    meta_data.sort_index(inplace= True)

    meta_data = meta_data[meta_data.clinical_pathological_AD != 'False']

    meta_data_NCI = meta_data[meta_data.clinical_pathological_AD == 'NCI_with_No_Plaques'][0:min(meta_data.clinical_pathological_AD.value_counts().tolist())]
    meta_data_AD = meta_data[meta_data.clinical_pathological_AD == 'AD_with_Plaques'][0:min(meta_data.clinical_pathological_AD.value_counts().tolist())]
    meta_data = pd.concat([meta_data_NCI,meta_data_AD])
    data = data.loc[meta_data.index,]
    y = meta_data.clinical_pathological_AD.tolist()
    X = data


    # Step 1: Split data into training and temporary dataset (which will be further split into testing and validation)
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.40, random_state=42)  # 40% of data will be split into testing and validation
    
    # Step 2: Split the temporary dataset into testing and validation
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)  # 50% of the temporary set will be for validation and 50% for testing
    
    y_train = pd.DataFrame(y_train)
    y_test = pd.DataFrame(y_test)
    y_val = pd.DataFrame(y_val)

    try:
        diagnosis_mapping = {'AD_with_Plaques': 1, 'NCI_with_No_Plaques': 0}
        y_test['diagnosis_binary'] = y_test[0].map(diagnosis_mapping)
        
        # Display the first few rows to verify the mapping
        print(y_test.head())
        y_test.drop(columns=[0], inplace=True)
        y_test
    
    except:
        print('Previously Run Before')



    try:
        diagnosis_mapping = {'AD_with_Plaques': 1, 'NCI_with_No_Plaques': 0}
        y_train['diagnosis_binary'] = y_train[0].map(diagnosis_mapping)
        
        # Display the first few rows to verify the mapping
        
        y_train.drop(columns=[0], inplace=True)
        y_train
    
    except:
        print('Previously Run Before')



    
    try:
        diagnosis_mapping = {'AD_with_Plaques': 1, 'NCI_with_No_Plaques': 0}
        y_val['diagnosis_binary'] = y_val[0].map(diagnosis_mapping)
        # Display the first few rows to verify the mapping
        y_val.drop(columns=[0], inplace=True)
        y_val
    
    except:
        print('Previously Run Before')


    y_train.rename(columns = {'diagnosis_binary':'diagnosis'}, inplace=True)
    y_test.rename(columns = {'diagnosis_binary':'diagnosis'}, inplace=True)
    y_val.rename(columns = {'diagnosis_binary':'diagnosis'}, inplace=True)

    try:
        os.makedirs(f'./Preprocessed_data/excitory_neurons/{subtype_name}')
    except:
        print('directory already created')

    
    X_train.to_csv(f'./Preprocessed_data/excitory_neurons/{subtype_name}/train.csv')
    X_test.to_csv(f'./Preprocessed_data/excitory_neurons/{subtype_name}/test.csv')
    X_val.to_csv(f'./Preprocessed_data/excitory_neurons/{subtype_name}/val.csv')

    y_val.to_csv(f'./Preprocessed_data/excitory_neurons/{subtype_name}/y_val.csv', index = False)
    y_test.to_csv(f'./Preprocessed_data/excitory_neurons/{subtype_name}/y_test.csv', index = False)
    y_train.to_csv(f'./Preprocessed_data/excitory_neurons/{subtype_name}/y_train.csv', index = False)
    meta_data.to_csv(f'./Preprocessed_data/excitory_neurons/{subtype_name}/metadata.csv')



    

In [6]:
for i in tqdm(os.listdir('../../usman/Single_Cell_Microglia_Project/preprocessed_data/inhibitory_neuron/subtype_inhibitory/')):
    save_train_test(i)

  0%|                                                                                                                                              | 0/25 [00:00<?, ?it/s]

Inh_GPC5_RIT2
                     0  diagnosis_binary
0      AD_with_Plaques                 1
1  NCI_with_No_Plaques                 0
2  NCI_with_No_Plaques                 0
3      AD_with_Plaques                 1
4  NCI_with_No_Plaques                 0


  4%|█████▎                                                                                                                                | 1/25 [01:14<29:56, 74.87s/it]

Inh_VIP_ABI3BP
                     0  diagnosis_binary
0  NCI_with_No_Plaques                 0
1      AD_with_Plaques                 1
2      AD_with_Plaques                 1
3  NCI_with_No_Plaques                 0
4  NCI_with_No_Plaques                 0


  8%|██████████▋                                                                                                                           | 2/25 [02:48<32:50, 85.69s/it]

Inh_L6_SST_NPY
                     0  diagnosis_binary
0  NCI_with_No_Plaques                 0
1      AD_with_Plaques                 1
2      AD_with_Plaques                 1
3      AD_with_Plaques                 1
4  NCI_with_No_Plaques                 0


 12%|████████████████                                                                                                                      | 3/25 [03:57<28:43, 78.32s/it]

Inh_ENOX2_SPHKAP
                     0  diagnosis_binary
0      AD_with_Plaques                 1
1  NCI_with_No_Plaques                 0
2  NCI_with_No_Plaques                 0
3  NCI_with_No_Plaques                 0
4  NCI_with_No_Plaques                 0


 16%|█████████████████████▍                                                                                                                | 4/25 [05:25<28:39, 81.88s/it]

Inh_SORCS1_TTN
                     0  diagnosis_binary
0      AD_with_Plaques                 1
1      AD_with_Plaques                 1
2  NCI_with_No_Plaques                 0
3      AD_with_Plaques                 1
4  NCI_with_No_Plaques                 0


 20%|██████████████████████████▊                                                                                                           | 5/25 [06:45<27:04, 81.20s/it]

Inh_ALCAM_TRPM3
                     0  diagnosis_binary
0      AD_with_Plaques                 1
1      AD_with_Plaques                 1
2      AD_with_Plaques                 1
3      AD_with_Plaques                 1
4  NCI_with_No_Plaques                 0


 24%|████████████████████████████████▏                                                                                                     | 6/25 [08:10<26:11, 82.73s/it]

Inh_PVALB_CA8__Chandelier
                 0  diagnosis_binary
0  AD_with_Plaques                 1
1  AD_with_Plaques                 1
2  AD_with_Plaques                 1
3  AD_with_Plaques                 1
4  AD_with_Plaques                 1


 28%|█████████████████████████████████████▌                                                                                                | 7/25 [09:43<25:46, 85.94s/it]

Inh_L1-6_LAMP5_CA13
                     0  diagnosis_binary
0      AD_with_Plaques                 1
1  NCI_with_No_Plaques                 0
2  NCI_with_No_Plaques                 0
3      AD_with_Plaques                 1
4  NCI_with_No_Plaques                 0


 32%|██████████████████████████████████████████▉                                                                                           | 8/25 [11:21<25:26, 89.77s/it]

Inh_PVALB_SULF1
                     0  diagnosis_binary
0  NCI_with_No_Plaques                 0
1      AD_with_Plaques                 1
2      AD_with_Plaques                 1
3      AD_with_Plaques                 1
4      AD_with_Plaques                 1
                     0  diagnosis_binary
0  NCI_with_No_Plaques                 0
1      AD_with_Plaques                 1
2  NCI_with_No_Plaques                 0
3  NCI_with_No_Plaques                 0
4      AD_with_Plaques                 1


 40%|████████████████████████████████████████████████████▊                                                                               | 10/25 [16:04<29:47, 119.16s/it]

Inh_PTPRK_FAM19A1
                     0  diagnosis_binary
0  NCI_with_No_Plaques                 0
1      AD_with_Plaques                 1
2  NCI_with_No_Plaques                 0
3  NCI_with_No_Plaques                 0
4  NCI_with_No_Plaques                 0


 44%|██████████████████████████████████████████████████████████                                                                          | 11/25 [17:29<25:22, 108.78s/it]

Inh_L5-6_PVALB_STON2
                     0  diagnosis_binary
0  NCI_with_No_Plaques                 0
1      AD_with_Plaques                 1
2  NCI_with_No_Plaques                 0
3      AD_with_Plaques                 1
4      AD_with_Plaques                 1


 48%|███████████████████████████████████████████████████████████████▊                                                                     | 12/25 [18:46<21:25, 98.90s/it]

Inh_LAMP5_RELN
                     0  diagnosis_binary
0  NCI_with_No_Plaques                 0
1      AD_with_Plaques                 1
2      AD_with_Plaques                 1
3      AD_with_Plaques                 1
4  NCI_with_No_Plaques                 0


 52%|█████████████████████████████████████████████████████████████████████▏                                                               | 13/25 [20:04<18:30, 92.55s/it]

Inh_VIP_CLSTN2
                     0  diagnosis_binary
0      AD_with_Plaques                 1
1  NCI_with_No_Plaques                 0
2  NCI_with_No_Plaques                 0
3      AD_with_Plaques                 1
4  NCI_with_No_Plaques                 0


 56%|██████████████████████████████████████████████████████████████████████████▍                                                          | 14/25 [21:42<17:15, 94.17s/it]

Inh_L1-2_PAX6_SCGN
                     0  diagnosis_binary
0  NCI_with_No_Plaques                 0
1      AD_with_Plaques                 1
2      AD_with_Plaques                 1
3      AD_with_Plaques                 1
4      AD_with_Plaques                 1


 60%|███████████████████████████████████████████████████████████████████████████████▊                                                     | 15/25 [22:49<14:21, 86.15s/it]

Inh_L3-5_SST_MAFB
                     0  diagnosis_binary
0      AD_with_Plaques                 1
1  NCI_with_No_Plaques                 0
2  NCI_with_No_Plaques                 0
3      AD_with_Plaques                 1
4      AD_with_Plaques                 1


 64%|█████████████████████████████████████████████████████████████████████████████████████                                                | 16/25 [24:38<13:56, 92.95s/it]

Inh_LAMP5_NRG1__Rosehip
                     0  diagnosis_binary
0  NCI_with_No_Plaques                 0
1      AD_with_Plaques                 1
2      AD_with_Plaques                 1
3  NCI_with_No_Plaques                 0
4      AD_with_Plaques                 1


 68%|██████████████████████████████████████████████████████████████████████████████████████████▍                                          | 17/25 [26:34<13:18, 99.86s/it]

Inh_VIP_THSD7B
                     0  diagnosis_binary
0      AD_with_Plaques                 1
1      AD_with_Plaques                 1
2  NCI_with_No_Plaques                 0
3      AD_with_Plaques                 1
4  NCI_with_No_Plaques                 0


 72%|███████████████████████████████████████████████████████████████████████████████████████████████▊                                     | 18/25 [27:53<10:54, 93.52s/it]

Inh_SGCD_PDE3A
                     0  diagnosis_binary
0      AD_with_Plaques                 1
1  NCI_with_No_Plaques                 0
2      AD_with_Plaques                 1
3  NCI_with_No_Plaques                 0
4  NCI_with_No_Plaques                 0


 76%|█████████████████████████████████████████████████████████████████████████████████████████████████████                                | 19/25 [29:05<08:43, 87.18s/it]

Inh_RYR3_TSHZ2
                     0  diagnosis_binary
0  NCI_with_No_Plaques                 0
1      AD_with_Plaques                 1
2      AD_with_Plaques                 1
3  NCI_with_No_Plaques                 0
4  NCI_with_No_Plaques                 0


 80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍                          | 20/25 [30:58<07:55, 95.02s/it]

Inh_VIP_TSHZ2
                     0  diagnosis_binary
0      AD_with_Plaques                 1
1  NCI_with_No_Plaques                 0
2      AD_with_Plaques                 1
3  NCI_with_No_Plaques                 0
4  NCI_with_No_Plaques                 0


 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 21/25 [32:26<06:11, 92.82s/it]

Inh_L1_PAX6_CA4
                     0  diagnosis_binary
0  NCI_with_No_Plaques                 0
1  NCI_with_No_Plaques                 0
2      AD_with_Plaques                 1
3  NCI_with_No_Plaques                 0
4      AD_with_Plaques                 1


 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                | 22/25 [33:39<04:20, 86.85s/it]

Inh_FBN2_EPB41L4A
                     0  diagnosis_binary
0  NCI_with_No_Plaques                 0
1  NCI_with_No_Plaques                 0
2  NCI_with_No_Plaques                 0
3  NCI_with_No_Plaques                 0
4      AD_with_Plaques                 1


 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 23/25 [34:57<02:48, 84.27s/it]

Inh_L5-6_SST_TH
                     0  diagnosis_binary
0  NCI_with_No_Plaques                 0
1      AD_with_Plaques                 1
2      AD_with_Plaques                 1
3      AD_with_Plaques                 1
4      AD_with_Plaques                 1


 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋     | 24/25 [36:10<01:20, 80.98s/it]

Inh_CUX2_MSR1
                     0  diagnosis_binary
0  NCI_with_No_Plaques                 0
1  NCI_with_No_Plaques                 0
2  NCI_with_No_Plaques                 0
3      AD_with_Plaques                 1
4  NCI_with_No_Plaques                 0


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [38:02<00:00, 91.31s/it]
