# Index


In [60]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import os
from tqdm import tqdm

In [None]:
from biomart import BiomartServer

# --- Build the gene-symbol -> Ensembl-ID lookup table from Ensembl Biomart ---
# Open a client against the public Ensembl Biomart endpoint.
server = BiomartServer("http://www.ensembl.org/biomart")

# Select the Homo sapiens gene dataset exposed by that server.
dataset = server.datasets['hsapiens_gene_ensembl']

# Ask for exactly two attributes per record: the Ensembl gene ID and the
# external gene name, in that order.
response = dataset.search(
    {'attributes': ['ensembl_gene_id', 'external_gene_name']}
)

# The response body is parsed below as tab-separated text, so wrap it in an
# in-memory file object for pandas.
import pandas as pd
from io import StringIO

data_ensemble_gene = StringIO(response.text)

# header=None: the columns are addressed by position, so column 0 is the
# first requested attribute and column 1 the second.
df = pd.read_csv(data_ensemble_gene, sep="\t", header=None)

# Quick sanity check of the downloaded table.
print(df.head())
# Nice

import numpy as np
def return_ensemble_id(x):
    """Return the first entry of column 0 of the global ``df`` whose column 1 equals ``x``.

    ``df`` is the module-level table read from Biomart with ``header=None``,
    so column 0 is the first requested attribute (``ensembl_gene_id``) and
    column 1 the second (``external_gene_name``).

    Parameters
    ----------
    x : value compared against column 1 of ``df`` (a gene name).

    Returns
    -------
    The value in column 0 of the first matching row, or ``np.nan`` when no
    row matches or when ``df`` has no column labelled 0 / 1.

    Notes
    -----
    Only the two expected lookup failures are converted to NaN. Anything
    else (``KeyboardInterrupt``, a missing ``df``, ...) now propagates instead
    of being silently turned into a missing value.
    """
    try:
        return df[df[1] == x][0].values[0]
    except (KeyError, IndexError):
        # IndexError: no row matched, so ``.values`` is empty.
        # KeyError: the table has no column labelled 0 or 1.
        return np.nan

In [None]:
import pickle
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
# Load the DataFrame from the pickle file
#data = pd.read_csv('../preprocessed_data/mat.csv')
# Ten passes (j = 1..10) of: load an expression chunk, restrict it to cells
# that have metadata and clinical covariates, map gene names to Ensembl IDs,
# standardise, run a 100-component PCA, and write the loadings and explained
# variance ratios to files suffixed with j.
#
# NOTE(review): no random seed is set before np.random.choice, so the selected
# columns differ from run to run.
# NOTE(review): `data` and `selected_columns` persist across passes; from the
# second pass on, the chunk is concatenated onto the already transposed and
# filtered `data` left by the previous pass. Confirm this is intended.
l = 0  # counts successful chunk loads across all passes
for j in range(1,11):
    for i in os.listdir('../../preprocessed_data/inhibitory_neuron/split_chunks/'):
        try:
            # Only entries whose second dot-separated token is 'pkl' are loaded;
            # names without a dot raise IndexError and are skipped below.
            if i.split('.')[1] == 'pkl':
                print(i)
                pickle_file_path = '../../preprocessed_data/inhibitory_neuron/split_chunks/' + i
                if l == 0:
                    data = pd.read_pickle(pickle_file_path)
                    # to load data in the RAM (not possible to load all data once):
                    # keep a random half of the columns, drawn without replacement
                    selected_columns = np.random.choice(data.columns, size=int(len(data.columns)*0.5), replace=False)
                    data = data[selected_columns]
                else:
                    # FIX: this used to be assigned to `df`, which overwrote the
                    # Biomart lookup table that return_ensemble_id() reads from
                    # the global `df`; every later lookup then returned NaN.
                    chunk_df = pd.read_pickle(pickle_file_path)
                    chunk_selected = chunk_df[selected_columns] # selected columns coming from first time loaded data
                    data = pd.concat([data,chunk_selected],axis = 0)

                print("Pickle file has been loaded into a DataFrame:")
                l = l + 1
                # NOTE(review): this break stops at the first entry loaded, so
                # each pass reads whichever matching file os.listdir yields first.
                break
        except Exception:
            # Best-effort: skip entries that cannot be parsed or loaded.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            continue

    single_cell_metadata = pd.read_csv('../../preprocessed_data/inhibitory_neuron/metadata_inhibitory_neurons.csv', index_col = 0)

    # Drop rows whose clinical_pathological_AD equals the string 'False', then
    # keep only metadata rows whose index label is one of data's columns.
    single_cell_metadata = single_cell_metadata[single_cell_metadata.clinical_pathological_AD != 'False']
    single_cell_metadata = single_cell_metadata[single_cell_metadata.index.isin(data.columns)]
    data = data[single_cell_metadata.index.tolist()]

    # Translate the row labels through the Biomart table.
    # Relies on reset_index() producing a column called 'index'
    # (i.e. an unnamed index) -- TODO confirm for the pickled frames.
    x = data.reset_index()['index'].apply(return_ensemble_id)

    data['ensemble_gene_name'] = x.tolist()
    # Removes rows without a mapped ID, and any row with a NaN in any column.
    data = data.dropna()

    data.set_index('ensemble_gene_name', inplace = True)
    clinical_data = pd.read_csv('/12tb_dsk1/danish/preprocessed_data/clinical/clinical_single_cell.csv')
    single_cell_metadata.reset_index(inplace = True)
    # Attach per-subject clinical fields; apoe_genotype is merged and dropped
    # again on the next line.
    single_cell_metadata = pd.merge(single_cell_metadata,clinical_data[['subject','pmi_df2','age_death','msex','educ','age_first_ad_dx','cogdx','ceradsc','braaksc','apoe_genotype']], on = 'subject', how = 'left')
    single_cell_metadata = single_cell_metadata.drop(columns=['apoe_genotype'])
    single_cell_metadata = single_cell_metadata[~single_cell_metadata.pmi_df2.isna()]
    single_cell_metadata.drop_duplicates(inplace = True)
    # Transpose so that the former columns become the rows fed to the PCA.
    data = data.T

    # Inner join of data's row labels with the metadata on cell_id; used below
    # to restrict `data` to rows that have covariates.
    covariates_dataframe = pd.merge(data.reset_index()['index'],single_cell_metadata[['cell_id','pmi_df2','age_death','msex','educ','cogdx','ceradsc','braaksc',
                                                                                                ]] , left_on='index', right_on='cell_id', how='inner')
    data = data[data.index.isin(covariates_dataframe.cell_id)]
    data = data.sort_index()
    try:
        covariates_dataframe.drop(columns=['index'],inplace = True)
    except KeyError:
        # drop() raises KeyError when the 'index' column is absent.
        print('no such columns, probably I am running this snippet second time')
    covariates_dataframe.drop_duplicates(inplace = True)
    covariates_dataframe.set_index('cell_id', inplace = True)
    covariates_dataframe = covariates_dataframe.sort_index()
    data = data.sort_index()

    # Standardise each column before the PCA.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)

    # Perform PCA
    pca = PCA(n_components=100)  # Choose number of components
    principal_components = pca.fit_transform(scaled_data)

    df_explained_variance_ratio = pd.DataFrame(pca.explained_variance_ratio_, columns=['Explained Variance Ratio'])

    # pca.components_ is transposed so that rows are data's columns and
    # columns are PC1..PCk.
    loadings = pca.components_
    loadings_df = pd.DataFrame(loadings.T, columns=[f'PC{i+1}' for i in range(loadings.shape[0])], index=data.columns)

    loadings_df.to_csv(f'../../../../usman/Single_Cell_Microglia_Project/results/inhibitory_neuron/loadings_pca_output_{j}.txt', sep='\t')
    df_explained_variance_ratio.to_csv(f'../../../../usman/Single_Cell_Microglia_Project/results/inhibitory_neuron/pca_explained_variance_{j}.txt', sep='\t')

In [None]:
        # Assuming merged_data_clinical_pathological is your data matrix (samples x features)
        # and labels is a list/array of labels corresponding to each sample
        
        # Standardize the data (important for PCA)



        #correlation_matrix = np.corrcoef(principal_components.T, covariates_dataframe.T)[:principal_components.shape[1], principal_components.shape[1]:]
        #print(loadings_df)
        
        #fig, ax1 = plt.subplots(figsize=(12, 6))
        
        # Plot Heatmap
        #sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', xticklabels=covariates_dataframe.columns,
                    #yticklabels=[f'PC {i+1}' for i in range(principal_components.shape[1])], ax=ax1)
        #ax1.set_title('Correlation between PCs and Covariates') 
        #plt.savefig(f'../../../../usman/Single_Cell_Microglia_Project/results/inhibitory_neuron/correlation_heatmap_{j}.png')
