In [1]:
# Linear regression on ranks for top up regulated cell types with sampled tumor fraction

In [13]:
# Load packages
import re
import pandas as pd
import numpy as np
import math
import os

In [14]:
# Directory whose files are listed and read, one CSV per file, when the
# combined table is assembled. Paths are built by plain string concatenation
# (input_dir + file), so the trailing "/" is required.
input_dir = "/mnt/DATA3/timo/preeclampsia/main_centers/body/corr_tissues_new/"
# Directory the combined ranks CSV is written to; also concatenated directly
# with the file name, so the trailing "/" is required here as well.
output_dir = "/mnt/DATA3/timo/preeclampsia/main_centers/body/outputs/"

In [15]:
# preeclampsia_metadata_2.tsv is the metadata version with ALT concentrations.
# In it samp57 is Healthy (the previous metadata had it as preeclampsia).
# Keep only the three columns needed downstream and give them short names.
meta_dat = (
    pd.read_csv("/mnt/DATA3/timo/data/preeclampsia_metadata_2.tsv", sep="\t")
    .loc[:, ["sample name", "ALT_(U/l)_high_above_33", "group"]]
    .rename(
        columns={
            "sample name": "sample",
            "ALT_(U/l)_high_above_33": "ALT",
            "group": "status",
        }
    )
)

In [16]:
# Sanity check: show the metadata row for samp57.
is_samp57_meta = meta_dat['sample'] == 'samp57'
meta_dat.loc[is_samp57_meta]

Unnamed: 0,sample,ALT,status
40,samp57,11,Healthy


In [56]:
# Build one long table with a row per (sample, cell type): the per-sample
# correlation/rank values joined with that sample's status and ALT value.
columns = ["sample", "status", "ALT", "cell_type", "correlation", "rank"]

# Collect the per-sample frames in a list and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies all rows on
# every iteration, and seeding it with an empty frame triggers pandas'
# "concatenation with empty or all-NA entries" FutureWarning.
per_sample_frames = []
for file in os.listdir(input_dir):
    # The sample id is the part of the file name before the first underscore.
    sample = file.split('_')[0]
    filename = os.path.join(input_dir, file)
    data = pd.read_csv(filename)
    data["sample"] = sample
    data = data.rename(columns={"correlation_mean": "correlation", "rank_mean": "rank"})
    # pd.merge defaults to an inner join, so a sample that is missing from
    # meta_dat contributes no rows.
    data_merged = pd.merge(data, meta_dat, on="sample")
    data_merged = data_merged[columns]
    # Skip empty frames: they add no rows and would only affect dtype inference.
    if not data_merged.empty:
        per_sample_frames.append(data_merged)

if per_sample_frames:
    empty_df = pd.concat(per_sample_frames, ignore_index=True)
else:
    # Nothing was loaded: keep the expected columns so later cells still run.
    empty_df = pd.DataFrame(columns=columns)

# From here on the column holds "<cell type>.<tissue>" labels.
empty_df = empty_df.rename(columns={"cell_type": "cell_type_tissue"})
empty_df

  empty_df = pd.concat([empty_df, data_merged], ignore_index = True)


Unnamed: 0,sample,status,ALT,cell_type_tissue,correlation,rank
0,samp55,Healthy,,RNA.classical.monocyte.Blood,-0.190319,1
1,samp55,Healthy,,RNA.myeloid.cell.Pancreas,-0.189240,2
2,samp55,Healthy,,RNA.monocyte.Thymus,-0.186312,3
3,samp55,Healthy,,RNA.monocyte.Bone.Marrow,-0.184916,4
4,samp55,Healthy,,RNA.classical.monocyte.Spleen,-0.184744,5
...,...,...,...,...,...,...
20085,samp1,Preeclampsia,14,RNA.ciliated.epithelial.cell.Uterus,-0.011987,486
20086,samp1,Preeclampsia,14,RNA.hepatocyte.Liver,-0.009075,487
20087,samp1,Preeclampsia,14,RNA.eye.photoreceptor.cell.Eye,-0.007249,488
20088,samp1,Preeclampsia,14,RNA.lung.ciliated.cell.Lung,-0.004713,489


In [57]:
# Drop every row whose cell_type_tissue label starts with the prefix 'RNA.NA'.
starts_with_rna_na = empty_df['cell_type_tissue'].str.startswith('RNA.NA')
empty_df = empty_df[~starts_with_rna_na]

In [58]:
# Sanity check: show the rows of the combined table that belong to samp57.
is_samp57_row = empty_df['sample'] == 'samp57'
empty_df.loc[is_samp57_row]

Unnamed: 0,sample,status,ALT,cell_type_tissue,correlation,rank
13720,samp57,Healthy,11,RNA.monocyte.Thymus,-0.200585,1
13721,samp57,Healthy,11,RNA.macrophage.Kidney,-0.197917,2
13722,samp57,Healthy,11,RNA.mature.NK.T.cell.Spleen,-0.193533,3
13723,samp57,Healthy,11,RNA.classical.monocyte.Blood,-0.192671,4
13724,samp57,Healthy,11,RNA.monocyte.Bone.Marrow,-0.192522,5
...,...,...,...,...,...,...
14205,samp57,Healthy,11,RNA.keratinocyte.Tongue,-0.015033,486
14206,samp57,Healthy,11,RNA.eye.photoreceptor.cell.Eye,-0.013018,487
14207,samp57,Healthy,11,RNA.neutrophil.Liver,-0.012634,488
14208,samp57,Healthy,11,RNA.fast.muscle.cell.Muscle,-0.009642,489


In [59]:
def replace_dot_before_words(s):
    """Mark the cell-type/tissue boundary in a dotted label with an underscore.

    A dot that directly precedes one of the words Bone, Small, Large,
    Salivary or Lymph is replaced by '_'. If the label contains no underscore
    after that step, its last dot (if any) is replaced by '_' instead.

    Args:
        s: Dotted label, e.g. "RNA.monocyte.Bone.Marrow".

    Returns:
        The label with the separating dot(s) replaced by underscores, e.g.
        "RNA.monocyte_Bone.Marrow".
    """
    marked = re.sub(r'\.(?=Bone|Small|Large|Salivary|Lymph)', '_', s)
    if '_' in marked:
        return marked

    # No underscore anywhere: swap the final dot, if there is one.
    head, dot, tail = marked.rpartition('.')
    if not dot:
        return marked
    return head + '_' + tail

# Rewrite every label so that cell type and tissue are separated by '_'.
labels_with_separator = empty_df['cell_type_tissue'].map(replace_dot_before_words)
empty_df.loc[:, 'cell_type_tissue'] = labels_with_separator

empty_df

Unnamed: 0,sample,status,ALT,cell_type_tissue,correlation,rank
0,samp55,Healthy,,RNA.classical.monocyte_Blood,-0.190319,1
1,samp55,Healthy,,RNA.myeloid.cell_Pancreas,-0.189240,2
2,samp55,Healthy,,RNA.monocyte_Thymus,-0.186312,3
3,samp55,Healthy,,RNA.monocyte_Bone.Marrow,-0.184916,4
4,samp55,Healthy,,RNA.classical.monocyte_Spleen,-0.184744,5
...,...,...,...,...,...,...
20085,samp1,Preeclampsia,14,RNA.ciliated.epithelial.cell_Uterus,-0.011987,486
20086,samp1,Preeclampsia,14,RNA.hepatocyte_Liver,-0.009075,487
20087,samp1,Preeclampsia,14,RNA.eye.photoreceptor.cell_Eye,-0.007249,488
20088,samp1,Preeclampsia,14,RNA.lung.ciliated.cell_Lung,-0.004713,489


In [60]:
def clean_cell_type(cell_type):
    """Turn a dotted cell-type label into a human-readable one.

    Every run of one or more dots becomes a single space, the text "RNA" is
    removed, and surrounding whitespace is stripped. Without the strip, a
    label such as "RNA.classical.monocyte_Blood" would come back with a
    leading space (" classical monocyte_Blood"), left behind by the dot that
    followed "RNA".

    Args:
        cell_type: Label such as "RNA.classical.monocyte_Blood".

    Returns:
        The cleaned label, e.g. "classical monocyte_Blood".
    """
    # One pass handles both single dots and runs like "..".
    cell_type = re.sub(r'\.+', ' ', cell_type)

    # Remove "RNA" from the string
    cell_type = cell_type.replace("RNA", "")

    # Drop the space left over from the dot that followed "RNA".
    return cell_type.strip()

# Replace each label in 'cell_type_tissue' with its cleaned form.
cleaned_labels = empty_df['cell_type_tissue'].map(clean_cell_type)
empty_df.loc[:, 'cell_type_tissue'] = cleaned_labels

empty_df

Unnamed: 0,sample,status,ALT,cell_type_tissue,correlation,rank
0,samp55,Healthy,,classical monocyte_Blood,-0.190319,1
1,samp55,Healthy,,myeloid cell_Pancreas,-0.189240,2
2,samp55,Healthy,,monocyte_Thymus,-0.186312,3
3,samp55,Healthy,,monocyte_Bone Marrow,-0.184916,4
4,samp55,Healthy,,classical monocyte_Spleen,-0.184744,5
...,...,...,...,...,...,...
20085,samp1,Preeclampsia,14,ciliated epithelial cell_Uterus,-0.011987,486
20086,samp1,Preeclampsia,14,hepatocyte_Liver,-0.009075,487
20087,samp1,Preeclampsia,14,eye photoreceptor cell_Eye,-0.007249,488
20088,samp1,Preeclampsia,14,lung ciliated cell_Lung,-0.004713,489


In [61]:
# Number of distinct cell_type_tissue labels (a missing value would count as one).
empty_df["cell_type_tissue"].nunique(dropna=False)

489

In [62]:
# Write the combined table to the output directory, without the index column.
ranks_csv_path = output_dir + "Ranks_FCC_RNA_cell_type_tissue_preeclampsia.csv"
empty_df.to_csv(ranks_csv_path, index=False)