In [1]:
import pandas as pd
import pathlib
from sklearn.preprocessing import StandardScaler

In [2]:
# Locate the raw inputs and the final output inside the shared data directory.
data_directory = pathlib.Path("../7.collab-data/data/").resolve()
rnaseq_file = pathlib.Path(data_directory, "GSE231858_norm_counts_TPM_GRCh38.p13_NCBI.tsv.gz").resolve()
annot_file = pathlib.Path(data_directory, "Human.GRCh38.p13.annot.tsv.gz").resolve()
final_file = pathlib.Path(data_directory, "collaboration_rna_data.parquet")

# Read the expression table (one GeneID column plus one column per sample)
# and cache it as parquet next to the source file.
# with_suffix() swaps only the last suffix, so "<name>.tsv.gz" becomes "<name>.tsv.parquet".
rnaseq_df = pd.read_csv(rnaseq_file, sep="\t")
rnaseq_parquet = rnaseq_file.with_suffix('.parquet')
rnaseq_df.to_parquet(rnaseq_parquet, index=False)
print(rnaseq_df)

# Read the annotation file, parsing only the columns that are actually used.
# usecols skips every other column at parse time; low_memory=False makes pandas
# infer each dtype from the whole column instead of chunk by chunk. The recorded
# run emitted a warning pointing at this call (presumably a mixed-dtype
# DtypeWarning from chunked inference); both options target that.
annot_columns = ['GeneID', 'Symbol', 'EnsemblGeneID']
annot_df = pd.read_csv(annot_file, sep="\t", usecols=annot_columns, low_memory=False)
# usecols keeps the file's own column order, so re-select to pin the order used here.
annot_df = annot_df[annot_columns]
annot_parquet = annot_file.with_suffix('.parquet')
annot_df.to_parquet(annot_parquet, index=False)

print(annot_df)


          GeneID  GSM7305242  GSM7305243  GSM7305244  GSM7305246  GSM7305247  \
0      100287102       0.269      0.1512      0.1867      0.1769      0.7634   
1         653635      37.580     36.4300     44.1000     51.0900     62.6300   
2      102466751      20.910     33.0600     53.4300     68.7500     56.9600   
3      107985730       0.000      0.4643      0.2549      0.7241      0.5023   
4      100302278       0.000      0.0000      0.2484      0.0000      0.0000   
...          ...         ...         ...         ...         ...         ...   
39371       4541    9851.000   3119.0000   4713.0000  11940.0000  14940.0000   
39372       4556    4714.000   1824.0000   2719.0000   6086.0000   8820.0000   
39373       4519   16400.000   3351.0000   9508.0000  16070.0000  16880.0000   
39374       4576     171.000     98.4000    116.9000    414.7000    233.4000   
39375       4571    3875.000    457.9000    473.8000   2481.0000   2248.0000   

       GSM7305249  GSM7305250  GSM73052

  annot_df = pd.read_csv(annot_file, sep="\t")


In [3]:
# Relabel every gene as "Symbol (GeneID)" and pivot the expression table so that
# samples are rows and genes are columns.
#
# Inputs (from the loading cell):
#   annot_df  - annotation table with GeneID, Symbol and EnsemblGeneID columns
#   rnaseq_df - expression table with a GeneID column plus one column per sample
# Output:
#   rna_transposed - one row per sample, one column per gene label
#
# rnaseq_df itself is left untouched so this cell can be re-run safely; the
# previous version replaced it with a re-indexed copy, which made a second run
# fail with KeyError: 'GeneID'.

# Step 1: Build the human-readable label for every annotated gene.
annot_df['Symbol (GeneID)'] = annot_df['Symbol'] + ' (' + annot_df['GeneID'].astype(str) + ')'

# Step 2: Look up the label of each expression row by its GeneID.
label_by_gene_id = annot_df.set_index('GeneID')['Symbol (GeneID)']
gene_labels = rnaseq_df['GeneID'].map(label_by_gene_id)

# Genes that are missing from the annotation table (or have no symbol) would
# otherwise end up as NaN column names; fall back to the bare GeneID instead.
gene_labels = gene_labels.fillna(rnaseq_df['GeneID'].astype(str))

# Step 3: Use the labels as the row index of a new, labelled expression frame.
rnaseq_labeled = rnaseq_df.drop(columns='GeneID')
rnaseq_labeled.index = pd.Index(gene_labels, name='Symbol (GeneID)')

# Step 4: Transpose so that gene labels are the column names and sample IDs are the rows.
rna_transposed = rnaseq_labeled.T

# This names the column axis (the genes), replacing 'Symbol (GeneID)'; it is what
# puts "SampleID" in the header corner of the printed frame. The row index that
# holds the sample IDs stays unnamed.
# NOTE(review): if the intent was to name the row index, use rna_transposed.index.name.
rna_transposed.columns.name = 'SampleID'

# rna_transposed now has "Symbol (GeneID)" labels as columns and sample IDs as rows.
print(rna_transposed.head())


SampleID    DDX11L1 (100287102)  WASH7P (653635)  MIR6859-1 (102466751)  \
GSM7305242               0.2690            37.58                  20.91   
GSM7305243               0.1512            36.43                  33.06   
GSM7305244               0.1867            44.10                  53.43   
GSM7305246               0.1769            51.09                  68.75   
GSM7305247               0.7634            62.63                  56.96   

SampleID    MIR1302-2HG (107985730)  MIR1302-2 (100302278)  FAM138A (645520)  \
GSM7305242                   0.0000                 0.0000           0.00000   
GSM7305243                   0.4643                 0.0000           0.00000   
GSM7305244                   0.2549                 0.2484           0.06067   
GSM7305246                   0.7241                 0.0000           0.00000   
GSM7305247                   0.5023                 0.0000           0.00000   

SampleID    OR4F5 (79501)  LOC100996442 (100996442)  LOC729737 (7297

In [4]:
# Standardize every gene column to zero mean and unit variance across samples.
scaler = StandardScaler()
scaler.fit(rna_transposed)
zscored_values = scaler.transform(rna_transposed)

# Re-attach the sample index and gene column labels to the scaled values.
zscored_data = pd.DataFrame(
    zscored_values,
    index=rna_transposed.index,
    columns=rna_transposed.columns,
)
zscored_data.head()

SampleID,DDX11L1 (100287102),WASH7P (653635),MIR6859-1 (102466751),MIR1302-2HG (107985730),MIR1302-2 (100302278),FAM138A (645520),OR4F5 (79501),LOC100996442 (100996442),LOC729737 (729737),DDX11L17 (102725121),...,ND4 (4538),TRNH (4564),TRNS2 (4575),TRNL2 (4568),ND5 (4540),ND6 (4541),TRNE (4556),CYTB (4519),TRNT (4576),TRNP (4571)
GSM7305242,-0.338761,-0.518032,-0.926804,-1.195744,-0.689966,-0.513794,0.0,-0.792008,-1.114097,-0.042949,...,-0.196229,-0.296894,-0.128626,-0.228259,-0.257261,-0.272589,-0.430288,0.933716,-0.563834,2.094179
GSM7305243,-0.772286,-0.592162,-0.520198,0.227388,-0.689966,-0.513794,0.0,0.230646,-1.043767,-0.59512,...,-1.522927,-0.342298,-0.282273,-0.479933,-1.87791,-1.730727,-1.765064,-2.078147,-0.89592,-1.262044
GSM7305244,-0.64164,-0.09775,0.161495,-0.414447,0.113808,1.540561,0.0,-1.048532,-0.772398,-0.415002,...,-1.000546,-0.671483,-0.717486,-0.697839,-1.234282,-1.38547,-1.351699,-0.657039,-0.811298,-1.246428
GSM7305246,-0.677706,0.352829,0.674187,1.023704,-0.689966,-0.513794,0.0,0.099088,-0.458832,-0.925975,...,1.162957,3.169662,3.145757,1.855222,0.387953,0.179885,0.203383,0.857548,0.550896,0.725014
GSM7305247,1.480722,1.096704,0.279628,0.343862,-0.689966,-0.513794,0.0,-0.190683,-0.395707,1.132289,...,0.588619,-0.047661,0.328641,0.012584,0.178275,0.829679,1.466108,1.044506,-0.278405,0.496165


In [5]:
# Persist the z-scored matrix. The sample IDs exist only in the index, so the
# index must be written; with index=False the saved rows could no longer be
# traced back to their samples. pd.read_parquet restores it as the row index,
# leaving the gene columns unchanged.
zscored_data.to_parquet(final_file, index=True)