In [1]:
import os
import pandas as pd

# Define the directory where the folders are located
directory_path = 'gdc_TCGA_BRCA_raw_data'

# Initialize a DataFrame to hold the gene information (assuming it's consistent across all files)
gene_info = pd.DataFrame()

# Initialize a dictionary to hold the TPM data
tpm_data = {}

# Loop through each folder in the directory
for folder_name in os.listdir(directory_path):
    folder_path = os.path.join(directory_path, folder_name)
    
    # Check if it's a folder
    if os.path.isdir(folder_path):
        # In each folder, find the TSV file (assuming there's only one TSV file per folder)
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.tsv'):
                file_path = os.path.join(folder_path, file_name)
                
                # Read the TSV file into a DataFrame, with headers starting from the second row
                data = pd.read_csv(file_path, sep='\t', header=1)
                
                # Check if the 'tpm_unstranded' column exists
                if 'tpm_unstranded' in data.columns:
                    # If gene_info is empty, initialize it with gene_id, gene_name, and gene_type
                    if gene_info.empty:
                        gene_info = data[['gene_id', 'gene_name', 'gene_type']].set_index('gene_id')
                    
                    # Add the 'tpm_unstranded' values to the tpm_data dictionary
                    tpm_data[folder_name] = data.set_index('gene_id')['tpm_unstranded']
                else:
                    print(f"Column 'tpm_unstranded' not found in {file_name}")
                break  # Stop looking if we've found and processed the TSV

# Convert the TPM data dictionary to a DataFrame
tpm_df = pd.DataFrame(tpm_data)

# Combine the gene information with the TPM data
combined_df = gene_info.join(tpm_df)


combined_df

Unnamed: 0_level_0,gene_name,gene_type,2c3000b7-4db9-4f00-a82a-ca6802806631,519b92e0-0dbc-471c-988f-48ce9dc95be3,c9230a1c-9844-43ef-a2c3-95ea6061000b,f2bf29d2-4c33-4810-90e8-88f37ea47bc8,59858555-bc6a-4286-8280-0f8341123cac,de6f1503-33d7-4f86-b835-bdffba7ea4e3,f0a63361-78bb-4a7f-9d78-31f7b2980ba2,121a48ed-798d-4102-bb8f-e736570e18d6,...,8ebe0bf6-11fa-418d-918c-5c73f0e7e9ac,d889e5ea-58f5-4d2f-ad1e-6ab5d3217642,39b28d5e-fc51-4dfc-bcbe-0285c3bad830,08837ae7-6f4f-4aa1-8722-7c404b66ed75,4c941474-f3e0-4735-bad0-cff8894fb08d,5dee4a6f-9cf4-4360-a37b-e7b94aa70dac,86d52d54-1b0d-40b1-99ce-82b4e345acb8,451ecdf8-433f-4d38-8ab6-3cc8d428ed95,67347e29-6d43-4234-8b04-188be7e6440b,055f6138-ea7a-468d-90c2-87a473486867
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N_unmapped,,,,,,,,,,,...,,,,,,,,,,
N_multimapping,,,,,,,,,,,...,,,,,,,,,,
N_noFeature,,,,,,,,,,,...,,,,,,,,,,
N_ambiguous,,,,,,,,,,,...,,,,,,,,,,
ENSG00000000003.15,TSPAN6,protein_coding,42.3897,12.6537,41.4571,33.4127,120.6754,13.2474,14.5023,59.4883,...,63.4952,17.2950,39.9454,71.2799,39.9261,26.0336,7.0782,58.8731,74.1058,153.7924
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000288669.1,AC008763.4,protein_coding,0.0000,0.0000,0.0143,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
ENSG00000288670.1,AL592295.6,lncRNA,11.8231,14.0830,17.6123,28.9473,24.3959,5.6378,10.1150,12.3362,...,15.1854,24.0037,7.5186,13.8258,21.7852,13.9445,11.1585,10.6521,12.6482,13.0155
ENSG00000288671.1,AC006486.3,protein_coding,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
ENSG00000288674.1,AL391628.1,protein_coding,0.0607,0.0116,0.0533,0.0319,0.0680,0.0430,0.0459,0.1270,...,0.0393,0.0051,0.0142,0.0424,0.0629,0.0322,0.0438,0.0616,0.0242,0.0162


In [2]:
# Remove first four rows from combined_df
combined_df = combined_df.iloc[4:]
combined_df

Unnamed: 0_level_0,gene_name,gene_type,2c3000b7-4db9-4f00-a82a-ca6802806631,519b92e0-0dbc-471c-988f-48ce9dc95be3,c9230a1c-9844-43ef-a2c3-95ea6061000b,f2bf29d2-4c33-4810-90e8-88f37ea47bc8,59858555-bc6a-4286-8280-0f8341123cac,de6f1503-33d7-4f86-b835-bdffba7ea4e3,f0a63361-78bb-4a7f-9d78-31f7b2980ba2,121a48ed-798d-4102-bb8f-e736570e18d6,...,8ebe0bf6-11fa-418d-918c-5c73f0e7e9ac,d889e5ea-58f5-4d2f-ad1e-6ab5d3217642,39b28d5e-fc51-4dfc-bcbe-0285c3bad830,08837ae7-6f4f-4aa1-8722-7c404b66ed75,4c941474-f3e0-4735-bad0-cff8894fb08d,5dee4a6f-9cf4-4360-a37b-e7b94aa70dac,86d52d54-1b0d-40b1-99ce-82b4e345acb8,451ecdf8-433f-4d38-8ab6-3cc8d428ed95,67347e29-6d43-4234-8b04-188be7e6440b,055f6138-ea7a-468d-90c2-87a473486867
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003.15,TSPAN6,protein_coding,42.3897,12.6537,41.4571,33.4127,120.6754,13.2474,14.5023,59.4883,...,63.4952,17.2950,39.9454,71.2799,39.9261,26.0336,7.0782,58.8731,74.1058,153.7924
ENSG00000000005.6,TNMD,protein_coding,0.3958,0.1506,382.4071,0.5883,0.3628,3.1532,0.0000,11.1718,...,9.0534,1.6400,2.4515,0.4735,9.3496,1.6442,0.0000,2.7286,0.0000,0.3706
ENSG00000000419.13,DPM1,protein_coding,344.2597,103.1169,53.1594,235.6451,206.3235,72.9635,112.4400,92.4268,...,119.5885,168.8744,84.3919,198.4460,96.4317,111.9534,113.4449,105.3500,164.8172,82.4091
ENSG00000000457.14,SCYL3,protein_coding,18.7206,13.7395,6.2119,25.6634,17.7533,10.5259,23.0657,25.1548,...,12.8634,10.3641,19.4906,15.2549,23.4602,22.8729,41.6654,25.1427,7.3300,15.3934
ENSG00000000460.17,C1orf112,protein_coding,24.0859,5.4633,1.7738,17.6262,43.1789,7.5880,7.3508,9.0705,...,4.9130,5.8835,7.2503,31.2738,7.7083,9.8685,15.8987,3.6309,8.3078,3.7825
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000288669.1,AC008763.4,protein_coding,0.0000,0.0000,0.0143,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
ENSG00000288670.1,AL592295.6,lncRNA,11.8231,14.0830,17.6123,28.9473,24.3959,5.6378,10.1150,12.3362,...,15.1854,24.0037,7.5186,13.8258,21.7852,13.9445,11.1585,10.6521,12.6482,13.0155
ENSG00000288671.1,AC006486.3,protein_coding,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
ENSG00000288674.1,AL391628.1,protein_coding,0.0607,0.0116,0.0533,0.0319,0.0680,0.0430,0.0459,0.1270,...,0.0393,0.0051,0.0142,0.0424,0.0629,0.0322,0.0438,0.0616,0.0242,0.0162


In [5]:
#save to csv file
combined_df.to_csv('combined_tpm_unstranded.csv', sep=',', index=True)
