In [1]:
# import modules
import pandas as pd

## import files
1. gtex colon transverse samples
2. gtex small intestine samples
3. gtex vcf samples

In [2]:
# Colon transverse sample IDs, one per line. The first line of the file is
# data rather than a header, so the single column is named here.
colon = pd.read_csv(
    "/groups/umcg-fg/tmp04/projects/gut-bulk/ongoing/2024-02-07-GutPublicRNASeq/datasets/GTEx/tweaked_files/Colon_Transverse_header.txt",
    header=None,
    names=["samples"],
)
colon

Unnamed: 0,samples
0,GTEX-111CU
1,GTEX-111VG
2,GTEX-111YS
3,GTEX-1122O
4,GTEX-1128S
...,...
363,GTEX-ZY6K
364,GTEX-ZYFC
365,GTEX-ZYFG
366,GTEX-ZYVF


In [3]:
# Small intestine (terminal ileum) sample IDs, one per line. The first line of
# the file is data rather than a header, so the single column is named here.
small = pd.read_csv(
    "/groups/umcg-fg/tmp04/projects/gut-bulk/ongoing/2024-02-07-GutPublicRNASeq/datasets/GTEx/tweaked_files/Small_Intestine_Terminal_Ileum_header.txt",
    header=None,
    names=["samples"],
)
small

Unnamed: 0,samples
0,GTEX-111CU
1,GTEX-111YS
2,GTEX-1122O
3,GTEX-117YX
4,GTEX-1192X
...,...
169,GTEX-ZVT2
170,GTEX-ZVZP
171,GTEX-ZXES
172,GTEX-ZYFG


In [4]:
# Sample IDs taken from the GTEx VCF, one per line. The first line of the file
# is data rather than a header, so the single column is named here.
vcf = pd.read_csv(
    "/groups/umcg-fg/tmp04/projects/gut-bulk/ongoing/2024-02-07-GutPublicRNASeq/datasets/GTEx/tweaked_files/GTEX_VCF_samples.txt",
    header=None,
    names=["samples"],
)
vcf

Unnamed: 0,samples
0,GTEX-1A3MV-0002
1,GTEX-1A3MW-0004
2,GTEX-1A3MX-0002
3,GTEX-1A8FM-0004
4,GTEX-1A8G6-0004
...,...
931,GTEX-ZYVF-0004
932,GTEX-ZYY3-0002
933,GTEX-ZZ64-0004
934,GTEX-ZZPT-0002


## Check overlap between tissues

In [5]:
# Count the colon samples that are (True) / are not (False) also listed for
# the small intestine.
(
    colon["samples"]
    .isin(small["samples"])
    .value_counts()
)

samples
False    237
True     131
Name: count, dtype: int64

In [6]:
# Count the small-intestine samples that are (True) / are not (False) also
# listed for the colon.
(
    small["samples"]
    .isin(colon["samples"])
    .value_counts()
)

samples
True     131
False     43
Name: count, dtype: int64

### Make a new dataframe with unique samples so there is no overlap between tissues, while keeping the most samples possible

In [7]:
# Small-intestine samples whose ID does not occur in the colon sample list.
small_unique = small[~small["samples"].isin(colon["samples"])]

# Number of retained samples. len() counts rows; DataFrame.size counts cells
# (rows x columns) and only equals the row count for a one-column frame.
len(small_unique)

43

In [8]:
# Write the small-intestine-only sample IDs to a plain text file (one ID per
# line, no header row, no index column) for extracting them from the GTEx data.
small_unique.to_csv(
    "/groups/umcg-fg/tmp04/projects/gut-bulk/ongoing/2024-02-07-GutPublicRNASeq/datasets/GTEx/Small_Intestine_Terminal_Ileum_header_unique.txt",
    index=False,
    header=False,
)

In [9]:
# Union of the small-intestine-only samples and all colon samples, joined on
# the sample ID.
combined_unique = small_unique.merge(colon, on="samples", how="outer")
combined_unique

Unnamed: 0,samples
0,GTEX-111CU
1,GTEX-111VG
2,GTEX-111YS
3,GTEX-1122O
4,GTEX-1128S
...,...
406,GTEX-ZY6K
407,GTEX-ZYFC
408,GTEX-ZYFG
409,GTEX-ZYVF


In [41]:
# unique non-overlapping samples (gtex small intestine and colon transverse)
# 43 + 368 = 411

## Compare residual expression matrices with the original ones

In [29]:
# Original normalised small-intestine expression matrix (tab-separated),
# without the '#chr', 'start' and 'end' columns.
small_og = (
    pd.read_csv(
        "/groups/umcg-fg/tmp04/projects/gut-bulk/ongoing/2024-02-07-GutPublicRNASeq/datasets/GTEx/tweaked_files/Small_Intestine_Terminal_Ileum.v8.normalized_expression.bed_tweaked",
        sep="\t",
    )
    .drop(columns=["#chr", "start", "end"])
)
small_og.head()

Unnamed: 0,gene_id,GTEX-111CU,GTEX-111YS,GTEX-1122O,GTEX-117YX,GTEX-1192X,GTEX-11DXX,GTEX-11DXZ,GTEX-11EQ9,GTEX-11I78,...,GTEX-ZQUD,GTEX-ZTPG,GTEX-ZTSS,GTEX-ZTX8,GTEX-ZVP2,GTEX-ZVT2,GTEX-ZVZP,GTEX-ZXES,GTEX-ZYFG,GTEX-ZZ64
0,ENSG00000227232.5,-1.631157,-0.994458,-0.582842,-2.529314,-0.373775,0.38918,1.631157,-0.634562,-0.050154,...,2.529314,0.801474,0.093238,-0.19459,1.332009,-1.821233,0.549216,-0.90453,-1.298005,1.298005
1,ENSG00000268903.1,0.801474,-0.549216,0.451778,-1.174987,0.297985,0.435972,-1.332009,-1.04258,1.018224,...,0.97124,0.223872,-0.724805,-0.404678,-0.801474,-0.136497,0.312993,-1.204047,-0.90453,-1.265438
2,ENSG00000269981.1,0.312993,-0.420273,0.093238,-0.238583,0.021487,0.516201,-1.298005,-1.174987,0.97124,...,1.531223,0.451778,-0.050154,-0.781879,-0.688038,0.435972,0.549216,-0.467699,-0.549216,-1.405072
3,ENSG00000239906.1,1.093246,-0.781879,1.486515,0.688038,-0.781879,-0.781879,0.209208,-0.781879,0.404678,...,0.050154,-0.781879,0.821382,-0.781879,0.516201,0.724805,0.582842,-0.781879,-0.781879,-0.781879
4,ENSG00000241860.6,0.358459,-1.04258,-0.209208,0.435972,-0.343226,0.328072,-1.444596,-0.565949,0.165473,...,1.146886,-0.762579,0.268166,-0.467699,-0.724805,1.204047,0.253347,0.516201,-0.948535,-0.841621


In [48]:
# Original normalised colon transverse expression matrix (tab-separated),
# without the '#chr', 'start' and 'end' columns.
colon_og = (
    pd.read_csv(
        "/groups/umcg-fg/tmp04/projects/gut-bulk/ongoing/2024-02-07-GutPublicRNASeq/datasets/GTEx/tweaked_files/Colon_Transverse.v8.normalized_expression.bed_tweaked",
        sep="\t",
    )
    .drop(columns=["#chr", "start", "end"])
)
colon_og.head()

Unnamed: 0,gene_id,GTEX-111CU,GTEX-111VG,GTEX-111YS,GTEX-1122O,GTEX-1128S,GTEX-117YW,GTEX-117YX,GTEX-11DXX,GTEX-11DXZ,...,GTEX-ZV6S,GTEX-ZV7C,GTEX-ZVP2,GTEX-ZVT2,GTEX-ZXES,GTEX-ZY6K,GTEX-ZYFC,GTEX-ZYFG,GTEX-ZYVF,GTEX-ZZ64
0,ENSG00000227232.5,-2.020221,-0.811037,-0.711193,-0.379109,0.889036,-0.972859,-1.327686,0.33566,0.820512,...,-2.403023,1.775104,0.180999,0.601622,-0.719968,1.113293,2.137956,-1.22064,0.445683,-0.16032
1,ENSG00000268903.1,0.522064,-1.113293,-0.711193,1.04051,1.557966,-0.445683,-1.414614,0.342855,-1.295575,...,-0.187908,0.105487,-1.280009,1.064145,0.430727,-0.328482,-1.92523,-1.264747,0.983822,-0.919711
2,ENSG00000269981.1,0.940652,-1.684211,-0.577376,1.433337,0.773842,-0.16032,-0.878995,0.561399,-1.605421,...,-0.401099,0.830061,-0.909391,1.295575,0.321321,-0.292836,-1.178789,-1.63057,0.529864,-0.801635
3,ENSG00000241860.6,-0.626228,1.883589,-0.601622,0.801635,0.65964,-0.529864,1.064145,-0.514295,-0.569369,...,0.3573,0.33566,-0.792303,-1.113293,0.711193,-1.295575,-1.883589,-0.03737,0.820512,-1.844979
4,ENSG00000279457.4,-1.581249,-0.016983,-1.557966,-0.307047,0.292836,-1.344268,-0.792303,0.506556,0.401099,...,-0.869043,-0.506556,-1.04051,-0.3573,-0.003397,-0.229567,-0.37182,-0.423286,-0.642844,-0.030573


In [43]:
# Corrected (covariates-removed) colon expression matrix. The gene ID column
# is labelled '-' in the file and is renamed to 'gene_id' so it lines up with
# the original matrices.
colon_corrected = (
    pd.read_csv(
        "/groups/umcg-fg/tmp04/projects/gut-bulk/ongoing/2024-02-07-GutPublicRNASeq/datasets/GTEx/tweaked_files/colon_corrected_exp_residuals.txt.CovariatesRemovedOLS.txt.gz",
        sep="\t",
    )
    .rename(columns={"-": "gene_id"})
)

In [44]:
# Corrected (covariates-removed) small-intestine expression matrix. The gene
# ID column is labelled '-' in the file and is renamed to 'gene_id' so it
# lines up with the original matrices.
small_corrected = (
    pd.read_csv(
        "/groups/umcg-fg/tmp04/projects/gut-bulk/ongoing/2024-02-07-GutPublicRNASeq/datasets/GTEx/tweaked_files/small_intestine_corrected_exp_residuals.CovariatesRemovedOLS.txt.gz",
        sep="\t",
    )
    .rename(columns={"-": "gene_id"})
)

### Colon Transverse

In [45]:
# First five rows of the corrected colon matrix, ordered by gene ID.
colon_corrected.sort_values("gene_id", ascending=True).head(n=5)

Unnamed: 0,gene_id,GTEX-111CU,GTEX-111VG,GTEX-111YS,GTEX-1122O,GTEX-1128S,GTEX-117YW,GTEX-117YX,GTEX-11DXX,GTEX-11DXZ,...,GTEX-ZV6S,GTEX-ZV7C,GTEX-ZVP2,GTEX-ZVT2,GTEX-ZXES,GTEX-ZY6K,GTEX-ZYFC,GTEX-ZYFG,GTEX-ZYVF,GTEX-ZZ64
17867,ENSG00000000003.14,-0.125585,0.122961,0.190132,-0.217898,-0.051152,0.240522,-0.295547,-0.134806,-0.330283,...,0.028748,0.197875,-0.686771,-0.354474,-0.065292,-0.032659,-0.146246,-0.055387,0.517049,-0.059093
17844,ENSG00000000005.5,0.010117,-0.641715,0.010271,1.058705,-0.669473,0.328847,0.965188,-0.342902,-0.827583,...,0.446197,0.052086,-0.719239,-1.277457,-0.485614,-0.252758,-0.267719,0.750216,0.327439,-0.56186
14871,ENSG00000000419.12,-0.423136,-0.317468,-0.737069,-0.245801,-0.537045,-0.104784,-0.096081,0.108634,0.508258,...,0.159853,0.140508,-0.025376,0.011793,0.241893,-0.351446,0.24916,-0.501162,-0.680401,0.007788
16957,ENSG00000000457.13,-0.041449,-0.16739,-0.0082,-0.052758,0.311891,0.113554,0.285084,-0.026506,-0.271002,...,-0.003157,0.01111,0.056487,-0.304784,0.250279,0.025247,0.076726,0.05252,-0.105658,0.071061
16952,ENSG00000000460.16,-0.054366,0.012443,0.494128,0.692178,-0.029416,0.297539,0.419932,-0.049671,-0.365092,...,0.556855,-0.983542,0.130137,0.236196,0.469045,0.117601,0.483751,-0.432265,-0.812473,-0.016011


In [49]:
# First five rows of the original colon matrix, ordered by gene ID.
colon_og.sort_values("gene_id", ascending=True).head(n=5)

Unnamed: 0,gene_id,GTEX-111CU,GTEX-111VG,GTEX-111YS,GTEX-1122O,GTEX-1128S,GTEX-117YW,GTEX-117YX,GTEX-11DXX,GTEX-11DXZ,...,GTEX-ZV6S,GTEX-ZV7C,GTEX-ZVP2,GTEX-ZVT2,GTEX-ZXES,GTEX-ZY6K,GTEX-ZYFC,GTEX-ZYFG,GTEX-ZYVF,GTEX-ZZ64
25035,ENSG00000000003.14,0.76471,-0.498848,-1.04051,1.04051,0.408472,0.180999,-1.684211,1.126007,0.569369,...,-1.605421,0.859175,0.208692,0.755641,-0.180999,-0.105487,-1.151997,-1.433337,-1.581249,-0.03737
25034,ENSG00000000005.5,-0.250542,-0.201755,-1.151997,1.656795,-0.243539,-0.023778,0.65964,0.408472,0.49117,...,-0.972859,0.940652,-0.569369,-0.48352,2.020221,-0.236547,-0.878995,0.222598,-2.403023,-0.634514
23454,ENSG00000000419.12,1.883589,-0.438193,-0.58542,-0.229567,-0.453198,0.105487,-0.257558,0.711193,-0.13971,...,-0.243539,0.187908,0.112321,-0.617985,0.972859,0.05097,-0.811037,-0.676623,0.685189,0.236547
1880,ENSG00000000457.13,0.167205,-1.344268,-0.21564,1.808938,-0.016983,-2.780947,-0.153443,0.430727,1.844979,...,-0.071387,1.656795,1.22064,1.192507,-0.569369,0.514295,-0.642844,-0.460738,-1.605421,-0.078199
1875,ENSG00000000460.16,0.222598,-0.553464,0.408472,-0.983822,1.92523,0.668107,0.553464,-1.178789,-0.408472,...,1.113293,-0.899167,0.728799,-0.498848,-0.668107,-0.044169,0.651219,-0.878995,-2.137956,-0.174098


### Small Intestine

In [46]:
# First five rows of the corrected small-intestine matrix, ordered by gene ID.
small_corrected.sort_values("gene_id", ascending=True).head(n=5)

Unnamed: 0,gene_id,GTEX-111CU,GTEX-111YS,GTEX-1122O,GTEX-117YX,GTEX-1192X,GTEX-11DXX,GTEX-11DXZ,GTEX-11EQ9,GTEX-11I78,...,GTEX-ZQUD,GTEX-ZTPG,GTEX-ZTSS,GTEX-ZTX8,GTEX-ZVP2,GTEX-ZVT2,GTEX-ZVZP,GTEX-ZXES,GTEX-ZYFG,GTEX-ZZ64
18564,ENSG00000000003.14,-0.222048,-0.530015,-0.33324,-0.678661,-0.083825,-0.080519,-0.132773,0.004012,-0.049991,...,0.344905,0.171122,-0.387922,-0.163382,0.358474,-0.079498,-0.081287,-0.237264,0.438241,-0.125749
18563,ENSG00000000005.5,-0.226785,0.694028,-1.523535,-0.382715,0.024978,-0.776319,0.031286,0.681006,0.720152,...,-0.232511,0.466843,0.516051,-0.358751,-0.197416,-0.096412,-0.323239,-1.273409,0.213163,0.175741
15398,ENSG00000000419.12,0.172891,-0.181444,-0.297375,-0.290683,0.380267,-0.207423,0.189888,0.099097,0.265881,...,-0.535245,-0.099781,0.272663,0.371527,0.184507,0.190113,0.092329,0.179156,0.419439,0.011778
17274,ENSG00000000457.13,0.200255,0.302261,-0.528266,-0.880028,0.492428,-0.526823,0.20481,0.597575,-0.125308,...,-0.119311,-0.3275,0.720262,0.605838,-0.789266,-1.021951,-0.000722,0.267262,-0.234469,0.006142
17269,ENSG00000000460.16,-0.391952,0.287247,-0.588937,-0.477658,0.545524,-0.005307,-0.597606,0.059019,-0.337124,...,-0.081659,0.135501,0.43028,0.017747,-0.394846,0.023105,-0.16175,-0.71598,0.208643,0.105083


In [42]:
# First five rows of the original small-intestine matrix, ordered by gene ID.
small_og.sort_values("gene_id", ascending=True).head(n=5)

Unnamed: 0,gene_id,GTEX-111CU,GTEX-111YS,GTEX-1122O,GTEX-117YX,GTEX-1192X,GTEX-11DXX,GTEX-11DXZ,GTEX-11EQ9,GTEX-11I78,...,GTEX-ZQUD,GTEX-ZTPG,GTEX-ZTSS,GTEX-ZTX8,GTEX-ZVP2,GTEX-ZVT2,GTEX-ZVZP,GTEX-ZXES,GTEX-ZYFG,GTEX-ZZ64
25833,ENSG00000000003.14,-0.670001,0.435972,0.688038,-1.234161,-0.122053,0.007162,0.107634,0.420273,0.093238,...,-0.253347,0.883173,0.035817,1.57922,0.238583,0.706303,0.801474,1.119664,2.116695,0.312993
25832,ENSG00000000005.5,0.007162,-0.404678,-1.631157,0.165473,0.617139,-1.298005,0.821382,0.634562,0.328072,...,-0.021487,0.926308,0.883173,-0.38918,-1.119664,0.451778,-0.238583,-0.821382,-0.122053,-0.312993
24217,ENSG00000000419.12,1.367628,0.093238,0.078861,0.90453,-0.136497,-0.862211,-1.486515,-0.122053,-0.97124,...,0.821382,-0.948535,0.549216,0.451778,-0.297985,-0.358459,0.283044,-0.150969,1.174987,-0.035817
1948,ENSG00000000457.13,-0.688038,2.275814,0.617139,-1.821233,-1.119664,-0.180012,1.067571,1.367628,-0.064501,...,-1.093246,-0.724805,0.451778,1.405072,-0.483739,-0.549216,0.652179,1.821233,0.180012,0.821382
1943,ENSG00000000460.16,-0.565949,-1.821233,-2.275814,-0.451778,0.283044,-0.268166,-0.532636,-0.107634,-0.926308,...,-0.499904,0.150969,-1.631157,-1.265438,0.404678,0.634562,-1.902216,-0.841621,-0.420273,0.093238


## Remove samples from the small intestine file that are also in the colon tissue file

In [56]:
# Restrict the corrected small-intestine matrix to the samples that do not
# occur in the colon tissue. Sample IDs are the column labels of the matrix,
# so the filter is applied to the columns, not to any row of values.

# Step 1: sample IDs to keep (small-intestine-only samples)
sample_names = small_unique['samples'].tolist()
wanted_samples = set(sample_names)  # set gives constant-time membership tests

# Step 2: always keep the first column (gene IDs); keep a sample column only
# when its label is one of the wanted sample IDs. Column order is preserved.
columns_to_keep = [small_corrected.columns[0]] + [
    col for col in small_corrected.columns[1:] if col in wanted_samples
]

# Step 3: select those columns
filtered_df2 = small_corrected.loc[:, columns_to_keep]

filtered_df2

Unnamed: 0,gene_id,GTEX-1192X,GTEX-11XUK,GTEX-12WSK,GTEX-13113,GTEX-131XF,GTEX-13NYB,GTEX-13RTK,GTEX-1497J,GTEX-14PJ6,...,GTEX-Z9EW,GTEX-ZA64,GTEX-ZAB5,GTEX-ZC5H,GTEX-ZLFU,GTEX-ZQG8,GTEX-ZQUD,GTEX-ZTPG,GTEX-ZTX8,GTEX-ZVZP
0,ENSG00000245954.6,-0.125589,-0.434996,-0.120499,-0.255766,-0.167354,-0.356928,-0.540676,-0.412876,-0.280577,...,-0.074092,-0.369572,0.870598,-0.049238,-0.302133,0.239150,0.142561,0.259389,-0.179769,-0.071198
1,ENSG00000270750.1,0.637589,0.728895,-1.231306,2.029209,-0.186725,0.774697,1.783774,-1.068160,-0.712089,...,-0.530440,0.414253,-0.578670,0.141690,1.494270,0.495373,0.343417,-0.361793,-0.959140,-0.346925
2,ENSG00000109670.13,-1.027574,0.639308,0.729027,0.059876,-0.015989,-0.360794,-0.330998,-0.257798,0.323822,...,-0.003581,-0.653694,-0.651214,0.002663,-0.482935,-0.284695,-0.101072,-0.435743,0.247530,-0.365990
3,ENSG00000268471.6,-0.356881,-0.136848,0.164374,-0.281317,-0.835043,0.309747,0.036072,0.149183,0.173107,...,0.005122,0.115118,0.166367,0.165085,0.358021,-0.035727,0.263186,0.072397,-0.076848,0.069580
4,ENSG00000243417.1,0.377663,-0.350221,-0.556788,0.171514,-0.614380,-0.815876,0.094585,-0.467251,-0.093810,...,0.163700,-0.761313,0.044427,0.025507,-0.844268,0.159377,-0.397117,0.537803,0.640848,0.252688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26177,ENSG00000251611.1,0.058404,0.115611,0.817750,0.842124,0.127975,0.273064,-0.165817,-0.641125,-0.104215,...,0.234711,0.242675,-0.270324,0.219554,-0.582996,-0.036818,-0.501102,-1.335711,0.687322,0.123651
26178,ENSG00000164142.15,-0.367918,-0.147131,0.270928,0.038802,-0.117554,0.286313,-0.142399,0.076303,-0.053587,...,0.364027,-0.252661,0.304716,0.120803,-0.199440,-0.017246,0.285987,-0.018387,0.257640,-0.319242
26179,ENSG00000278978.1,0.202110,0.208428,0.094813,-0.231357,0.265683,-0.069300,-0.044727,0.229672,-0.379584,...,0.263651,0.034889,0.669614,-0.315791,-0.001337,0.239919,0.297538,-0.189878,0.010994,0.379460
26180,ENSG00000251455.1,0.236351,0.539507,0.031255,-0.511627,0.768866,0.071690,-1.042532,-1.344155,0.317925,...,-0.233419,-0.569667,-0.501246,0.440835,0.609497,0.140270,0.363186,0.319092,1.142244,-1.367860


In [57]:
# Combine the corrected colon matrix with the filtered small-intestine matrix
# on gene ID. The outer join keeps genes present in only one of the two
# inputs; those rows hold NaN for the other input's samples.
colon_small_merged_exp = colon_corrected.merge(
    filtered_df2,
    on="gene_id",
    how="outer",
)
colon_small_merged_exp

Unnamed: 0,gene_id,GTEX-111CU,GTEX-111VG,GTEX-111YS,GTEX-1122O,GTEX-1128S,GTEX-117YW,GTEX-117YX,GTEX-11DXX,GTEX-11DXZ,...,GTEX-Z9EW,GTEX-ZA64,GTEX-ZAB5,GTEX-ZC5H,GTEX-ZLFU,GTEX-ZQG8,GTEX-ZQUD,GTEX-ZTPG,GTEX-ZTX8,GTEX-ZVZP
0,ENSG00000000003.14,-0.125585,0.122961,0.190132,-0.217898,-0.051152,0.240522,-0.295547,-0.134806,-0.330283,...,0.107824,0.135774,-0.098130,-0.470172,0.068491,-0.693091,0.344905,0.171122,-0.163382,-0.081287
1,ENSG00000000005.5,0.010117,-0.641715,0.010271,1.058705,-0.669473,0.328847,0.965188,-0.342902,-0.827583,...,-0.320053,-0.289542,-1.122744,-0.567251,0.462084,-0.011898,-0.232511,0.466843,-0.358751,-0.323239
2,ENSG00000000419.12,-0.423136,-0.317468,-0.737069,-0.245801,-0.537045,-0.104784,-0.096081,0.108634,0.508258,...,0.551694,0.452381,0.297868,0.853599,0.320763,0.150596,-0.535245,-0.099781,0.371527,0.092329
3,ENSG00000000457.13,-0.041449,-0.167390,-0.008200,-0.052758,0.311891,0.113554,0.285084,-0.026506,-0.271002,...,0.907654,0.009661,0.068730,-1.176399,-0.058382,-0.169158,-0.119311,-0.327500,0.605838,-0.000722
4,ENSG00000000460.16,-0.054366,0.012443,0.494128,0.692178,-0.029416,0.297539,0.419932,-0.049671,-0.365092,...,0.030635,-0.132023,0.089073,-0.341451,0.368107,-0.085542,-0.081659,0.135501,0.017747,-0.161750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26716,ENSG00000284471.1,1.129968,0.027716,0.700809,-0.197417,-0.283396,0.258407,0.478468,-0.536028,0.132640,...,,,,,,,,,,
26717,ENSG00000284523.1,,,,,,,,,,...,-0.257682,0.072198,0.083362,-0.018611,-0.277701,-0.176770,0.655295,1.226840,0.275557,-0.463309
26718,ENSG00000284526.1,0.102996,0.128432,-0.281760,0.519027,0.334135,0.126460,0.316843,-0.458153,0.310750,...,-0.745528,0.524310,-0.055921,-0.235817,0.228225,0.237873,0.167978,0.088965,0.030604,0.829662
26719,ENSG00000284543.1,0.730508,0.055223,0.494915,0.175964,-0.070724,0.132223,0.281417,0.483452,0.022299,...,,,,,,,,,,


In [62]:
# Save the combined corrected expression matrix as a tab-separated file
# without the index column.
colon_small_merged_exp.to_csv(
    "/groups/umcg-fg/tmp04/projects/gut-bulk/ongoing/2024-02-07-GutPublicRNASeq/datasets/GTEx/tweaked_files/combined_corrected_exp.txt",
    sep="\t",
    index=False,
)

## Create the linkfile for mbQTL

In [65]:
# Expression sample IDs for the link file, one per line. The first line of the
# file is data rather than a header, so the single column is named here.
linkfile = pd.read_csv(
    "/groups/umcg-fg/tmp04/projects/gut-bulk/ongoing/2024-02-07-GutPublicRNASeq/datasets/GTEx/tweaked_files/linkfile_gtex.txt",
    header=None,
    names=["exp_samples"],
)
linkfile

Unnamed: 0,exp_samples
0,GTEX-111CU
1,GTEX-111VG
2,GTEX-111YS
3,GTEX-1122O
4,GTEX-1128S
...,...
406,GTEX-ZQG8
407,GTEX-ZQUD
408,GTEX-ZTPG
409,GTEX-ZTX8


In [66]:
# Sample IDs from the GTEx VCF, one per line. The first line of the file is
# data rather than a header, so the single column is named here.
gtex_vcf_samples = pd.read_csv(
    "/groups/umcg-fg/tmp04/projects/gut-bulk/ongoing/2024-02-07-GutPublicRNASeq/datasets/GTEx/tweaked_files/GTEX_VCF_samples.txt",
    header=None,
    names=["vcf_samples"],
)
gtex_vcf_samples

Unnamed: 0,vcf_samples
0,GTEX-1A3MV-0002
1,GTEX-1A3MW-0004
2,GTEX-1A3MX-0002
3,GTEX-1A8FM-0004
4,GTEX-1A8G6-0004
...,...
931,GTEX-ZYVF-0004
932,GTEX-ZYY3-0002
933,GTEX-ZZ64-0004
934,GTEX-ZZPT-0002


In [83]:
# Build the link table: VCF sample ID, expression sample ID, and the dataset
# the expression sample is taken from.
#
# In the data shown in this notebook a VCF ID is the expression sample ID plus
# a "-NNNN" suffix (e.g. GTEX-111CU-0003). A VCF ID is therefore assigned to
# an expression sample only when it equals that ID or starts with "<ID>-".
# Plain substring matching (str.contains) would also pair a sample with any
# VCF ID that merely contains its ID (for instance a short ID that is the
# prefix of a longer one) and would interpret the ID as a regular expression.
small_intestine_ids = set(small_unique["samples"])
vcf_ids = gtex_vcf_samples["vcf_samples"].dropna().astype(str).tolist()

# Rows of the link table, and the expression samples without any VCF match
result_list = []
missing_substrings = []

for exp_sample in linkfile["exp_samples"]:
    prefix = f"{exp_sample}-"
    matches = [
        vcf_id for vcf_id in vcf_ids
        if vcf_id == exp_sample or vcf_id.startswith(prefix)
    ]

    if not matches:
        missing_substrings.append(exp_sample)
        continue

    # Samples that were kept only for the small intestine are labelled as
    # such; every other sample comes from the colon matrix.
    if exp_sample in small_intestine_ids:
        source = "GTEx_Small_Intestine"
    else:
        source = "GTEx_Colon_Transverse"

    # One output row per matching VCF ID, in VCF file order
    result_list.extend([vcf_id, exp_sample, source] for vcf_id in matches)

# Convert the list of results into a new DataFrame
result_df = pd.DataFrame(result_list, columns=["vcf_samples", "exp_sample", "source"])

# Convert the list of unmatched expression samples into a new DataFrame
missing_df = pd.DataFrame(missing_substrings, columns=["exp_sample"])

# Show the resulting DataFrames
print("Result DataFrame with found matches:")
print(result_df)

print("\nMissing Substrings DataFrame:")
print(missing_df)


Result DataFrame with found matches:
         vcf_samples  exp_sample                 source
0    GTEX-111CU-0003  GTEX-111CU  GTEx_Colon_Transverse
1    GTEX-111VG-0004  GTEX-111VG  GTEx_Colon_Transverse
2    GTEX-111YS-0004  GTEX-111YS  GTEx_Colon_Transverse
3    GTEX-1122O-0004  GTEX-1122O  GTEx_Colon_Transverse
4    GTEX-1128S-0001  GTEX-1128S  GTEx_Colon_Transverse
..               ...         ...                    ...
401   GTEX-ZQG8-0003   GTEX-ZQG8   GTEx_Small_Intestine
402   GTEX-ZQUD-0002   GTEX-ZQUD   GTEx_Small_Intestine
403   GTEX-ZTPG-0003   GTEX-ZTPG   GTEx_Small_Intestine
404   GTEX-ZTX8-0004   GTEX-ZTX8   GTEx_Small_Intestine
405   GTEX-ZVZP-0003   GTEX-ZVZP   GTEx_Small_Intestine

[406 rows x 3 columns]

Missing Substrings DataFrame:
   exp_sample
0  GTEX-117YX
1  GTEX-1C4CL
2  GTEX-1CAMS
3  GTEX-1LVAN
4   GTEX-QLQ7


Why are these 5 samples not found as substrings in the VCF header?

In [84]:
result_df.source.value_counts()

source
GTEx_Colon_Transverse    363
GTEx_Small_Intestine      43
Name: count, dtype: int64

In [85]:
result_df.to_csv("/groups/umcg-fg/tmp04/projects/gut-bulk/ongoing/2024-02-07-GutPublicRNASeq/datasets/GTEx/tweaked_files/linkfile_gtex_final.txt", index=False, header=False, sep="\t")