In [None]:
import pandas as pd
import numpy as np

In [134]:
# Load the stemness-score table (indexed by its first column) and keep only the
# rows whose cohort is 'GTEX'. The .copy() makes the filtered frame independent of
# the original so the column assignments below do not raise SettingWithCopyWarning.
df = pd.read_csv('../data/TCGA-TARGET-GTEx/mRNA_StemScore.extended.tsv',
                 sep='\t', header=0, index_col=0)
df = df[df['cohort'] == 'GTEX'].copy()
# Subject ID = index label with its last three '-'-separated fields removed.
# `n` is passed by keyword because pandas 2.0 no longer accepts it positionally.
df['SUBJID'] = df.index.str.rsplit('-', n=3).str[0]

# IDs from the genotype ID file as a flat array. DataFrame.squeeze("columns")
# replaces read_csv(squeeze=True), which was removed in pandas 2.0.
genotype_ids = pd.read_csv('../data/TCGA-TARGET-GTEx/GTEx_genotype_subjIDs.txt', header=None).squeeze("columns").values
# Ancestry table; '_' is replaced by '-' in its index so the IDs can be compared
# with the other ID lists.
ancestry = pd.read_csv('../data/GTEx/GTEx.v7.ancestry.txt', sep='\t', header=0, index_col=0)
ancestry.index = ancestry.index.str.replace('_', '-')
# NOTE(review): ancestry code 0 is presumably the group named by `white_ids` -
# confirm against the ancestry file's documentation.
white_ids = ancestry[ancestry['ancestry'] == 0].index
# IDs present in the covariate file (its first column).
cov_ids = pd.read_csv('../data/TCGA-TARGET-GTEx/GTEx-mutli-tissue/01_ori_input/cov.txt', sep='\t', header=None, index_col=0).index
# Keep only subjects that appear in all three ID sets.
ids = np.intersect1d(cov_ids, np.intersect1d(genotype_ids, white_ids))
# Restrict to the selected body sites and to the retained subjects.
df = df.loc[df['body_site'].isin(['Uterus', 'Breast', 'Thyroid', 'Lung', 'Esophagus', 'Stomach', 'Colon', 'Prostate', 'Liver']), ]
df = df.loc[df['SUBJID'].isin(ids), ]
# Tissue names are later used as column headers and file names, so spaces are
# replaced with underscores.
df['tissue'] = df['tissue'].str.replace(' ', '_')
df = df.sort_index()

# 1. Tissue information file: this file contains information on which tissues are collected from each sample. Here is one such file (tissue_info.txt)

1. First line always starts with "#TISSUE" followed by tissue names (e.g. "cortex," "heart," "liver," and "spleen")

2. From the second line, information on which tissues are collected from each sample is specified

   2.1 First column is sample ID (e.g. "a" "b" "c" ...)

   2.2. The second and later columns are 1 if the tissue was collected from this individual, or 0 otherwise. For example, cortex, heart, and liver were collected from sample "a", while liver and spleen were collected from sample "b".

3. IMPORTANT!!! Columns must be separated by a whitespace (space or tab), but multiple whitespaces are NOT allowed and will cause errors.


In [135]:
# Subject-by-tissue indicator matrix, initialised to 0 everywhere.
# Rows are the distinct subject IDs (sorted), columns are the distinct tissues;
# the index is named '#TISSUE' so that it becomes the first token of the header
# line when the frame is written out.
tissue_info = (
    pd.DataFrame(0, index=df['SUBJID'].unique(), columns=df['tissue'].unique())
    .rename_axis('#TISSUE')
    .sort_index()
)

In [136]:
# Mark with 1 every (subject, tissue) pair for which df has at least one row.
tissue_list = df['tissue'].unique()
for tissue in tissue_list:
    # Subject IDs of all rows belonging to this tissue.
    subjects_with_tissue = df.loc[df['tissue'] == tissue, 'SUBJID']
    tissue_info.loc[subjects_with_tissue, tissue] = 1
    

In [131]:
# Write the tissue indicator matrix as a tab-separated file.
# NOTE(review): this is a machine-specific absolute path, unlike the relative
# '../data/...' paths used by the other cells - confirm it points at the same
# 01_ori_input directory before running elsewhere.
tissue_info_path = '/home/omics/DATA1/haeun/stemness/data/TCGA-TARGET-GTEx/GTEx-mutli-tissue/01_ori_input/tissue_info.txt'
tissue_info.to_csv(tissue_info_path, sep='\t')

# 2. Gene expression data: there are 3 different types of files specifying information on gene expression data.

## 2.1. Gene expression file: This specifies gene expression level for each sample measured on multiple probes. Meta-Tissue expects one gene expression file for each tissue. So, if there are 4 tissues collected, then there must be 4 files. The format of each gene expression file is as follows (cortex.txt):

1. First line lists sample IDs collected for this tissue.

2. Please note that samples that are not collected for this tissue must not be listed in gene expression file. For example, sample "b" is not listed in this file (cortex.txt) since cortex tissue was not collected from sample "b" (see above tissue information file).

3. IMPORTANT!!! The order of sample IDs must match the order of sample IDs in the tissue information file. For example, "a d e c" in the first line of gene expression file is incorrect because "c" comes before "d" in the tissue information file.

4. Second line and later specify gene expression level for each sample. Each row is a probe for gene expression and each column corresponds to a sample.

In [None]:
# Write one expression file per tissue: a header line listing the subject IDs,
# followed by a single data row holding each subject's mRNAsi value.
grouped = df.groupby('tissue')
for name, group in grouped:
    expression_row = (
        group.drop_duplicates('SUBJID')        # keep the first row per subject
             .set_index('SUBJID')[['mRNAsi']]  # double brackets keep a DataFrame; on a
                                               # Series, .T is a no-op and the IDs
                                               # would never reach the file
             .sort_index()                     # same subject order as tissue_info,
                                               # which is sorted by subject ID
             .T                                # subjects become columns
    )
    # index=False drops the 'mRNAsi' row label; the column header (subject IDs)
    # is still written as the first line.
    expression_row.to_csv('../data/TCGA-TARGET-GTEx/GTEx-mutli-tissue/01_ori_input/{0}.txt'.format(name), sep='\t', index=False)

## 2.2 Gene expression list file: This specifies where the above gene expression files are stored. Specifically, it lists the full path to the above files. Here is one example (gene_list.txt)

1. Each line lists the full path to the gene expression file.

2. IMPORTANT!!! The order of gene expression files in this file must be the same as the order of tissues listed in the first line of tissue information file (tissue_info.txt). In the above example, the order of tissues was "cortex heart liver spleen," and the order of expression files listed here is also "cortex heart liver spleen."

3. IMPORTANT!!! Always use the full or absolute path (e.g. "/usr/home/[user_id]/") when you specify files in Meta-Tissue. Relative path (e.g. "../") or tilde ("~") may cause errors.


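## 2.3 Gene expression information file: This specifies information on each probe used in the gene expression files.

1. Each line lists information on each probe.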

2. The first column must be probe ID.

3. The second column must be "chr[chr_number]" where chr_number can be 1-22, X, Y, and XY.

4. The third column must be position of probes (starting position).

In [116]:
# Load the covariate table: tab-separated, no header row, first column used as
# the row index.
cov_path = '../data/TCGA-TARGET-GTEx/GTEx-mutli-tissue/01_ori_input/cov.txt'
cov = pd.read_csv(cov_path, sep='\t', header=None, index_col=0)

In [133]:
# Keep only the covariate rows whose ID also appears in tissue_info, ordered by ID,
# and write them back without a header line.
# NOTE(review): the output path is the same cov.txt that `cov` was read from, so
# this overwrites the input file - confirm that is intended before re-running.
shared_subjects = np.intersect1d(tissue_info.index, cov.index)
cov_subset = cov.loc[shared_subjects, :].sort_index()
cov_subset.to_csv('../data/TCGA-TARGET-GTEx/GTEx-mutli-tissue/01_ori_input/cov.txt', sep='\t', header=False)

# SPLIT chromosome

In [151]:
# Create one copy of the template file per chromosome (1-22). In each copy, every
# line that contains a '.' gets '_chr<N>' inserted just before its last '.';
# lines without a '.' are copied unchanged.
#
# The template is read once, before any output file is opened, so a missing or
# unreadable template fails immediately instead of leaving an empty chr1 file
# behind, and the file is not re-read for each of the 22 outputs.
with open("GTEx-multi-tissue/par.PED.EIGENSTRAT", 'r') as f:
    template_lines = f.readlines()

for i in range(1, 23):
    with open('GTEx-multi-tissue/par.PED.EIGENSTRAT_chr{0}'.format(i), 'w') as out_f:
        for line in template_lines:
            tokens = line.rsplit('.', 1)
            if len(tokens) == 2:
                # tokens[1] still carries the trailing newline of the line.
                out_f.write("{0}_chr{1}.{2}".format(tokens[0], i, tokens[1]))
            else:
                out_f.write(line)