# Pre-process the data used for labelling TCGA data
The TCGA data is labelled using 4 different databases:
1. [CIViC](https://civicdb.org/home)
2. [Martelotto *et al.*](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-014-0484-1)
3. [CGC](https://cancer.sanger.ac.uk/census)
4. [Bailey *et al.*](https://doi.org/10.1016/j.cell.2018.02.060)

The first two methods of labelling use variant position and nucleotide change to label mutations.
The other two methods label genes.

In [1]:
# Import required packages
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import re
import random
import numpy as np

Set the path to save output files (PATH) and folder with data files (DATAPATH)

In [4]:
# Project root; output files are saved under this folder.
PATH = "D:/Projects/IdentificationOfTSG-OG/cTaG_2"
# Input data files live in the "data" sub-folder of the project root.
DATAPATH = PATH + "/data"
# Alternative project roots (DATAPATH is always PATH + "/data"):
# PATH = "D:/Projects/cTaG2.0"
# PATH = "/data/malvika/cTaG2.0"

## CIViC data

In [6]:
# Move into the CIViC folder and read the filtered variant table
# (tab-separated; the first row holds the column names).
civic_dir = DATAPATH + "/driver genes/CIViC/"
os.chdir(civic_dir)
fname = "civic_filtered.txt"
data_civic = pd.read_csv(fname, delimiter="\t", header=0)

Create column with mutation location

In [7]:
# Build one "chr<chrom>:<start>-<end>|<ref>|<alt>" key per row from the
# Chr, Start38, End38, reference_bases and variant_bases columns.
# "{:.0f}" prints the coordinates without a decimal part.
data_civic["Mut_loc"] = [
    "chr{}:{:.0f}-{:.0f}|{}|{}".format(chrom, start_pos, end_pos, ref_bases, var_bases)
    for chrom, start_pos, end_pos, ref_bases, var_bases in zip(
        data_civic.Chr,
        data_civic.Start38,
        data_civic.End38,
        data_civic.reference_bases,
        data_civic.variant_bases,
    )
]

Save file

In [8]:
# Write the CIViC table (tab-separated, with column names, without the
# row index) to civic_filtered.txt in the CIViC folder.
civic_dir = DATAPATH + "/driver genes/CIViC/"
os.chdir(civic_dir)
fname = "civic_filtered.txt"
data_civic.to_csv(fname, index=False, header=True, sep="\t")

## Martelotto *et al.* dataset

In [9]:
# Move into the Martelotto et al folder and read the filtered table
# (tab-separated; the first row holds the column names).
martelotto_dir = DATAPATH + "/driver genes/Martelotto et al/"
os.chdir(martelotto_dir)
fname = "martelotto_filter.txt"
data_martelotto = pd.read_csv(fname, delimiter="\t", header=0)

Convert mutation locations from hg19 to GRCh38 using LiftOver (LO).
First, generate the LO input file.

In [10]:
# Build one LiftOver query region ("chr<chrom>:<start>-<end>") per row.
# The end coordinate is start + (number of nucleotide changes) - 1, i.e. the
# region is an inclusive range covering every changed nucleotide.
mar_lo_ip = [
    "chr{}:{:.0f}-{:.0f}".format(chrm, start, int(start) + int(n_changes) - 1)
    for chrm, start, n_changes in zip(
        data_martelotto.Chromosome,
        data_martelotto.Position,
        data_martelotto["Number of nucleotide changes"],
    )
]
os.chdir(DATAPATH + "/driver genes/Martelotto et al/")
fname = "mar_loip.txt"
# header=False is passed explicitly: the default for Series.to_csv changed
# from False to True between pandas versions, and a header row ("0") would
# be written into the query file as if it were a genomic region.
pd.Series(mar_lo_ip).to_csv(fname, index=False, header=False)

  after removing the cwd from sys.path.


Load LO output

In [11]:
# Read the LiftOver output file; it has no header row, so the data ends up
# in integer-labelled columns (column 0 is used downstream).
martelotto_dir = DATAPATH + "/driver genes/Martelotto et al/"
os.chdir(martelotto_dir)
fname = "hglft_genome_23b38_542bd0.bed"
mar_loc = pd.read_csv(fname, header=None)


Create column with mutation location

In [12]:
# Split each LiftOver output region ("chr<chrom>:<start>-<end>") into its
# three parts. The pattern is a raw string so that "\d" reaches the regex
# engine unchanged instead of being an invalid string escape, and it is
# compiled and applied once per row rather than three times.
lo_region = re.compile(r"^chr(.+):(\d+)-(\d+)$")
lo_parts = [lo_region.search(region).groups() for region in mar_loc[0]]

# The captured groups are kept as strings, so the columns hold text values.
data_martelotto["Chromosome_38"] = [chrm for chrm, _, _ in lo_parts]
data_martelotto["Start_38"] = [start for _, start, _ in lo_parts]
data_martelotto["End_38"] = [end for _, _, end in lo_parts]

# Build one "chr<chrom>:<start>-<end>|<ref>|<alt>" key per row from the
# lifted-over coordinates and the reference/alternate alleles.
data_martelotto["Mut_loc"] = [
    "chr{}:{}-{}|{}|{}".format(chrm, start, end, wt, mt)
    for chrm, start, end, wt, mt in zip(
        data_martelotto.Chromosome_38,
        data_martelotto.Start_38,
        data_martelotto.End_38,
        data_martelotto["Reference allele"],
        data_martelotto["Alternate allele"],
    )
]

Save file

In [13]:
# Write the Martelotto table (tab-separated, with column names, without the
# row index) to martelotto_final.txt in the Martelotto et al folder.
martelotto_dir = DATAPATH + "/driver genes/Martelotto et al/"
os.chdir(martelotto_dir)
fname = "martelotto_final.txt"
data_martelotto.to_csv(fname, sep="\t", header=True, index=False)

## CMC

In [None]:
#fname = "D:/Projects/IdentificationOfTSG-OG/cTaG_2/data/driver genes/COSMIC/cmc_export.tsv"
#data_cmc = pd.read_csv(fname, sep="\t", header=0, low_memory=False)
