# Data Preparation

---

We retrieve the data from the Research Data Storage, a network drive that is mapped on the local computer as the `Z:` drive.

In [None]:
!python -m pip install pandas tqdm --quiet

We use the `TCRseq_filenames.csv` file to locate the desired files. We are looking for the following:
- Lung Cancer Pre-Surgery
- Blood Cancer Pre-Surgery

In [None]:
import pandas as pd

# Load the filename index that the rest of the notebook selects from.
files = pd.read_csv("data/TCRseq_filenames.csv")

# Keep the rows whose timepoint is "SU", split by organ code ("LN" vs "PBMC").
lungs_files = files.loc[files["organ"].eq("LN") & files["timepoint"].eq("SU")]
blood_files = files.loc[files["organ"].eq("PBMC") & files["timepoint"].eq("SU")]

# Preview the first rows of the full index as the cell output.
files.head()

We then copy the selected files from the data storage to the local drive.

In [None]:
import shutil, os
from tqdm.notebook import tqdm

# Local folder that receives the copied files; `relocate` creates one
# sub-folder per suffix underneath it.
destination_address = "data/archive/"
# Root folder on the Z: drive that `relocate` reads from; it appends
# "<LTX_ID>/<filename>" to this prefix to build each source path.
original_address = "Z:/TRACERx_TCRseq_Data_20221015/DATA/TSV/"

def relocate(files_df, suffix):
    """Copy the files listed in ``files_df`` into a local sub-folder.

    Each source path is built as ``original_address + LTX_ID + "/" + filename``
    and copied (with metadata, via ``shutil.copy2``) into
    ``destination_address/<suffix>/``.

    The function is safe to re-run: the destination folder may already exist,
    and files that are already present are skipped, so an interrupted run is
    resumed instead of being silently ignored.

    Parameters
    ----------
    files_df : pandas.DataFrame
        Must provide the columns ``"LTX_ID"`` and ``"filename"``.
    suffix : str
        Name of the sub-folder under ``destination_address``; its title-cased
        form is used as the progress-bar label.

    Raises
    ------
    OSError
        If a source file cannot be read or the destination cannot be written.
    """
    target_dir = os.path.join(destination_address, suffix)
    os.makedirs(target_dir, exist_ok = True)

    for ltxid, filename in tqdm(files_df[["LTX_ID", "filename"]].values.tolist(), desc = suffix.title()):
        target = os.path.join(target_dir, os.path.basename(filename))
        if os.path.exists(target):
            # Already copied by a previous run.
            continue
        source = original_address + ltxid + "/" + filename
        # Copy under a temporary name and rename afterwards, so that a copy
        # interrupted half-way is never mistaken for a finished file.
        partial = target + ".part"
        shutil.copy2(source, partial)
        os.replace(partial, target)

# Copy each selection into its own sub-folder of the local archive.
for selection, folder in ((lungs_files, "lung"), (blood_files, "pbmc")):
    relocate(selection, folder)

In [None]:
import gzip, glob

def decompress(extract_from, extract_to):
    """Gunzip every ``*.gz`` file in ``extract_from`` into ``extract_to``.

    Each output file keeps the archive's name without the trailing ``.gz``.
    Data is streamed as bytes, so the decompressed file is byte-for-byte what
    the archive contains (no text decoding or newline translation) and large
    files are never held in memory in full.

    The function is safe to re-run: ``extract_to`` may already exist, and
    outputs that are already present are skipped.

    Parameters
    ----------
    extract_from : str
        Folder containing the ``.gz`` archives.
    extract_to : str
        Folder that receives the decompressed files; created if missing.

    Raises
    ------
    OSError
        If an archive cannot be read (including invalid gzip data) or an
        output file cannot be written.
    """
    os.makedirs(extract_to, exist_ok = True)

    for archive in tqdm(sorted(glob.glob(os.path.join(extract_from, "*.gz")))):
        # Strip only the trailing ".gz"; basename copes with both "/" and "\\".
        target = os.path.join(extract_to, os.path.basename(archive)[:-len(".gz")])
        if os.path.exists(target):
            # Already extracted by a previous run.
            continue
        # Write under a temporary name and rename afterwards, so that an
        # interrupted extraction never leaves a truncated file in place.
        partial = target + ".part"
        with gzip.open(archive, "rb") as compressed, open(partial, "wb") as extracted:
            shutil.copyfileobj(compressed, extracted)
        os.replace(partial, target)

# Unpack each archive folder into the matching folder of plain files.
for archive_dir, files_dir in (
    ("data/archive/lung/", "data/files/lung/"),
    ("data/archive/pbmc/", "data/files/pbmc/"),
):
    decompress(archive_dir, files_dir)

In [None]:
def combine(org, dest, out_filename):
    """Concatenate every ``*.tsv`` file in ``org`` into one CSV file.

    Each file is read as a tab-separated table. Its ``sequence_id`` values are
    prefixed with ``"<file name without .tsv>_"`` and then used as the index.
    The stacked result is written to ``dest/<out_filename>.csv``.

    The work is skipped only when that specific output file already exists,
    so several outputs can share one ``dest`` folder. Files are processed in
    sorted order to make the output deterministic.

    Parameters
    ----------
    org : str
        Folder containing the ``.tsv`` files; each must have a
        ``sequence_id`` column.
    dest : str
        Folder that receives the combined CSV; created if missing.
    out_filename : str
        Output file name without extension; also the progress-bar label.

    Raises
    ------
    KeyError
        If an input file has no ``sequence_id`` column.
    OSError
        If an input cannot be read or the output cannot be written.
    """
    out_path = os.path.join(dest, out_filename + ".csv")
    if os.path.exists(out_path):
        # This output was already produced by a previous run.
        return
    os.makedirs(dest, exist_ok = True)

    frames = []
    for path in tqdm(sorted(glob.glob(os.path.join(org, "*.tsv"))), desc = out_filename):
        # basename copes with both "/" and "\\", so the prefix is the bare
        # file name on every platform.
        sample = os.path.basename(path)[:-len(".tsv")]
        frame = pd.read_csv(path, delimiter = "\t")
        frame["sequence_id"] = sample + "_" + frame["sequence_id"].astype(str)
        frames.append(frame.set_index("sequence_id"))

    # Concatenate once at the end; growing a frame inside the loop is quadratic.
    combined = pd.concat(frames, axis = 0) if frames else pd.DataFrame()

    # Write under a temporary name and rename afterwards, so that an
    # interrupted write is not mistaken for a finished output on the next run.
    partial = out_path + ".part"
    combined.to_csv(partial)
    os.replace(partial, out_path)

# Merge each folder of per-file tables into a single CSV under data/combined/.
for files_dir, label in (("data/files/lung/", "lung"), ("data/files/pbmc/", "pbmc")):
    combine(files_dir, "data/combined/", label)