#### Prepare GSE164522 Data for Downstream Analysis
This notebook prepares the original TCR sequencing data for all downstream analysis tasks in the project.

#### Set Environment

In [None]:
import pandas as pd
import collections

#### Load Original Data

In [None]:
# Read the raw VDJ table from disk; the first CSV column becomes the row index.
T_cell_GSE164522 = pd.read_csv(
    '~/CODON_CVC/single_cell_data/GSE164522_colon_data/GSE164522_vdj_final.csv',
    index_col=0,
)
# Display the loaded frame as the cell output.
T_cell_GSE164522

In the original table, each row holds both the alpha and the beta sequence. Below we split every row in two so that each row contains a single sequence, matching the format of the other datasets.

In [None]:
# Map each chain-specific source column to the unified column name.
# The key order fixes the column order of the resulting frames.
alpha_to_unified = {
    'barcode': 'barcode',
    'Patient_a': 'patient',
    'cdr3_a': 'cdr3_aa',
    'cdr3_nt_a': 'cdr3_nt',
    'v_gene_a': 'v_gene',
    'j_gene_a': 'j_gene',
    'TISSUE_a': 'tissue',
}
beta_to_unified = {
    'barcode': 'barcode',
    'Patient_b': 'patient',
    'cdr3_b': 'cdr3_aa',
    'cdr3_nt_b': 'cdr3_nt',
    'v_gene_b': 'v_gene',
    'j_gene_b': 'j_gene',
    'TISSUE_b': 'tissue',
}

# One frame per suffix ('_a' / '_b'), each selected and renamed to the shared schema.
T_cell_GSE164522_a = T_cell_GSE164522[list(alpha_to_unified)].rename(columns=alpha_to_unified)
T_cell_GSE164522_b = T_cell_GSE164522[list(beta_to_unified)].rename(columns=beta_to_unified)

# Stack the '_a' rows on top of the '_b' rows and renumber the index from 0.
T_cell_GSE164522_subset = pd.concat(
    [T_cell_GSE164522_a, T_cell_GSE164522_b],
    ignore_index=True,
)

# Collapse the numbered tissue labels: MN1/MN2 -> MN and MT1/MT2 -> MT.
tissue_relabel = {'MN1': 'MN', 'MN2': 'MN', 'MT1': 'MT', 'MT2': 'MT'}
T_cell_GSE164522_subset['tissue'] = T_cell_GSE164522_subset['tissue'].replace(tissue_relabel)

In [None]:
# Remove rows that are exact duplicates across all columns, keeping the first occurrence.
T_cell_GSE164522_subset = T_cell_GSE164522_subset.drop_duplicates(keep='first')

# Count the remaining rows per tissue label and show the tally as the cell output.
tissue_counts = collections.Counter(T_cell_GSE164522_subset['tissue'])
tissue_counts

##### Subset data for specific tissues

In [None]:
# Keep only the rows whose tissue label is 'PBMC' or 'PT'.
# NOTE(review): 'MT' rows are not selected by this filter — confirm that is intended.
is_pbmc_or_pt = T_cell_GSE164522_subset['tissue'].isin(['PBMC', 'PT'])
T_cell_GSE164522_subset_PBMC_Tumor = T_cell_GSE164522_subset.loc[is_pbmc_or_pt]

# Display the filtered frame as the cell output.
T_cell_GSE164522_subset_PBMC_Tumor

#### Export Data

In [None]:
# Aliases for the two frames that get written to disk.
DATA_TO_EXPORT_SPECIFIC = T_cell_GSE164522_subset_PBMC_Tumor
DATA_TO_EXPORT_ALL = T_cell_GSE164522_subset

# Pair each frame with its destination CSV; the row index is written as the first column.
export_targets = [
    (DATA_TO_EXPORT_SPECIFIC, '~/CODON_CVC/single_cell_data/GSE164522_colon_data/GSE164522_subset_PBMC_PT.csv'),
    (DATA_TO_EXPORT_ALL, '~/CODON_CVC/single_cell_data/GSE164522_colon_data/GSE164522_all_labels.csv'),
]
for frame_to_write, csv_path in export_targets:
    frame_to_write.to_csv(csv_path)