#### Zhang Data Preparation
This notebook prepares the Zhang dataset for use in all downstream analyses.

#### Set Environment

In [None]:
import pandas as pd

#### Load Data

In [None]:
# Load the raw Zhang TCR table. The real column names sit in the first data
# row of the frame as read, so they are promoted to the header below.
zhang_data = pd.read_csv("~/CRC_Project/single_cell_data/Zhang_TCR_CRC_with_TissueType.csv")
# make row 0 the column names
zhang_data.columns = zhang_data.iloc[0]
# Remove row 0 (now redundant with the header) and keep positions 1..10805.
# NOTE(review): the 10806 upper bound is hard-coded; any rows past it are
# discarded -- confirm this matches the intended extent of the table.
# .copy() makes the result an independent frame, so adding columns to it later
# is not a write to a slice of the original (avoids SettingWithCopyWarning).
zhang_data = zhang_data.iloc[1:10806].copy()

#### Prepare Data

In [None]:
# Add a "tissue" column derived from the "Cell typea" column.
# Cell type -> tissue:
#   PTC, PTH, PTR, PTY, PP7 -> PBMC
#   TTC, TTH, TTR, TTY, TP7 -> PT
#   NTC, NTH, NTR, NTY, NP7 -> PN
TISSUE_BY_CELL_TYPE = {
    "PTC": "PBMC",
    "PTH": "PBMC",
    "PTR": "PBMC",
    "PTY": "PBMC",
    "PP7": "PBMC",
    "TTC": "PT",
    "TTH": "PT",
    "TTR": "PT",
    "TTY": "PT",
    "TP7": "PT",
    "NTC": "PN",
    "NTH": "PN",
    "NTR": "PN",
    "NTY": "PN",
    "NP7": "PN",
}

# Values missing from the table (including NaN) map to NaN.
tissue = zhang_data["Cell typea"].map(TISSUE_BY_CELL_TYPE)
unmapped = tissue.isna()
if unmapped.any():
    # Fail here and name the offending values, rather than building a shorter
    # list and hitting an opaque length-mismatch error at assignment.
    unknown = zhang_data["Cell typea"][unmapped].unique().tolist()
    raise ValueError(f"Unrecognised value(s) in 'Cell typea': {unknown}")

tissue_list = tissue.tolist()
zhang_data["tissue"] = tissue_list
zhang_data
        

In [None]:
# Reshape to one row per chain: each source row is emitted twice, once with its
# 'CDR3 (Alpha1)' value under cdr3_aa (chain = 'TRA') and once with its
# 'CDR3 (Beta1)' value under cdr3_aa (chain = 'TRB'). Only the renamed column
# changes, so each copy still carries the other chain's original column.
zhang_alpha = zhang_data.rename(columns={'CDR3 (Alpha1)': 'cdr3_aa'}).assign(chain='TRA')
zhang_beta = zhang_data.rename(columns={'CDR3 (Beta1)': 'cdr3_aa'}).assign(chain='TRB')

# Stack the two copies with a fresh 0..n-1 index, then discard rows that have
# no sequence in cdr3_aa.
zhang_data_combined = (
    pd.concat([zhang_alpha, zhang_beta], ignore_index=True)
    .dropna(subset=['cdr3_aa'])
)
zhang_data_combined

In [None]:
# Drop rows whose cdr3_aa starts with the literal text "Couldn't"
# (presumably a placeholder message rather than a real sequence -- confirm
# against the source table).
starts_with_couldnt = zhang_data_combined['cdr3_aa'].str.startswith("Couldn't")
zhang_data_combined = zhang_data_combined.loc[~starts_with_couldnt]

In [None]:
# Per-sequence summary: number of distinct 'Patient' values for each cdr3_aa.
# NOTE(review): grouping is by cdr3_aa only, so identical strings from TRA and
# TRB rows are counted together -- confirm this is intended.
patients_per_sequence = zhang_data_combined.groupby('cdr3_aa')['Patient'].nunique()
cdr3_aa_publicity = patients_per_sequence.rename('unique_patients').reset_index()

# publicity flag: 1 when the sequence occurs in more than one patient, else 0
cdr3_aa_publicity['publicity'] = cdr3_aa_publicity['unique_patients'].gt(1).astype(int)

# Attach both summary columns to every row of the combined table
# (left join on cdr3_aa keeps all existing rows).
publicity_columns = cdr3_aa_publicity[['cdr3_aa', 'publicity', 'unique_patients']]
zhang_data_combined = zhang_data_combined.merge(publicity_columns, on='cdr3_aa', how='left')

#### Export

In [None]:
# Write the cleaned table to CSV; index=False leaves the row index out of the file.
cleaned_csv_path = "~/CRC_Project/single_cell_data/Zhang_TCR_CRC_with_TissueType_cleaned.csv"
zhang_data_combined.to_csv(cleaned_csv_path, index=False)