In [None]:
import re
import pandas as pd
from collections import defaultdict

## Functions

In [None]:
def find_sharing_level(tcr,all_tcr_df):
    """Return the 'Number of Common Donors' entry stored for one TCR.

    Parameters
    ----------
    tcr : row label to look up in the index of ``all_tcr_df``.
    all_tcr_df : pandas DataFrame that has a 'Number of Common Donors' column.

    Returns
    -------
    The value found at row ``tcr`` of the 'Number of Common Donors' column.
    pandas raises ``KeyError`` when the label is not in the index.
    """
    common_donor_counts = all_tcr_df['Number of Common Donors']
    return common_donor_counts.loc[tcr]

In [None]:
def classify_sharing(sharing_num):
    """Map a sharing count to its label.

    Returns "Private" when ``sharing_num`` equals 1 and "Public" for every
    other value (the comparison is a plain ``== 1``, so 0 also maps to
    "Public").
    """
    return "Private" if sharing_num == 1 else "Public"

In [None]:
def clean_tcr_sequences(tcr, alleles=("*01", "*02")):
    """Strip allele suffixes from a TCR string.

    Parameters
    ----------
    tcr : str
        TCR identifier that may contain allele markers such as "*01".
    alleles : iterable of str, optional
        Substrings to delete, applied in the given order. Defaults to
        ("*01", "*02"), the two markers this function has always removed,
        so existing callers see unchanged results.

    Returns
    -------
    str
        ``tcr`` with every occurrence of each marker removed.
    """
    #str.replace leaves the string untouched when the marker is absent,
    #so no membership check is needed before each call
    for allele in alleles:
        tcr = tcr.replace(allele, "")
    return tcr

In [None]:
def classify_antigen_tcr(tcr,flu_sequences,cmv_sequences):
    """Label a TCR by which of the two sequence collections contain it.

    Parameters
    ----------
    tcr : value tested for membership.
    flu_sequences : container checked with ``in`` for the FLU-M1 label.
    cmv_sequences : container checked with ``in`` for the CMV-pp65 label.

    Returns
    -------
    str
        'FLU-M1', 'CMV-pp65', 'FLU-M1 and CMV-pp65', or
        'No Antigen Specificity' when ``tcr`` is in neither container.
    """
    in_flu = tcr in flu_sequences
    in_cmv = tcr in cmv_sequences

    if in_flu and in_cmv:
        return 'FLU-M1 and CMV-pp65'
    if in_flu:
        return 'FLU-M1'
    if in_cmv:
        return 'CMV-pp65'
    return 'No Antigen Specificity'

## Read Files

Use the CSV file that is not based on donor visits to classify each TCR as private or public; the donor-visit CSV is then annotated with that classification.

In [None]:
#Table from the folder without the "_visit" suffix; the first CSV column becomes the row index
input_file_all = pd.read_csv(
    './C_Result_Files/CD8_redo_no_umi/CD8_Donors_all_Alpha_Common_TCRs.csv',
    header = 0,
    index_col = 0,
)
#Same file name read from the "_visit" folder, parsed the same way
input_file_visit = pd.read_csv(
    './C_Result_Files/CD8_redo_no_umi_visit/CD8_Donors_all_Alpha_Common_TCRs.csv',
    header = 0,
    index_col = 0,
)

In [None]:
#Preview the first five rows of the table read from the non-visit folder
input_file_all.head(n=5)

In [None]:
#Preview the first five rows of the table read from the "_visit" folder
input_file_visit.head(n=5)

In [None]:
#Print (rows, columns) of the non-visit table
print(str(input_file_all.shape))

In [None]:
#Print (rows, columns) of the "_visit" table
print(str(input_file_visit.shape))

In [None]:
#Copy each row's own index label into a 'TCR' column.
#The labels are taken from input_file_visit itself: a list is assigned positionally,
#so borrowing another table's index would mislabel rows whenever the two files
#differ in row order, and raise when they differ in length.
input_file_visit['TCR'] = input_file_visit.index.values.tolist()

In [None]:
#Look up every TCR's 'Number of Common Donors' in input_file_all, then
#turn that count into a "Private"/"Public" label
input_file_visit['Number of Common Donors'] = input_file_visit['TCR'].apply(find_sharing_level, args=(input_file_all,))
input_file_visit['Sharing Level'] = input_file_visit['Number of Common Donors'].apply(classify_sharing)

In [None]:
#Preview the first five rows after the annotation columns were added
input_file_visit.head(n=5)

In [None]:
#Column totals (described as total UMI values per donor), keyed by column name.
#The last four columns are skipped -- presumably the non-donor annotation
#columns; TODO confirm the count against the CSV layout.
donor_tot_umi = defaultdict(int)
for donor in input_file_visit.columns[:-4]:
    donor_tot_umi[donor] = input_file_visit.loc[:, donor].sum()

In [None]:
#Percentage of each column's total that comes from Private vs. Public TCRs.
#Rows: every column of input_file_visit except the last four; columns: sharing level.
sharing_per = pd.DataFrame(columns = ["Private","Public"],
                           index = input_file_visit.columns.values[:-4])
for sharing_level in sorted(input_file_visit['Sharing Level'].unique()):
    #Keep only the TCR rows carrying the current sharing label
    fil_input_file_visit = input_file_visit.loc[input_file_visit['Sharing Level'] == sharing_level]
    for donor in input_file_visit.columns[:-4]:
        #100 * (sum over the filtered rows) / (sum over all rows) for this column
        sharing_per.loc[donor,sharing_level] = 100.0*fil_input_file_visit[donor].sum()/donor_tot_umi[donor]

In [None]:
#Show the complete sharing-percentage table as the cell output
sharing_per

In [None]:
#Write the sharing-percentage table (index included) to the results folder
sharing_per.to_csv(path_or_buf='./C_Result_Files/Sharing_Percentages/CD8_Alpha_Sharing_Percentages.csv')