# Analysis Part II - Merging the data of all experiments at the same time

In [None]:
%load_ext autoreload
%matplotlib inline

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore')
import os
import scanpy as sc
import scirpy as ir
import anndata as ann
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from matplotlib import rcParams
from mudata import MuData
import mudata

import tarfile
import warnings
from glob import glob

import anndata
import matplotlib.pyplot as plt
import muon as mu
import pandas as pd
import scanpy as sc
import scirpy as ir

%autoreload 2
import sys
sys.path.append('..')
import utility.annotation as utils_annotation
import utility.representation as utils_representation
import utility.visualisation as utils_vis

In [None]:
# Global plotting configuration for the whole notebook.
sc.settings.verbosity = 3
# All scanpy figure parameters go through ONE call: set_figure_params re-applies
# its own defaults (dpi=80) for every argument that is not passed, so a second
# call without `dpi` would silently undo a dpi=150 set by an earlier call.
sc.set_figure_params(dpi=150, vector_friendly=True, color_map='viridis', transparent=True)
sb.set_style('whitegrid')

# Name of the seaborn colormap kept for later plots.
colormap = 'flare'

## Get input data from the different samples

In [None]:
# Read the preprocessed MuData files data1.h5mu ... data9.h5mu, one per list entry,
# in numeric order.
adatas = [
    mu.read(f'/media/agschober/HDD12/3_scRNA-Seq_Sina/1_Preprocessing/data{i}.h5mu')
    for i in range(1, 10)
]

In [None]:
# Merge all loaded samples into one MuData object with a 'gex' and an 'airr' modality.
# The modalities are collected from *every* entry of `adatas`, so the merge follows
# the number of files that were loaded instead of a hand-written list of indices.
gex_parts = [sample_mdata["gex"] for sample_mdata in adatas]
airr_parts = [sample_mdata["airr"] for sample_mdata in adatas]

# concatenate GEX (first object is the receiver, the remaining ones are appended in order)
# NOTE(review): AnnData.concatenate is deprecated in favour of anndata.concat; it is kept
# here because switching may change obs-name suffixes / batch columns -- confirm before migrating.
gexdata = gex_parts[0].concatenate(*gex_parts[1:])

# concatenate AIRR in the same sample order as GEX
airrdata = airr_parts[0].concatenate(*airr_parts[1:])

# fuse AIRR and GEX
mdata = mu.MuData({'gex': gexdata, 'airr': airrdata})

In [None]:
# Look at occurrence of samples: number of cells per pool, shown as a one-column table
mdata["gex"].obs['pool'].value_counts().to_frame()

In [None]:
# Bar chart of the number of cells per pool.
# The counts are computed once and read straight from the value_counts Series
# (index = pool, values = cells). This avoids relying on the column being called
# 'count' after reset_index(), which is only the case for pandas >= 2.
pool_counts = mdata["gex"].obs['pool'].value_counts()
samples = pool_counts.index.tolist()
counts = pool_counts.tolist()

fig, ax = plt.subplots(figsize=(16,6))
ax.bar(samples, counts, label=samples)
ax.set_ylabel('cells')
ax.set_xlabel('sample')
ax.set_title('Cells per Sample')
# pool names are long, so draw the x tick labels vertically
ax.tick_params(axis='x', labelrotation=90)
plt.show()

## Pool level annotation

In [None]:
# Names of the epitope columns expected in the GEX obs table
epitopes = [
    'NS4B214', 'NS2B117', 'NS3293', 'NS3286', 'NS324', 'NS5672', 'NS2A97', 'NS4B165',
    'COV', 'HHV', 'FLU', 'EBV1', 'EBV2',
]
# Keep the list itself in .uns and the per-cell epitope columns as a block in .obsm
mdata["gex"].uns['epitopes'] = epitopes
mdata["gex"].obsm['epitopes'] = mdata["gex"].obs.loc[:, epitopes]

In [None]:
# Mapping pool name -> list of epitopes for that pool.
# Only three distinct epitope lists occur, so they are written once and the
# 72 pool entries (9 name prefixes x slots 1..8) are generated from them.
_ns_panel = ['NS4B214', 'NS2B117', 'NS3293', 'NS3286', 'NS324', 'NS5672', 'NS2A97', 'NS4B165']
_full_panel = _ns_panel + ['COV', 'HHV', 'FLU', 'EBV1', 'EBV2']
_single_panel = ['NS4B214']

# Pools that carry the full 13-epitope list instead of their prefix default
_full_panel_pools = {
    'sample3sample1', 'sample3sample3', 'sample3sample6', 'sample3sample7',
    'sample4sample1', 'sample4sample3',
}

# (pool-name prefix, default epitope list), in the order the keys must appear
_panel_by_prefix = [
    ('sample1', _ns_panel),
    ('sample2', _ns_panel),
    ('sample3', _ns_panel),
    ('sample4', _ns_panel),
    ('sample5sample5', _single_panel),
    ('sample6', _single_panel),
    ('sample7', _single_panel),
    ('sample8', _single_panel),
    ('sample9', _single_panel),
]

pool_2_epitope = {}
for _prefix, _default_panel in _panel_by_prefix:
    for _slot in range(1, 9):
        _pool = f'{_prefix}sample{_slot}'
        _panel = _full_panel if _pool in _full_panel_pools else _default_panel
        # every pool gets its own list object, so editing one entry never affects another
        pool_2_epitope[_pool] = list(_panel)

In [None]:
# Donor and time label of every pool.
# Rows are grouped by pool-name prefix; the position inside each list (1..8)
# becomes the trailing 'sample<k>' part of the pool name.
_annotation_rows = {
    'sample1': [('D1', 'd7'), ('D1', 'd11'), ('D1', 'd14'), ('D1', 'd21'),
                ('D1', 'd28'), ('D2', 'd7'), ('D2', 'd11'), ('D2', 'd14')],
    'sample2': [('D5', 'd7'), ('D5', 'd11'), ('D5', 'd14'), ('D5', 'd21'),
                ('D5', 'd28'), ('B46', 'd49'), ('B46', 'd21'), ('B46', 'd28')],
    'sample3': [('C12', 'dx'), ('C5', 'dx'), ('B11', 'd14'), ('B11', 'd365'),
                ('B19', 'd14'), ('B19', 'd365'), ('B34', 'd14'), ('B34', 'd365')],
    'sample4': [('B15', 'd14'), ('B15', 'd365'), ('B20', 'd14'), ('B20', 'd365'),
                ('B7', 'd14'), ('B7', 'd365'), ('B46', 'd90'), ('C11', 'dx')],
    'sample5sample5': [('D2', 'd21'), ('D2', 'd28'), ('D2', 'd49'), ('D2', 'd90'),
                       ('B18', 'd14'), ('B18', 'd365'), ('B13', 'd14'), ('B13', 'd365')],
    'sample6': [('D5', 'd7'), ('D5', 'd11'), ('D5', 'd14'), ('D5', 'd21'),
                ('D5', 'd28'), ('D5', 'd49'), ('D5', 'd90'), ('B42', 'd11')],
    'sample7': [('D1', 'd14'), ('D1', 'd49'), ('D1', 'd90'), ('B21', 'd14'),
                ('B21', 'd365'), ('B7', 'd14'), ('B7', 'd365'), ('B35', 'd11')],
    'sample8': [('B19', 'd14'), ('B19', 'd365'), ('B40', 'd14'), ('B40', 'd365'),
                ('B46', 'd14'), ('B46', 'd21'), ('B27', 'd21'), ('B41', 'd90')],
    'sample9': [('C5', 'dx'), ('C7', 'dx'), ('C11', 'dx'), ('C12', 'dx'),
                ('C15', 'dx'), ('A21', 'd0'), ('PH', 'd0'), ('A7', 'd0')],
}

# One row per pool (index = pool name) with the columns 'donor' and 'time'
pool_annotation = pd.DataFrame.from_dict(
    {
        f'{prefix}sample{slot}': [donor, time_label]
        for prefix, pairs in _annotation_rows.items()
        for slot, (donor, time_label) in enumerate(pairs, start=1)
    },
    orient='index',
    columns=['donor', 'time'],
)
pool_annotation

In [None]:
# Give every cell the donor / time label of its pool: each annotation column is a
# pool-indexed lookup that is applied to the per-cell 'pool' column.
for col, pool_lookup in pool_annotation.items():
    mdata["gex"].obs[col] = mdata["gex"].obs['pool'].map(pool_lookup)

In [None]:
# Bar chart of the number of cells per donor.
# The counts are computed once and read straight from the value_counts Series
# (index = donor, values = cells). This avoids relying on the column being called
# 'count' after reset_index(), which is only the case for pandas >= 2.
donor_counts = mdata["gex"].obs['donor'].value_counts()
samples = donor_counts.index.tolist()
counts = donor_counts.tolist()

fig, ax = plt.subplots(figsize=(16,6))
ax.bar(samples, counts, label=samples)
ax.set_ylabel('cells')
ax.set_xlabel('donor')
ax.set_title('Cells per Donor')
# draw the x tick labels vertically so they do not overlap
ax.tick_params(axis='x', labelrotation=90)
plt.show()

## Cell Filtering based on Phenotypes

### Initial UMAP and leiden

In [None]:
# Normalise the total counts per cell, then apply log(1 + x); both steps modify
# the GEX modality in place.
sc.pp.normalize_total(mdata['gex'])
sc.pp.log1p(mdata['gex'])

In [None]:
# Batch-effect correction: run ComBat on the GEX modality, using obs['sample'] as the batch key.
# NOTE(review): no `inplace=False` is passed, so this presumably overwrites the normalised,
# log-transformed matrix of mdata['gex'] -- confirm this is intended.
sc.pp.combat(mdata['gex'], key='sample')

In [None]:
# Compute the UMAP for the GEX modality with the project helper.
# NOTE(review): presumably restricts to the 5000 most variable genes and leaves out
# TCR genes -- confirm in utility.representation.
utils_representation.calculate_umap(mdata["gex"], n_high_var=5000, remove_tcr_genes=True)

In [None]:
# Leiden clustering of the GEX modality with the project helper (resolution 3.0, same
# gene-selection arguments as the UMAP call); the plots below read the result from obs['leiden'].
# NOTE(review): the cluster ids removed later ('33', '35') depend on this clustering being
# reproducible -- no seed is visible here, verify inside the helper.
utils_representation.calculate_leiden(mdata["gex"], resolution=3.0, n_high_var=5000, remove_tcr_genes=True)

In [None]:
# Draw the embedding three times: clusters, donor/time and sample/pool
for colour_keys in ('leiden', ['donor', 'time'], ['sample', 'pool']):
    sc.pl.umap(mdata["gex"], color=colour_keys)

In [None]:
# Project helper that draws separate UMAPs split by the 'leiden' column.
# NOTE(review): 7 and 6 are presumably the rows/columns of the panel grid and
# do_int_sort presumably orders the cluster labels numerically -- confirm in utility.visualisation.
utils_vis.separate_umaps_by_condition(mdata["gex"], 'leiden', 7, 6, do_int_sort=True)

In [None]:
# Project helper from utility.annotation, applied to the GEX modality.
# NOTE(review): presumably adds a gene-signature score to the obs table -- confirm in the helper.
utils_annotation.add_seumois_score(mdata["gex"])

In [None]:
# Project helper from utility.annotation, applied to the GEX modality.
# NOTE(review): presumably adds the remaining signature scores to the obs table -- confirm in the helper.
utils_annotation.add_all_scores(mdata["gex"])

In [None]:
# Project helper from utility.visualisation that plots marker genes for the GEX modality;
# the gene list is defined inside the helper, not in this notebook.
utils_vis.plot_marker_genes(mdata["gex"])

In [None]:
# UMAP coloured by the 'chain_pairing' column of the GEX obs table.
sc.pl.umap(mdata["gex"], color=['chain_pairing'])

In [None]:
# Normalised abundance of the 'chain_pairing' categories within each leiden cluster.
ir.pl.group_abundance(mdata["gex"], groupby='leiden', target_col='chain_pairing', 
                      normalize=True, fig_kws={'figsize': (12, 5)})

In [None]:
# Normalised abundance of the 'chain_pairing' categories within each pool.
ir.pl.group_abundance(mdata["gex"], groupby='pool', target_col='chain_pairing', 
                      normalize=True, fig_kws={'figsize': (12, 5)})

In [None]:
# Drop every cell that belongs to one of the listed leiden clusters and report
# the number of cells before and after.
clusters_remove = ['33', '35']
print('Before Filtering Cluster: ', len(mdata))
keep_cell = ~mdata["gex"].obs['leiden'].isin(clusters_remove)
mdata = mdata[keep_cell]
print('After Filtering Cluster: ', len(mdata))

### Filter Cells without IR

In [None]:
# Keep only cells whose chain_pairing is not 'no_IR'; the subset is copied so that
# later cells work on a real object instead of a view.
print(f'Amount of cells: {len(mdata)}')
has_receptor = mdata["gex"].obs['chain_pairing'] != 'no_IR'
mdata = mdata[has_receptor].copy()
print(f'Amount of cells with IR: {len(mdata)}')

## Clonotype definition over merged data

In [None]:
# Index the receptor chains and run chain QC on the merged data
ir.pp.index_chains(mdata)
ir.tl.chain_qc(mdata)

# Distance computation and clonotype definition must use the same metric/sequence
# pair, so it is written once and passed to both calls.
identity_aa = {'metric': 'identity', 'sequence': 'aa'}
ir.pp.ir_dist(mdata, **identity_aa)
ir.tl.define_clonotype_clusters(
    mdata,
    receptor_arms='all',
    dual_ir='any',
    key_added='clone_id',
    **identity_aa,
)

In [None]:
# Copy selected AIRR fields into plain obs columns named 'IR_<chain>_<field>'.
# The dict lists, per AIRR field, the chains it is extracted for (d_call only exists
# for the VDJ chains); the order reproduces the column order of the obs table.
airr_fields = {
    "junction_aa": ("VJ_1", "VJ_2", "VDJ_1", "VDJ_2"),
    "v_call": ("VJ_1", "VJ_2", "VDJ_1", "VDJ_2"),
    "j_call": ("VJ_1", "VJ_2", "VDJ_1", "VDJ_2"),
    "d_call": ("VDJ_1", "VDJ_2"),
}
for field, chains in airr_fields.items():
    for chain in chains:
        mdata["airr"].obs[f"IR_{chain}_{field}"] = ir.get.airr(mdata, field, [chain])

In [None]:
# A cell without a primary VJ or without a primary VDJ junction gets no clone id
for junction_col in ('IR_VJ_1_junction_aa', 'IR_VDJ_1_junction_aa'):
    chain_missing = mdata["airr"].obs[junction_col].isna()
    mdata["airr"].obs.loc[chain_missing, 'clone_id'] = np.nan
mdata["airr"].obs['clone_id'] = mdata["airr"].obs['clone_id'].astype(float)

# Clonal expansion twice: clipped at 3, and clipped at the number of cells
for size_key, clip in (('clone_size_clipped', 3), ('clone_size', len(mdata))):
    ir.tl.clonal_expansion(mdata["airr"], target_col='clone_id', key_added=size_key, clip_at=clip)

In [None]:
# Turn the clone-size labels ('<= k' for a size of k, '> k' for the open top bin)
# into numbers: '<= k' -> k and '> k' -> k + 1.
# The labels are parsed in one vectorised pass instead of one Series.replace call per
# possible size, and without a hard-coded cell count, so this keeps working when the
# number of cells changes.
clone_size_labels = mdata["airr"].obs['clone_size'].astype(str)
label_parts = clone_size_labels.str.extract(r'^\s*(<=|>)\s*(\d+)\s*$')
clone_size_values = pd.to_numeric(label_parts[1], errors='coerce')
# the open-ended top bin '> k' is stored as k + 1
clone_size_values = clone_size_values.where(label_parts[0] != '>', clone_size_values + 1)
# labels that are already plain numbers are kept; anything else (e.g. 'nan') becomes NaN
clone_size_values = clone_size_values.fillna(pd.to_numeric(clone_size_labels, errors='coerce'))
mdata["airr"].obs['clone_size'] = clone_size_values

In [None]:
# Store the clone size as float, then mirror both clone-size columns from the AIRR
# modality into the GEX obs table so they can be plotted on the GEX UMAP.
mdata["airr"].obs['clone_size'] = mdata["airr"].obs['clone_size'].astype(float)
for size_key in ('clone_size_clipped', 'clone_size'):
    mdata["gex"].obs[size_key] = mdata["airr"].obs[size_key]

In [None]:
# UMAP coloured by the clipped and by the numeric clone size copied into the GEX obs table.
sc.pl.umap(mdata["gex"], color=['clone_size_clipped', 'clone_size'])

## Extract Clonotype Information

In [None]:
# Call the project helper once per (AIRR field, target name) pair. The target names
# 'clonotype_sequence', 'v_genes' and 'j_genes' are the columns read by assign_mait.
clonotype_targets = [
    ('junction_aa', 'clonotype_sequence'),
    ('v_call', 'v_genes'),
    ('j_call', 'j_genes'),
]
for airr_field, target_name in clonotype_targets:
    utils_annotation.extract_clonotype_information(mdata["airr"], airr_field, target_name)

## Assign MAIT

In [None]:
def assign_mait(row, gene_combination=True, cdr3=True):
    """Flag a cell as MAIT-like from its receptor annotation.

    Parameters
    ----------
    row
        Mapping-like record (e.g. one obs row) providing 'j_genes', 'v_genes'
        and 'clonotype_sequence'; values are compared via their string form.
    gene_combination
        If truthy, flag rows whose genes contain 'TRAJ33' and 'TRAV1-2'
        together with 'TRBV20-1' or 'TRBV6'.
    cdr3
        If truthy, flag rows whose clonotype sequence contains 'CAVMDSSYKLIF'.

    Returns
    -------
    str
        The string 'True' if any enabled criterion matches, otherwise 'False'.
    """
    def _contains(column, motif):
        # substring test on the string form, so missing values simply do not match
        return motif in str(row[column])

    gene_hit = (
        gene_combination
        and _contains('j_genes', 'TRAJ33')
        and _contains('v_genes', 'TRAV1-2')
        and (_contains('v_genes', 'TRBV20-1') or _contains('v_genes', 'TRBV6'))
    )
    if gene_hit:
        return 'True'

    cdr3_hit = cdr3 and _contains('clonotype_sequence', 'CAVMDSSYKLIF')
    return 'True' if cdr3_hit else 'False'

In [None]:
# Evaluate assign_mait row by row on the AIRR obs table and store the flag on the GEX side
mait_calls = mdata["airr"].obs.apply(assign_mait, axis=1)
mdata["gex"].obs['has_mait'] = mait_calls

# Highlight only the cells flagged 'True', then show how many cells fall in each group
sc.pl.umap(mdata["gex"], color='has_mait', groups='True')
mdata["gex"].obs['has_mait'].value_counts()

## New UMAP with final set of cells

In [None]:
# Recompute embedding and clustering on the filtered cells; both helpers receive
# the same gene-selection arguments.
gene_selection = {'n_high_var': 5000, 'remove_tcr_genes': True}
utils_representation.calculate_umap(mdata["gex"], **gene_selection)
utils_representation.calculate_leiden(mdata["gex"], resolution=1, **gene_selection)

In [None]:
# Overview of the final embedding, two panels per row
overview_keys = ['leiden', 'sample', 'donor', 'pool', 'time']
sc.pl.umap(mdata["gex"], color=overview_keys, ncols=2, show=False)
plt.tight_layout()
plt.show()

In [None]:
# One UMAP per time point: all cells as an uncoloured background, the cells of that
# time point drawn on top and coloured by donor.
# Changes compared with the earlier version of this cell:
#   * `plt.figsize = (10, 10)` only set an attribute on the pyplot module and never
#     changed any figure, so it was dropped; the size comes from the seaborn rc below.
#   * the one-element `for ep in [...]` loop never used `ep` and was dropped.
#   * the title is set on the returned axes instead of on the implicit current axes.
sb.set(rc={'figure.figsize':(5,5)})
sb.set_style("whitegrid")
for timepoint in mdata['gex'].obs['time'].unique():
    # background layer: every cell, no colouring
    ax = sc.pl.umap(mdata["gex"], show=False, size=30)
    # foreground layer: only the cells of this time point, drawn into the same axes
    sc.pl.umap(mdata["gex"][mdata["gex"].obs['time'] == timepoint],
               color='donor', ax=ax, show=False, size=60, cmap='Spectral_r')
    ax.set_title(str(timepoint))
    plt.tight_layout()
    plt.show()

## Save merged data

In [None]:
# Write the merged, filtered and annotated MuData object to disk as an .h5mu file.
# NOTE(review): absolute, machine-specific path -- consider a configurable output directory.
mdata.write("/media/agschober/HDD12/3_scRNA-Seq_Sina/2_Merge_Data/data_ALL_normalised.h5mu")

In [None]:
# Print the versions of the loaded packages so the run can be reproduced later.
import session_info
session_info.show()