In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scAnalysis as scrna
import scanpy as sc
import seaborn as sns
import scvelo as scv


import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings from the
# libraries used below are hidden as well — consider narrowing it.
warnings.filterwarnings("ignore")

# Bare expression in the middle of a cell: it is evaluated but not displayed
# (only the last expression of a cell is echoed).
scv.__version__
scv.settings.presenter_view = True  # set max width size for presenter view
scv.settings.set_figure_params('scvelo')  # for beautified visualization


In [None]:
# Cell line analysed in this notebook; every input/output path below is built from it.
cellline='MGH707'  # e.g. 'HCC4006'

# Derive the input path from `cellline` so the file that is read and the name
# used for the outputs can never drift apart.
results_file = 'final_anndatas/'+cellline+'.h5ad'

adata = sc.read(results_file)

# Optional export of the per-cell clone/cluster/timepoint table (kept disabled):
#df=scrna.andata2df(adata)
#df=df[['Barcode','cloneid','louvain','timepoint']]
#df.to_csv('to_josh/clone2louvainMGH707.csv')

In [None]:
def do_pca(dfin,features,Npca=10,number_genes=[],zscore=True):
    """Run a PCA on selected columns of `dfin` and return scores and loadings.

    Parameters
    ----------
    dfin : pandas.DataFrame
        One row per sample; must contain every column listed in `features`.
    features : sequence of column labels
        Numeric columns that are candidates for the PCA.
    Npca : int
        Number of principal components to compute.
    number_genes : int, or `[]` / `None`
        When an int, only the `number_genes` features with the highest
        coefficient of variation (population std / mean) are used.
        `[]` (the default) or `None` means "use all features".
        The default is never mutated.
    zscore : bool
        If True each selected feature is z-scored (ddof=0) before the PCA.

    Returns
    -------
    dfout : pandas.DataFrame
        Copy of `dfin` with columns 'PC1'..'PCn' added. Rows that had a
        missing value in any selected feature are excluded from the fit and
        get NaN in the PC columns.
    loads : pandas.DataFrame
        Loadings (components scaled by sqrt of the explained variance),
        indexed by the selected features, sorted by 'PC1' descending.
    """
    from sklearn.decomposition import PCA

    use_all = number_genes is None or (isinstance(number_genes, list) and not number_genes)
    if use_all:
        features2 = features
    else:
        number_genes = min(number_genes, len(features))
        # Rank ONLY the candidate features. Sorting every column of the frame
        # (as done previously) let non-feature columns be picked ahead of
        # features whose coefficient of variation was NaN or negative.
        block = dfin.loc[:, features]
        cv = block.std(axis=0, ddof=0) / block.mean(axis=0)
        # mergesort is stable, so ties keep the order given in `features`.
        ranked = cv.sort_values(ascending=False, kind='mergesort')
        features2 = ranked.index[:number_genes].tolist()

    # Compute the PCs on the rows that have a value for every selected feature.
    print('Computing PCs...')
    selected = dfin.loc[:, features2]
    complete = selected.notna().all(axis=1).to_numpy()
    X = selected.loc[complete]
    if zscore:
        X = (X - X.mean()) / X.std(ddof=0)
    # Constant features give 0/0 (NaN) after z-scoring; treat them as 0.
    X = X.replace([np.inf, -np.inf], np.nan).fillna(0)
    pca = PCA(n_components=Npca)
    PCs = pca.fit_transform(X)
    perc = 100*pca.explained_variance_ratio_.sum()
    print('Done!')

    print('We use '+str(Npca)+' components to explain '+str(perc)+'% of the variability')

    # Attach the scores positionally; rows excluded from the fit stay NaN
    # instead of raising a length-mismatch error.
    dfout = dfin.copy()
    pclist = ['PC'+str(i+1) for i in range(PCs.shape[1])]
    for i, name in enumerate(pclist):
        scores = np.full(len(dfin), np.nan)
        scores[complete] = PCs[:, i]
        dfout[name] = scores

    print(pca.explained_variance_ratio_)
    print(pca.explained_variance_ratio_.sum())

    loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
    loads = pd.DataFrame(loadings,index=features2,columns = pclist)
    loads = loads.sort_values('PC1',ascending=False)

    return dfout, loads

In [None]:
# Echo the object loaded by sc.read() so its summary is shown as the cell output.
adata

In [None]:
# Name of the `adata.obs` column holding the cluster labels; used as the
# grouping key by the differential-expression and bar-plot cells below.
clustering='louvain'

In [None]:
# Differential expression between clusters (t-test), then a dot plot of the
# top 4 genes per cluster.
sc.tl.rank_genes_groups(adata, clustering, method='t-test')
p = sc.pl.rank_genes_groups_dotplot(adata, n_genes=4)

from matplotlib import cm

# Take the cluster order from the dendrogram stored in `adata.uns` and spread
# it over the viridis colormap, so clusters that are adjacent in the dendrogram
# receive similar colours in the bar plots further down.
dendrogram_key = 'dendrogram_' + clustering
order = adata.uns[dendrogram_key]['categories_ordered']

colors = list(map(cm.viridis, np.linspace(0, 1, len(order))))
colors_dict = dict(zip(order, colors))


In [None]:
# Collect the ranked gene names of every cluster into one table
# (one column per cluster) and export the first 50 rows.
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names

df = pd.DataFrame({group: result['names'][group] for group in groups})

df = df.head(50)
df.to_csv('figuresNEW/'+cellline+'/DEgenesLouvain'+cellline+'.csv')

In [None]:
#DE genes time 14day

# Take a real copy of the 14-day subset: the scanpy calls below write results
# into the object, which should not happen on a view of `adata`.
adataaux = adata[adata.obs['timepoint']=='14day'].copy()

# Keep only clusters with more than one cell at this timepoint
# (presumably because the per-group t-test needs at least two observations).
df = scrna.andata2df(adataaux)
df = df[[clustering,'Barcode']].groupby(clustering).count()
df = df.loc[df['Barcode']>1]
ids = df.index.tolist()
adataaux = adataaux[np.isin(adataaux.obs[clustering], ids)].copy()

# Use the `clustering` key everywhere so this cell stays consistent if the
# grouping column is changed in the configuration cell.
sc.tl.rank_genes_groups(adataaux, clustering, method='t-test')
sc.tl.dendrogram(adataaux, groupby=clustering)
p = sc.pl.rank_genes_groups_dotplot(adataaux, n_genes=4)

result = adataaux.uns['rank_genes_groups']

# One column per cluster holding its ranked gene names; export the first 50 rows.
groups = result['names'].dtype.names
df = pd.DataFrame({group: result['names'][group] for group in groups})

df = df.head(50)
df.to_csv('figuresNEW/'+cellline+'/DEgenesLouvain'+cellline+'_14day.csv')

In [None]:
#cloneid
# Stacked bar chart: for every clone, the fraction of its cells in each cluster,
# with the clone size overlaid on a secondary axis.
df=scrna.andata2df(adata)

# cells per (cluster, clone) as a cluster x clone table
gbdf=df[['Barcode',clustering,'cloneid']].groupby([clustering,'cloneid']).count()
gbdf=gbdf.reset_index()
gbdf=gbdf.pivot(index=clustering, columns='cloneid', values='Barcode')

#discard barcodes with no assigned clone (their clone id contains '-')
gbdf=gbdf[[i for i in gbdf.columns if '-' not in i]]

#discard clones with no assigned barcodes
gbdf=gbdf[[c for c in gbdf.columns if gbdf[c].sum()>0]]

#discard louvains with no barcodes in them
gbdf=gbdf.loc[[c for c in gbdf.index if gbdf.loc[c].sum()>0]]


#normalize by total number of cells per clone
gbdfraw=gbdf.copy()
gbdf=gbdf/gbdf.sum()


#--------------------------------------------

# clones with at most this many cells are left out of the plot
minnumbercells=40

dfin=gbdfraw.copy()
dfin.loc['sum']=dfin.sum()
dfin=dfin.sort_values('sum',ascending=False,axis=1)
sorted_clones=dfin.columns
dfin=dfin[[c for c in dfin.columns if dfin.loc['sum',c]>minnumbercells]]
df=dfin/dfin.loc['sum']

# reused by the per-timepoint cell below
sorted_clones_filtered=dfin.columns


fig, ax = plt.subplots(1, 1, figsize=(0.3*len(df.columns), 5))
# The position of the bars on the x-axis
r = df.columns

# Names of group and bar width
names = df.columns
barWidth = 1

# Start the stack from an explicit zero baseline: deriving it from a data row
# (row*0) turns every missing count into a NaN baseline and hides the bars.
yold=pd.Series(0.0, index=df.columns)
for c in order:
    if c not in df.index:
        # cluster was dropped above because it holds no barcoded cells
        continue
    y=df.loc[c].fillna(0)  # a missing (cluster, clone) count is zero cells
    ax.bar(r, y, bottom=yold, edgecolor='white', color=colors_dict[c],width=barWidth,label=c)
    yold=yold+y


# Custom X axis
ax.set_xticks(r)
ax.set_xlabel("clone id")
ax.set_xticklabels(names.tolist(), rotation = 90)
ax.legend(loc='center left', bbox_to_anchor=(1.15, 0.5))
ax.set_ylabel('proportion of cells')

# clone size on a secondary y axis
ax2 = ax.twinx()
ax2.plot(r,dfin.loc['sum'],color='lightgray',linewidth=3)
ax2.set_ylabel('number of cells',color='gray')

plt.show()

In [None]:
# Same stacked bar chart as above, one panel per timepoint, using the clone
# order and selection computed on the pooled data.
clones_to_use = sorted_clones_filtered

dfin=scrna.andata2df(adata)
timepoints=list(adata.obs['timepoint'].unique())

# Size the grid from the data (at least the original 2x2) so that more than
# four timepoints no longer overflow the axes array.
ncols=2
nrows=max(2, int(np.ceil(len(timepoints)/ncols)))
fig, axs = plt.subplots(nrows, ncols, figsize=(13,5*nrows),sharex=True,sharey=True)
axs=np.atleast_1d(axs).ravel()

# cluster-to-cluster correlation stored with the dendrogram
corr=adata.uns['dendrogram_'+clustering]['correlation_matrix']

for k, t in enumerate(timepoints):
    ax=axs[k]

    df2=dfin.loc[dfin['timepoint']==t]

    gbdf=df2[['Barcode',clustering,'cloneid']].groupby([clustering,'cloneid']).count()
    gbdf=gbdf.reset_index()
    gbdf=gbdf.pivot(index=clustering, columns='cloneid', values='Barcode')

    #discard barcodes with no assigned clone
    gbdf=gbdf[[i for i in gbdf.columns if '-' not in i]]

    #sort using previous data; reindex (not []) so a clone that is absent at
    #this timepoint becomes an empty column instead of raising KeyError
    gbdf=gbdf.reindex(columns=clones_to_use)

    #normalize by total number of cells per clone
    gbdfraw=gbdf.copy()
    gbdf=gbdf/gbdf.sum()


    dfin2=gbdfraw.copy()
    dfin2.loc['sum']=dfin2.sum()
    df=dfin2/dfin2.loc['sum']

    # The position of the bars on the x-axis
    r = df.columns

    # Names of group and bar width
    names = df.columns
    barWidth = 1

    # explicit zero baseline so missing counts cannot poison the stack
    yold=pd.Series(0.0, index=df.columns)
    for c in order:
        if c not in df.index:
            # cluster has no row at this timepoint
            continue
        y=df.loc[c].fillna(0)
        ax.bar(r, y, bottom=yold, edgecolor='white', color=colors_dict[c],width=barWidth,label=c)
        yold=yold+y

    # Custom X axis
    ax.set_xticks(r)
    ax.set_xlabel("clone id")
    ax.set_xticklabels(names.tolist(), rotation = 90)
    if k==len(timepoints)-1:
        # legend once, on the last panel
        ax.legend(title='louvain',loc='center left', bbox_to_anchor=(1.2, 0.5))
    ax.set_ylabel('proportion of cells')

    ax.set_title(t)

    # clone size on a secondary y axis (white line drawn over the bars)
    ax2 = ax.twinx()
    ax2.plot(r,dfin2.loc['sum'],color='white',linewidth=3)
    ax2.set_ylabel('number of cells',color='gray')


    #compute metric
    #per clone: cell-weighted average correlation between its most abundant
    #louvain and the louvains it occupies.
    #NOTE(review): `avcorrel` is computed but never stored or plotted.
    #Assumes cluster labels are integers that index `correlation_matrix` - TODO confirm.
    df2=gbdfraw.copy()
    for c in df2.columns:

        # .copy() so the 'wcorr' column is added to an independent frame
        dfaux=df2[[c]].loc[df2[c]>0].copy()

        if len(dfaux)>0:

            most_abundant=int(dfaux.loc[dfaux[c]==max(dfaux[c])].index.values[0])

            dfaux['wcorr']=[corr[most_abundant,int(i)]*dfaux.loc[i,c] for i in dfaux.index]

            #avg correl
            suma=dfaux.sum()

            avcorrel=suma['wcorr']/suma[c]
        else:
            dfaux['wcorr']=0
            avcorrel=0

        #print(avcorrel)

# hide panels that received no timepoint
for unused in axs[len(timepoints):]:
    unused.set_visible(False)

plt.tight_layout()
fig.savefig("figuresNEW/"+cellline+'/prop_cells_perclone'+cellline+".pdf",bbox_inches='tight')
plt.show()

In [None]:
#timepoint
# Count cells per (cluster, timepoint) and derive, for every cluster, the
# share of its cells coming from each timepoint.
df = scrna.andata2df(adata)

counts_long = (
    df[['Barcode', clustering, 'timepoint']]
    .groupby([clustering, 'timepoint'])
    .count()
    .reset_index()
)
gbdf = counts_long.pivot(index=clustering, columns='timepoint', values='Barcode')

# timepoints as rows, clusters as columns
gbdf = gbdf.T
prop_cellsraw = gbdf.copy()

# normalize by louvain: divide every cluster column by its total, then drop
# cluster columns that contain a missing value
gbdf = gbdf.div(gbdf.sum(), axis=1)
gbdf = gbdf.dropna(axis=1)

prop_cells = gbdf.T

In [None]:
from cycler import cycler
# Global plot style used from here on: thick lines and a four-colour cycle.
default_cycler = (cycler(color=['purple','orangered','gold','turquoise']) +
                  cycler(linestyle=['-', '--', ':', '-.']))
plt.rc('lines', linewidth=4)
plt.rc('axes', prop_cycle=default_cycler)

# timepoints x clusters table of raw cell counts
df=prop_cellsraw.copy()


fig, ax = plt.subplots(1, 1, figsize=(10, 5))
# The position of the bars on the x-axis
r = df.columns

# Names of group and bar width
names = df.columns
barWidth = 1

# Explicit timepoint colours. Kept under its own name so the cluster colour map
# `colors_dict` built earlier in the notebook is not overwritten, and with the
# '24hr' key spelled like the timepoint label used elsewhere in the notebook.
timepoint_colors={'0hr':'purple','14day':'orangered','16day':'gold','24hr':'turquoise'}

y_offset=0
for row in range(len(df.index)):
    index=df.iloc[row].index
    label=df.index[row]
    heights=df.iloc[row].fillna(0)  # a missing count is zero cells
    # draw on `ax` (not the pyplot state machine); an unknown timepoint gets
    # color=None, which falls back to the colour cycle set above
    ax.bar(index, heights, barWidth, bottom=y_offset, label=label,
           color=timepoint_colors.get(label))
    y_offset = y_offset + heights

# Custom X axis
ax.set_xticks(r)
ax.set_xlabel(clustering)
ax.set_ylabel("number of cells")
ax.set_xticklabels(names.tolist(), rotation = 0)
ax.legend()

fig.savefig("figuresNEW/"+cellline+'/number_cell_pertimepoint-louvain'+cellline+".pdf",bbox_inches='tight')

plt.show()



In [None]:
# Debug leftover: echoes `index`, the value left behind by the last iteration
# of the bar-plot loop in the previous cell.
index

In [None]:
#for each clone, look at its proportion in each louvain at each timepoint. Using this info we can score transitions between louvain clusters

# Cells with an assigned clone (clone ids containing '-' are unassigned).
# .copy() makes this an independent object: later cells add columns to
# `adata2.obs`, which should not be done on a view of `adata`.
adata2 = adata[['-' not in i for i in adata.obs['cloneid']]].copy()

clones=adata2.obs['cloneid'].unique()

# Row/column layout shared by every per-clone table (hoisted out of the loop).
louvain_levels=adata2.obs['louvain'].unique().sort_values()
timepoint_cols=['0hr','24hr','14day','16day']

dfnorm_dict={}  # clone -> table of per-timepoint proportions (columns sum to 1, or 0 if empty)
df_dict={}      # clone -> table of raw cell counts
for c in clones:

    # empty louvain x timepoint frame that fixes the layout
    df=pd.DataFrame(index=louvain_levels,columns=timepoint_cols)

    adata3= adata2[adata2.obs['cloneid']==c]

    # cells of this clone per (timepoint, louvain)
    dff=scrna.andata2df(adata3)
    dff=dff[['timepoint','Barcode','louvain']].groupby(['timepoint','louvain']).count().reset_index()

    dff=dff.pivot(index='louvain', columns='timepoint', values='Barcode')

    # fill the fixed layout with the observed counts, zero elsewhere
    df=df.combine_first(dff)
    df=df.fillna(0)

    df=df.loc[louvain_levels,timepoint_cols]


    df_dict[c]=df

    # per-timepoint proportions; a timepoint with no cells gives 0/0 -> 0
    dfaux=df/df.sum()
    dfnorm_dict[c]=dfaux.fillna(0)

In [None]:
# create a dataframe with index=clones, columns=[louvain0hr,louvain24hr,louvain14day,louvain16day]
# Each clone's louvain x timepoint proportion table is flattened into one row
# whose columns are named '<louvain>_<timepoint>' (timepoint-major order).
# Rows are collected in a list and assembled once: DataFrame.append was removed
# in pandas 2.0 and grew the frame quadratically.
clone_rows=[]
for c,table in dfnorm_dict.items():
    # Work on a derived frame so the tables stored in dfnorm_dict are not
    # modified; drop a 'louvain' helper column if an earlier run added one.
    table=table.drop(columns=['louvain'], errors='ignore')
    labels=[str(cluster)+'_'+str(timepoint)
            for timepoint in table.columns
            for cluster in table.index]
    # order='F' walks the table column by column, matching `labels`
    values=table.to_numpy(dtype=float).ravel(order='F')
    clone_rows.append(pd.Series(values, index=labels, name=c))

dfcoord=pd.DataFrame(clone_rows)
dfcoord.index.name='cloneid'
dfcoord.columns.name='louvain_time'

In [None]:
# Heatmap of the clone x louvain_time table: rows are clustered, columns keep
# their order, and z_score=0 standardizes along rows.
heatmap_path = "figuresNEW/"+cellline+'/heatmap_cell_prop'+cellline+".pdf"
plot = sns.clustermap(
    dfcoord,
    col_cluster=False,
    z_score=0,
    cbar_kws={'label': 'z_score'},
)
fig = plot.fig
fig.savefig(heatmap_path, bbox_inches='tight')

In [None]:
def add_kmeans(df,n_clusters=4,random_state=0):
    """Cluster the rows of `df` and return a copy with a 'clustering' column.

    Despite the name, the rows are grouped with scikit-learn's
    SpectralClustering (the KMeans call was replaced at some point).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per item to cluster. Every column whose name contains '_' is
        used as a feature; other columns (including a 'clustering' column left
        by a previous call) are ignored.
    n_clusters : int
        Number of clusters requested.
    random_state : int or None
        Seed passed to SpectralClustering so repeated runs give the same
        labels. Pass None to get the previous, unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with a 'clustering' column holding the cluster label of
        each row as a string. `df` itself is not modified.
    """
    import sklearn.cluster as cluster

    features=[i for i in df.columns if '_' in i]
    X = df.loc[:,features]

    model = cluster.SpectralClustering(n_clusters=n_clusters, random_state=random_state)
    model.fit(X)

    df2=df.copy()
    df2['clustering']=[str(i) for i in model.labels_]

    return df2

#feats=[c for c in dfcoord.columns if '_' in c]
#dfcoord,load = do_pca(dfcoord,feats,Npca=5,number_genes=[],zscore=True)
min_number_cells=100 #minimum number of cells per traj class to determine number of traj classes

# Try 8, 7, ... clusters and keep the first (largest) number for which every
# trajectory class holds at least `min_number_cells` cells. The countdown is
# bounded at 1 so a failed search ends with a clear error instead of asking
# the clustering for 0 clusters.
max_nclust=8
done=False
for nclust in range(max_nclust, 0, -1):
    dfcoord=add_kmeans(dfcoord,n_clusters=nclust)

    # clone id -> trajectory class, propagated to every cell of the clone
    diction={i:dfcoord['clustering'].loc[i] for i in dfcoord.index}
    s1=adata2.obs['cloneid']
    s1=s1.map(diction)
    adata2.obs['trajectory_class']=s1
    adata2.obs['trajectory_class']=adata2.obs['trajectory_class'].replace(np.nan,'-')

    # cells per trajectory class
    df=scrna.andata2df(adata2)
    df=df[['trajectory_class','Barcode']].groupby(['trajectory_class']).count().reset_index()
    lista=df['Barcode']
    done=all(i >= min_number_cells for i in lista)
    if done:
        break

if not done:
    raise RuntimeError('no number of clusters between 1 and '+str(max_nclust)+
                       ' gives at least '+str(min_number_cells)+' cells in every trajectory class')

print('optimal number of clusters',nclust)
df

In [None]:
#feats=[c for c in dfcoord.columns if '_' in c]
#dfcoord,load = do_pca(dfcoord,feats,Npca=5,number_genes=[],zscore=True)
min_number_clones=2 #minimum number of clones per traj class and timepoint to determine number of traj classes

# Try 8, 7, ... clusters and keep the first (largest) number for which every
# (trajectory class, timepoint) pair contains at least `min_number_clones`
# distinct clones. The countdown is bounded at 1 so a failed search ends with
# a clear error instead of asking the clustering for 0 clusters.
max_nclust=8
done=False
for nclust in range(max_nclust, 0, -1):
    dfcoord=add_kmeans(dfcoord,n_clusters=nclust)

    # clone id -> trajectory class, propagated to every cell of the clone
    diction={i:dfcoord['clustering'].loc[i] for i in dfcoord.index}
    s1=adata2.obs['cloneid']
    s1=s1.map(diction)
    adata2.obs['trajectory_class']=s1
    adata2.obs['trajectory_class']=adata2.obs['trajectory_class'].replace(np.nan,'-')

    # distinct clones per (trajectory class, timepoint)
    df=scrna.andata2df(adata2)
    df=df[['trajectory_class','timepoint','cloneid']].groupby(['trajectory_class','timepoint']).nunique().reset_index()
    lista=df['cloneid']
    done=all(i >= min_number_clones for i in lista)
    if done:
        break

if not done:
    raise RuntimeError('no number of clusters between 1 and '+str(max_nclust)+
                       ' gives at least '+str(min_number_clones)+
                       ' clones in every trajectory class and timepoint')

print('optimal number of clusters',nclust)
df

In [None]:
# Distinct clone ids assigned to trajectory class '2'.
df=scrna.andata2df(adata2)
# Single .loc selection followed by Series.unique(): works whether 'cloneid' is
# categorical or plain strings (`.values.unique()` fails on a NumPy array) and
# avoids chained indexing.
df.loc[df['trajectory_class']=='2','cloneid'].unique()

In [None]:
# UMAP of the cells currently in trajectory class '0', coloured first by
# timepoint and then by clone.
adataaux = adata2[adata2.obs['trajectory_class']=='0']
for color_key in ('timepoint', 'cloneid'):
    sc.pl.umap(adataaux, color=[color_key], size=10)

# Re-cluster the clones with a fixed number of classes and push the new
# labels back onto the cells ('-' where a clone has no label).
nclust = 5
dfcoord = add_kmeans(dfcoord, n_clusters=nclust)

diction = dict(zip(dfcoord.index, dfcoord['clustering']))
s1 = adata2.obs['cloneid'].map(diction)
adata2.obs['trajectory_class'] = s1
adata2.obs['trajectory_class'] = adata2.obs['trajectory_class'].replace(np.nan,'-')

In [None]:
#add trajectory type into the andata file
# The same clone -> class lookup is applied to the full object and to the
# clone-filtered one; cells whose clone has no class get '-'.
diction = dict(zip(dfcoord.index, dfcoord['clustering']))

for target in (adata, adata2):
    s1 = target.obs['cloneid'].map(diction)
    target.obs['trajectory_class'] = s1
    target.obs['trajectory_class'] = target.obs['trajectory_class'].replace(np.nan,'-')

In [None]:
# One UMAP panel per trajectory class, cells coloured by timepoint, with the
# louvain cluster outlines of all cells drawn on top.
stratify='timepoint'
feat='UMAP'
colors = ['purple','orangered','gold','turquoise']#['darkred','violet','green','b']
xs = feat + '1'
ys = feat + '2'

# Fix the timepoint -> colour assignment once, from the full data set.
# Zipping the colours with the timepoints present in each class shifts every
# colour whenever a class is missing a timepoint.
dff=scrna.andata2df(adata2)
color_by_timepoint=dict(zip(dff.sort_values(stratify)[stratify].unique(), colors))

# Size the grid from the number of classes (at least the original 2x3).
classes=list(adata2.obs['trajectory_class'].unique())
ncols=3
nrows=max(2, int(np.ceil(len(classes)/ncols)))
fig, ax = plt.subplots(nrows,ncols, figsize=(14, 5*nrows))
ax=np.atleast_1d(ax).ravel()

for kk, tc in enumerate(classes):
    axs=ax[kk]
    adataaux=adata2[adata2.obs['trajectory_class']==tc]

    expr = scrna.andata2df(adataaux)
    expr=expr.sort_values(stratify)
    for c in expr[stratify].unique():
        if c not in color_by_timepoint:
            # more timepoints than colours: unmatched ones are not drawn
            continue
        mask = expr[stratify] == c
        expraux = expr.loc[mask, [xs, ys, stratify]]
        axs.scatter(expraux[xs], expraux[ys], c=color_by_timepoint[c], edgecolor='lightgray',
                    s=80, marker='.', label=c, linewidths=.3,alpha=0.5)

    axs.set_xlabel(xs)
    axs.set_ylabel(ys)

    axs.set_xticks([])

    axs.legend()

    # transparent points (alpha=0) with add_outline=True: only the cluster
    # outlines of the whole data set are drawn over the panel
    scv.pl.scatter(adata2, color='louvain', size=80,ax=axs,add_outline=True,alpha=0,show=False)

    axs.set_title('traj class:'+tc)

# hide panels that received no trajectory class
for unused in ax[len(classes):]:
    unused.set_visible(False)

plt.tight_layout()
plt.show()

fig.savefig("figuresNEW/"+cellline+"/UMAPS_final/"+cellline+"traj_classesUMAP.png", dpi=300, bbox_inches='tight')

In [None]:
# Save `adata` (now carrying the 'trajectory_class' column in .obs) under
# final_anndatasNEW/, named after the cell line.
adata.write('final_anndatasNEW/'+cellline+'.h5ad')