In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms
import math
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from datetime import date
#build an MMDDYY stamp from today's date (intended for figure file names)
#NOTE(review): rebinding `date` below hides the datetime.date class imported above
today = date.today()
date = format(today, '%m%d%y')
print(date)

In [None]:
#define paths (both end with '/' so file names can be appended directly)
basedir='eclarki_genome_paper/' #project root
gene_exp_dir='%s06_Gene_expression/' % basedir #gene expression directory inside the project root


In [None]:
#paths to gene expression file and sample-condition file
expFile=gene_exp_dir+'lengthScaledTPM.matrix' #normalized gene expression matrix (input file)
sampleConditionFile=gene_exp_dir+'samples_conditions.txt' #sample-condition file groups samples for PCA
! head {sampleConditionFile} #check input file


In [None]:
#PCA of all samples (Bryopsis and Elysia)

In [None]:
#link samples to conditions
sampleConditionDict = {} #sample name -> condition label

#read the tab-separated sample/condition table; `with` closes the file handle once the loop ends
with open(sampleConditionFile, 'r') as sampleConditionHandle:
    for line in sampleConditionHandle:
        line=line.rstrip().split('\t')
        if len(line)<2: #skip blank/short lines that have no condition column (would raise IndexError below)
            continue
        if 'sample' in line[0]: #skip header
            continue
        sample,condition=line[0],line[1] #get samples, conditions from columns 1 and 2
        sampleConditionDict[sample]=condition #store in dictionary with sample as key

print('%i sample-condition pairs stored in sampleConditionDict\n' % len(sampleConditionDict))

df=pd.read_csv(expFile, header=0, sep='\t').transpose() #bring in gene expression matrix and transpose
display(df) #check dataframe




In [None]:
#keep only the samples that have a condition in sampleConditionDict, preserving matrix order
#(one boolean filter replaces dropping rows one at a time, in place, while looping over the index)
hasCondition=[sample in sampleConditionDict for sample in df.index]
df=df.loc[hasCondition].copy()

samples=list(df.index) #samples retained in the matrix
conditions=[sampleConditionDict[sample] for sample in samples] #condition of each retained sample, same order

print('%i unique conditions:' % len(set(conditions))) #number of distinct condition labels
print('\t'.join(set(conditions))+'\n') #print conditions
features=np.asarray(df.columns) #array of all genes in the dataframe
print('%i unique genes' % len(set(features)))

#standardscale dataframe
x=df.loc[:, features].values #store features (genes) as x
x=StandardScaler().fit_transform(x) #normalize features
#x.shape: first value=number of samples, second value=number of genes in matrix

#check if the mean=0 and std=1
print('mean: %s (%f)\nstandard deviation: %s (%f)' % (np.mean(x),round(np.mean(x)),np.std(x),round(np.std(x))))




In [None]:
#generic column labels for the scaled matrix: gene0, gene1, ... (one per column of x)
feat_cols=['gene%d' % gene_idx for gene_idx in range(x.shape[1])]

#wrap the normalized data in a dataframe so it can be inspected
normalised_df=pd.DataFrame(data=x, columns=feat_cols)
display(normalised_df) #check dataframe



In [None]:
#swap the current index for a 0..n-1 integer index; drop=True discards the old index instead of keeping it as a column
df.reset_index(drop=True, inplace=True)
df.head() #check dataframe

In [None]:
#run PCA keeping the first n_components principal components

n_components=15 #number of PCs requested
n_components=min(n_components, *x.shape) #PCA cannot return more PCs than there are samples or genes

#column names 'principal component 1' ... 'principal component n'
columns=['principal component %s' % str(i+1) for i in range(n_components)]

pca_comp=PCA(n_components=n_components)
principalComponents_comp=pca_comp.fit_transform(x) #fit the PCA and project the samples onto the PCs
print('Explained variation per principal component: {}'.format(pca_comp.explained_variance_ratio_)) #explained variance

principal_Df=pd.DataFrame(data=principalComponents_comp, columns=columns) #store outputs as df for plotting
df['label']=conditions #add label column
df['sample']=samples #add sample column
display(df.head())

#build scree plot
ind=np.arange(0, n_components) #positions 0..n_components-1, one per PC
(fig, ax) = plt.subplots(figsize=(8, 6)) #define figure size
sns.pointplot(x=ind, y=pca_comp.explained_variance_ratio_, ax=ax) #plot explained variance for each PC on the axes created above
ax.set_title('Scree plot')
ax.set_xticks(ind)
ax.set_xticklabels([str(i+1) for i in ind]) #label ticks with the 1-based PC number
ax.set_xlabel('Component Number')
ax.set_ylabel('Explained Variance')
plt.show()

In [None]:
#Plotting settings
# print(set(conditions)) #use this to get list for targets (unordered)
sns.set_theme(style='white') #set background to white

#conditions in order (so they look best in fig legend)
targets=['Bp','E','V','L','AS']

#condition code -> label shown in the figure legend
key_dict=dict(Bp='Bryopsis', E='egg', V='veliger', L='crawling larvae', AS='juvenile slugs')
#Veliger	Bryopsis	Egg	Juvenile_slug	Larval_slug

#hexcolor -> caption written above that color in the palette preview (leading spaces position the text)
name_dict={
    '#bdbdbd':'Bryopsis',
    '#edf8fb':'   egg',
    '#99d8c9':' veliger',
    '#41ae76':'   larvae',
    '#005824':'  adult',
}

#condition code -> color
palette=dict(Bp='#bdbdbd', E='#edf8fb', V='#99d8c9', L='#41ae76', AS='#005824')

pallist=list(palette.values())
sns.palplot(pallist) #display colors in palette
ax=plt.gca()
#add names of color/group
for swatch_pos, hexcolor in enumerate(pallist):
    ax.text(swatch_pos-.4, -0.8, name_dict[hexcolor], fontsize=10)
plt.show()

#first letter of the condition code -> matplotlib marker
markerdict=dict(B='*', V='v', L='D', E='o', A='s')

outdir=''

In [None]:
def confidence_ellipse(x, y, ax, n_std=1.5, facecolor='none', **kwargs):
    """
    Add a covariance ellipse of the paired values x and y to ax.

    x, y: array-likes of equal size (e.g. numpy arrays or pandas Series)
    ax: matplotlib axes the ellipse is added to
    n_std: number of standard deviations used to scale the ellipse along each axis
    facecolor: fill color of the ellipse ('none' = outline only)
    **kwargs: forwarded to matplotlib.patches.Ellipse

    Returns whatever ax.add_patch returns for the ellipse.
    Raises ValueError if x and y differ in size.
    """
    x=np.asarray(x)
    y=np.asarray(y)
    if x.size!=y.size:
        #fail here with a clear message instead of printing and continuing into np.cov
        raise ValueError('x and y must be the same size')

    cov=np.cov(x, y) #2x2 covariance matrix of x and y
    pearson=cov[0, 1]/np.sqrt(cov[0, 0] * cov[1, 1]) #Pearson correlation coefficient
    #radii of the ellipse for the standardized (unit-variance) data
    ell_radius_x=np.sqrt(1 + pearson)
    ell_radius_y=np.sqrt(1 - pearson)
    ellipse=Ellipse((0, 0), width=ell_radius_x * 2, height=ell_radius_y * 2, facecolor=facecolor, **kwargs)

    # calculate standard deviation of x from the squareroot of the variance and multiplying
    # by given number of standard deviations
    scale_x=np.sqrt(cov[0, 0]) * n_std
    mean_x=np.mean(x)

    # calculate standard deviation of y
    scale_y=np.sqrt(cov[1, 1]) * n_std
    mean_y=np.mean(y)

    #rotate the unit ellipse by 45 degrees, stretch it to the data's spread, then move it to the data's mean
    transf=transforms.Affine2D().rotate_deg(45).scale(scale_x, scale_y).translate(mean_x, mean_y)
    ellipse.set_transform(transf + ax.transData)
    return ax.add_patch(ellipse)

In [None]:
def plot_pcs(targets,palette,df,principal_Df,keep_conditions,markerdict):
    """
    Draw PC1 v PC2, PC2 v PC3 and PC3 v PC4 scatterplots (one figure each), grouped by condition.

    targets: condition labels in legend order; a target is drawn when its first character is in keep_conditions
    palette: dict of condition label -> color
    df: dataframe with a 'label' column holding each row's condition; its index is used to select rows of principal_Df
    principal_Df: dataframe with columns named 'principal component N'
    keep_conditions: first characters of the conditions to include
    markerdict: dict of first character of the condition label -> matplotlib marker

    Also reads the notebook-level names key_dict (legend labels), pca_comp (explained variance),
    outdir (prefix of the printed file name) and confidence_ellipse.
    """
    #dict defining PC variables for each plot: [1st PC (pca), 2nd PC (pcb), pca location in variance list, pcb location in variance list]
    plotdict={'PC1 v PC2':['1','2',0,1],
              'PC2 v PC3':['2','3',1,2],
              'PC3 v PC4':['3','4',2,3]}

    # plotting PCs
    for key in plotdict.keys():
        pcnum_a,pcnum_b,pcva,pcvb=plotdict[key]
        print('plotting %s' % key)
        fig=plt.figure(figsize=(10,10))
        ax=plt.subplot(111)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)

        #column names of the two PCs; set before the target loop so they exist even when no target is kept
        pca='principal component '+pcnum_a
        pcb='principal component '+pcnum_b

        custom_legend_data = []
        for target in targets:
            if target[0] not in keep_conditions: #specify which conditions to keep for plot
                continue
            color=palette[target]
            marker=markerdict[target[0]]
            custom_legend_data.append(Line2D([0], [0], marker=marker, color='w', label=key_dict[target],
                                             markerfacecolor=color, markersize=10, markeredgewidth=0.8,
                                             markeredgecolor='gray')) #build custom legend
            indicesToKeep=df.index[df['label']==target].tolist() #find index in array of the condition

            #plot scatterplot of just those indices (aka samples)
            plt.scatter(principal_Df.loc[indicesToKeep, pca], principal_Df.loc[indicesToKeep, pcb],
                        s=100, label=target, marker=marker, c=color, alpha=.8,
                        edgecolor='gray', linewidth=0.8)
            #plot ellipse around each condition
            if len(keep_conditions)<3: #if displaying <3 conditions, make ellipses filled with color
                confidence_ellipse(principal_Df.loc[indicesToKeep, pca], principal_Df.loc[indicesToKeep, pcb],
                                   ax, facecolor=color, edgecolor='gray', linewidth=2, alpha=.3, zorder=0)
            else: #if all conditions, only show outline with color
                confidence_ellipse(principal_Df.loc[indicesToKeep, pca], principal_Df.loc[indicesToKeep, pcb],
                                   ax, edgecolor=color, linewidth=3, alpha=.8, zorder=0)

        box=ax.get_position() #shrink axis height by 10% on bottom
        ax.set_position([box.x0, box.y0 + box.height * 0.1, box.width, box.height * 0.9])

        legend_cols=len(keep_conditions)
        ax.legend(handles=custom_legend_data, loc='upper center', bbox_to_anchor=(0.5, -0.1),
                  fancybox=True, ncol=legend_cols) #put legend below current axis

        #get variance explained (in percent) for each PC
        variance_pca=pca_comp.explained_variance_ratio_[pcva]*100
        variance_pcb=pca_comp.explained_variance_ratio_[pcvb]*100

        #plot PC name and variance explained on the axes
        plt.xlabel('%s\n(%f %%)' % (pca,variance_pca), fontsize=20)
        plt.ylabel('%s\n(%f %%)' % (pcb,variance_pcb), fontsize=20)

        #make axes fit all datapoints and make them square
        #axes run from -lim to +lim, so use the largest ABSOLUTE coordinate; the plain maximum
        #would clip samples whose negative scores are larger in magnitude than the largest positive one
        maxpca=max(abs(principal_Df[pca]))
        maxpcb=max(abs(principal_Df[pcb]))
        max_axis_size = max(maxpca, maxpcb) #take the larger of the two axes
        lim=math.ceil(max_axis_size/100)*100 #round axis up (ie ceiling) length to nearest 100
        print('max_axis_size',max_axis_size)
        print('lim',lim)
        plt.xlim(-lim,lim)
        plt.ylim(-lim,lim)
        plt.tight_layout()

        #NOTE(review): savefig is disabled here, so the message printed below names a file that is not written
#         plt.savefig('%sElysia_Bryopsis_PC%svsPC%s.svg' % (outdir,pcnum_a,pcnum_b), format='svg', bbox_inches='tight')
        print('plot saved as: %sElysia_Bryopsis_PC%svsPC%s.svg' % (outdir,pcnum_a,pcnum_b))
        plt.show()
        plt.close(fig) #release the figure before building the next one

        




In [None]:
#conditions to include, given as the first character of each target label
keep_conditions=['B','E','V','L','A']

plot_pcs(targets=targets, palette=palette, df=df, principal_Df=principal_Df,
         keep_conditions=keep_conditions, markerdict=markerdict)

In [None]:
#PCA with only Elysia samples

In [None]:
#link samples to conditions (Bryopsis samples excluded)
sampleConditionDict = {} #sample name -> condition label

#read the tab-separated sample/condition table; `with` closes the file handle once the loop ends
with open(sampleConditionFile, 'r') as sampleConditionHandle:
    for line in sampleConditionHandle:
        line=line.rstrip().split('\t')
        if len(line)<2: #skip blank/short lines that have no condition column (would raise IndexError below)
            continue
        if 'sample' in line[0]: #skip header
            continue
        sample,condition=line[0],line[1] #get samples, conditions from columns 1 and 2
        if 'Bp' not in sample: #skip Bryopsis
            sampleConditionDict[sample]=condition #store in dictionary with sample as key

print('%i sample-condition pairs stored in sampleConditionDict\n' % len(sampleConditionDict))

df=pd.read_csv(expFile, header=0, sep='\t').transpose() #bring in gene expression matrix and transpose
df=df[~df.index.str.startswith('Bp')] #remove bryopsis

df=df.loc[:,(df!=0).any(axis=0)] #drop columns with all 0s-- fixes issue with standard dev in next block

display(df) #check dataframe


In [None]:
#keep only the samples that have a condition in sampleConditionDict, preserving matrix order
#(one boolean filter replaces dropping rows one at a time, in place, while looping over the index)
hasCondition=[sample in sampleConditionDict for sample in df.index]
df=df.loc[hasCondition].copy()

samples=list(df.index) #samples retained in the matrix
conditions=[sampleConditionDict[sample] for sample in samples] #condition of each retained sample, same order

print('%i unique conditions:' % len(set(conditions))) #number of distinct condition labels
print('\t'.join(set(conditions))+'\n') #print conditions
features=np.asarray(df.columns) #array of all genes in the dataframe
print('%i unique genes' % len(set(features)))

#standardscale dataframe
x=df.loc[:, features].values #store features (genes) as x
x=StandardScaler().fit_transform(x) #normalize features
#x.shape: first value=number of samples, second value=number of genes in matrix

#check if the mean=0 and std=1
print('mean: %s (%f)\tstandard deviation: %s (%f)' % (np.mean(x),round(np.mean(x)),np.std(x),round(np.std(x))))



In [None]:
#generic column labels for the scaled matrix: gene0, gene1, ... (one per column of x)
feat_cols=['gene%d' % gene_idx for gene_idx in range(x.shape[1])]

#wrap the normalized data in a dataframe so it can be inspected
normalised_df=pd.DataFrame(data=x, columns=feat_cols)
display(normalised_df) #check dataframe


In [None]:
#swap the current index for a 0..n-1 integer index; drop=True discards the old index instead of keeping it as a column
df.reset_index(drop=True, inplace=True)
df.head() #check dataframe

In [None]:
#run PCA keeping the first n_components principal components

n_components=12 #number of PCs requested
n_components=min(n_components, *x.shape) #PCA cannot return more PCs than there are samples or genes

#column names 'principal component 1' ... 'principal component n'
columns=['principal component %s' % str(i+1) for i in range(n_components)]

pca_comp=PCA(n_components=n_components)
principalComponents_comp=pca_comp.fit_transform(x) #fit the PCA and project the samples onto the PCs
print('Explained variation per principal component: {}'.format(pca_comp.explained_variance_ratio_)) #explained variance

principal_Df=pd.DataFrame(data=principalComponents_comp, columns=columns) #store outputs as df for plotting
df['label']=conditions #add label column
df['sample']=samples #add sample column
display(df.head())

#build scree plot
ind=np.arange(0, n_components) #positions 0..n_components-1, one per PC
(fig, ax) = plt.subplots(figsize=(8, 6)) #define figure size
sns.pointplot(x=ind, y=pca_comp.explained_variance_ratio_, ax=ax) #plot explained variance for each PC on the axes created above
ax.set_title('Scree plot')
ax.set_xticks(ind)
ax.set_xticklabels([str(i+1) for i in ind]) #label ticks with the 1-based PC number
ax.set_xlabel('Component Number')
ax.set_ylabel('Explained Variance')
plt.show()

In [None]:
#Plotting settings
# print(set(conditions)) #use this to get list for targets (unordered)
sns.set_theme(style='white') #set background to white

#conditions in order (so they look best in fig legend)
targets=['E','V','L','AS']

#condition code -> label shown in the figure legend
key_dict=dict(Bp='Bryopsis', E='egg', V='veliger', L='larvae', AS='adult')

#hexcolor -> caption written above that color in the palette preview (leading spaces position the text)
name_dict={
    '#edf8fb':'   egg',
    '#99d8c9':' veliger',
    '#41ae76':'  larvae',
    '#005824':'  adult',
}

#condition code -> color
palette=dict(E='#edf8fb', V='#99d8c9', L='#41ae76', AS='#005824')

pallist=list(palette.values())
sns.palplot(pallist) #display colors in palette
ax=plt.gca()
#add names of color/group
for swatch_pos, hexcolor in enumerate(pallist):
    ax.text(swatch_pos-.4, -0.8, name_dict[hexcolor], fontsize=10)
plt.show()

#first letter of the condition code -> matplotlib marker
markerdict=dict(V='v', L='D', E='o', A='s')


In [None]:
def confidence_ellipse(x, y, ax, n_std=1.5, facecolor='none', **kwargs):
    """
    Add a covariance ellipse of the paired values x and y to ax.

    x, y: array-likes of equal size (e.g. numpy arrays or pandas Series)
    ax: matplotlib axes the ellipse is added to
    n_std: number of standard deviations used to scale the ellipse along each axis
    facecolor: fill color of the ellipse ('none' = outline only)
    **kwargs: forwarded to matplotlib.patches.Ellipse

    Returns whatever ax.add_patch returns for the ellipse.
    Raises ValueError if x and y differ in size.
    """
    x=np.asarray(x)
    y=np.asarray(y)
    if x.size!=y.size:
        #fail here with a clear message instead of printing and continuing into np.cov
        raise ValueError('x and y must be the same size')

    cov=np.cov(x, y) #2x2 covariance matrix of x and y
    pearson=cov[0, 1]/np.sqrt(cov[0, 0] * cov[1, 1]) #Pearson correlation coefficient
    #radii of the ellipse for the standardized (unit-variance) data
    ell_radius_x=np.sqrt(1 + pearson)
    ell_radius_y=np.sqrt(1 - pearson)
    ellipse=Ellipse((0, 0), width=ell_radius_x * 2, height=ell_radius_y * 2, facecolor=facecolor, **kwargs)

    # calculate standard deviation of x from the squareroot of the variance and multiplying
    # by given number of standard deviations
    scale_x=np.sqrt(cov[0, 0]) * n_std
    mean_x=np.mean(x)

    # calculate standard deviation of y
    scale_y=np.sqrt(cov[1, 1]) * n_std
    mean_y=np.mean(y)

    #rotate the unit ellipse by 45 degrees, stretch it to the data's spread, then move it to the data's mean
    transf=transforms.Affine2D().rotate_deg(45).scale(scale_x, scale_y).translate(mean_x, mean_y)
    ellipse.set_transform(transf + ax.transData)
    return ax.add_patch(ellipse)

In [None]:
def plot_pcs(targets,palette,df,principal_Df,keep_conditions,markerdict):
    """
    Draw and save PC1 v PC2, PC2 v PC3 and PC3 v PC4 scatterplots (one figure each), grouped by condition.

    targets: condition labels in legend order; a target is drawn when its first character is in keep_conditions
    palette: dict of condition label -> color
    df: dataframe with a 'label' column holding each row's condition; its index is used to select rows of principal_Df
    principal_Df: dataframe with columns named 'principal component N'
    keep_conditions: first characters of the conditions to include
    markerdict: dict of first character of the condition label -> matplotlib marker

    Also reads the notebook-level names key_dict (legend labels), pca_comp (explained variance),
    outdir (prefix of the saved file name) and confidence_ellipse.
    Each figure is written as '<outdir>Elysia_PC<a>vsPC<b>.svg'.
    """
    #dict defining PC variables for each plot: [1st PC (pca), 2nd PC (pcb), pca location in variance list, pcb location in variance list]
    plotdict={'PC1 v PC2':['1','2',0,1],
              'PC2 v PC3':['2','3',1,2],
              'PC3 v PC4':['3','4',2,3]}

    # plotting PCs
    for key in plotdict.keys():
        pcnum_a,pcnum_b,pcva,pcvb=plotdict[key]
        print('plotting %s' % key)
        fig=plt.figure(figsize=(10,10))
        ax=plt.subplot(111)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)

        #column names of the two PCs; set before the target loop so they exist even when no target is kept
        pca='principal component '+pcnum_a
        pcb='principal component '+pcnum_b

        custom_legend_data = []
        for target in targets:
            if target[0] not in keep_conditions: #specify which conditions to keep for plot
                continue
            color=palette[target]
            marker=markerdict[target[0]]
            custom_legend_data.append(Line2D([0], [0], marker=marker, color='w', label=key_dict[target],
                                             markerfacecolor=color, markersize=10, markeredgewidth=0.8,
                                             markeredgecolor='gray')) #build custom legend
            indicesToKeep=df.index[df['label']==target].tolist() #find index in array of the condition

            #plot scatterplot of just those indices (aka samples)
            plt.scatter(principal_Df.loc[indicesToKeep, pca], principal_Df.loc[indicesToKeep, pcb],
                        s=100, label=target, marker=marker, c=color, alpha=.8,
                        edgecolor='gray', linewidth=0.8)
            #plot ellipse around each condition
            if len(keep_conditions)<3: #if displaying <3 conditions, make ellipses filled with color
                confidence_ellipse(principal_Df.loc[indicesToKeep, pca], principal_Df.loc[indicesToKeep, pcb],
                                   ax, facecolor=color, edgecolor='gray', linewidth=2, alpha=.3, zorder=0)
            else: #if all conditions, only show outline with color
                confidence_ellipse(principal_Df.loc[indicesToKeep, pca], principal_Df.loc[indicesToKeep, pcb],
                                   ax, edgecolor=color, linewidth=3, alpha=.8, zorder=0)

        box=ax.get_position() #shrink axis height by 10% on bottom
        ax.set_position([box.x0, box.y0 + box.height * 0.1, box.width, box.height * 0.9])

        legend_cols=len(keep_conditions)
        ax.legend(handles=custom_legend_data, loc='upper center', bbox_to_anchor=(0.5, -0.1),
                  fancybox=True, ncol=legend_cols) #put legend below current axis

        #get variance explained (in percent) for each PC
        variance_pca=pca_comp.explained_variance_ratio_[pcva]*100
        variance_pcb=pca_comp.explained_variance_ratio_[pcvb]*100

        #plot PC name and variance explained on the axes
        plt.xlabel('%s\n(%f %%)' % (pca,variance_pca), fontsize=20)
        plt.ylabel('%s\n(%f %%)' % (pcb,variance_pcb), fontsize=20)

        #make axes fit all datapoints and make them square
        #axes run from -lim to +lim, so use the largest ABSOLUTE coordinate; the plain maximum
        #would clip samples whose negative scores are larger in magnitude than the largest positive one
        maxpca=max(abs(principal_Df[pca]))
        maxpcb=max(abs(principal_Df[pcb]))
        max_axis_size = max(maxpca, maxpcb) #take the larger of the two axes
        lim=math.ceil(max_axis_size/100)*100+20 #round axis up (ie ceiling) length to nearest 100, plus a margin of 20
        print('max_axis_size',max_axis_size)
        print('lim',lim)
        plt.xlim(-lim,lim)
        plt.ylim(-lim,lim)
        plt.tight_layout()

        #save the figure, then report the file name that was written
        plt.savefig('%sElysia_PC%svsPC%s.svg' % (outdir,pcnum_a,pcnum_b), format='svg', bbox_inches='tight')
        print('%sElysia_PC%svsPC%s.svg' % (outdir,pcnum_a,pcnum_b))

        plt.show()
        plt.close(fig) #release the figure before building the next one

        



In [None]:
# keep_conditions=['E','V'] #conditions to include

# plot_pcs(targets,palette,df,principal_Df,keep_conditions,markerdict)

In [None]:
#conditions to include, given as the first character of each target label
keep_conditions=['E','V','L','A']

plot_pcs(targets=targets, palette=palette, df=df, principal_Df=principal_Df,
         keep_conditions=keep_conditions, markerdict=markerdict)