In [None]:
import numpy as np 
import pandas as pd
from sklearn.decomposition import PCA
import plot_utils as plu
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import plot_utils as plu

## PCA

### Compute PCs

#### Using scikit-learn regular PCA
- combined_PCA: all datasets are merged and PCA is computed once
- coupled_PCA: for each generated dataset, merge with the real dataset and compute PCA
- independent_PCA: PCA on each dataset separately (not even merged with real)


For each type of PCA, output:
- separated scatterplots
- superposed scatterplot (generated data overlaid on top of the Real data shown in gray)
- density plots of the PC scores
- distances between real and generated PC scores (e.g. Wasserstein distance)

In [None]:
# Banner announcing the start of the PCA stage: one blank line, the section
# header, then a progress message (each on its own line).
print(
    "",
    "#################### PCA ####################",
    "- Computing and plotting PCA...",
    sep="\n",
)

In [None]:
# Number of principal components requested from every PCA below
# (raise this value to compute more PCs).
ncomp = 6

# Accumulates the score tables returned by plu.computePCAdist; they are
# concatenated and written out once all PCA variants have run.
scorelist = []

In [None]:
if matching_SNPs:
    # Combined PCA: every dataset is stacked into a single matrix and one PCA
    # is fitted on the whole stack.
    method = 'combined_PCA'
    methodname = "Combined PCA"
    print(f'    - Computing and plotting {methodname} ...')

    pca = PCA(n_components=ncomp)
    pcs = pca.fit_transform(np.concatenate(list(datasets.values())))

    # One column per component, named PC1, PC2, ...
    pc_columns = ["PC{}".format(i + 1) for i in range(pcs.shape[1])]
    pcdf = pd.DataFrame(pcs, columns=pc_columns)

    # Labels are attached by index alignment with `sampleinfo`.
    # NOTE(review): assumes sampleinfo rows are indexed 0..N-1 in the same
    # order as the concatenated datasets -- confirm where sampleinfo is built.
    pcdf["label"] = sampleinfo.label.astype('category')

    plu.plotPCAallfigs(
        pcdf,
        methodname,
        orderedCat=infiles.keys(),
        outDir=outDir,
        colpal=colpal,
    )
    plt.show()

In [None]:

# Coupled PCA: every dataset listed in `categ` is stacked under the Real
# dataset and a separate PCA is fitted on each such pair.
methodname="Coupled PCA"
method='coupled_PCA'
if matching_SNPs:
    # Stacking Real with another dataset requires matching SNPs, so this stage
    # is guarded the same way as the combined PCA.
    print(f'Computing {methodname} ...')
    nReal = datasets['Real'].shape[0]
    coupled_frames = []
    for cat in categ:
        # NOTE(review): PCA is created without random_state; if scikit-learn
        # selects its randomized SVD solver the scores may differ slightly
        # between runs -- consider fixing a seed.
        pca = PCA(n_components=ncomp)
        pcs = pca.fit_transform(
            np.concatenate([datasets['Real'],datasets[cat]])
        ) # PCA on combined Real + cat individuals
        # Scores of BOTH the Real and the `cat` individuals are kept:
        # `label` records which dataset a row comes from, `coupled_with`
        # records which pairing produced it.
        df = pd.DataFrame(pcs, columns=["PC{}".format(x+1) for x in range(pcs.shape[1])])
        df['label']=np.concatenate([['Real']*nReal,[cat]*datasets[cat].shape[0]])
        df['coupled_with'] = cat
        coupled_frames.append(df)

    # Concatenate once after the loop instead of re-copying a growing frame on
    # every iteration; fall back to an empty frame when `categ` is empty.
    pcdf = pd.concat(coupled_frames, ignore_index=True) if coupled_frames else pd.DataFrame()

    # plot all PCA figures
    plu.plotPCAallfigs(pcdf, methodname, orderedCat=infiles.keys(), outDir=outDir, colpal=colpal)
    plt.show()

    # distances between Real and generated PC scores (Wasserstein)
    k = plu.computePCAdist(pcdf,method,outDir,stat='wasserstein')
    scorelist.append(k)
    print('Scorelist-wasserstein', k)
    plt.show()

    # same comparison with the 'wasserstein2D' statistic
    k = plu.computePCAdist(pcdf, method, outDir, stat='wasserstein2D',reg=1e-3)
    scorelist.append(k)
    print('Scorelist-wasserstein2D', k)
else:
    print(f'Skipping {methodname} (matching_SNPs is False)')

In [None]:
if allchecks or not matching_SNPs:
    # compute if not matching_SNPs even if not allchecks
    # because combined or coupled PCA are not possible in this case
    #
    # Independent PCA: a separate PCA is fitted on each dataset in `categ`
    # on its own (not merged with Real).
    methodname="Independent PCA"
    method='independent_PCA'
    print(f'Computing {methodname} ...')
    independent_frames = []
    for cat in categ:
        pca = PCA(n_components=ncomp)
        pcs = pca.fit_transform(datasets[cat])
        df = pd.DataFrame(pcs, columns=["PC{}".format(x+1) for x in range(pcs.shape[1])])
        # place the dataset name right after the PC columns
        df.insert(ncomp,'label',cat)
        independent_frames.append(df)

    # Concatenate once after the loop instead of re-copying a growing frame on
    # every iteration; fall back to an empty frame when `categ` is empty.
    pcdf = pd.concat(independent_frames, ignore_index=True) if independent_frames else pd.DataFrame()

    # plot all PCA figures
    plu.plotPCAallfigs(pcdf, methodname, orderedCat=infiles.keys(), outDir=outDir, colpal=colpal)
    # displayed explicitly, like the other PCA plotting cells
    plt.show()


In [None]:

# Gather every score table collected in `scorelist`, save it, and summarise
# the statistic per (method, stat, label).
if scorelist:
    scores_pca = pd.concat(scorelist, sort=False)
    scores_pca.to_csv(outDir+'scores_all_PCA.csv')

    # sum and average of the scores (distances) across PC axes
    score_groups = scores_pca.groupby(['method','stat','label'])['statistic']
    sc_sum_over_PCs = score_groups.sum()
    sc_mean_over_PCs = score_groups.mean()
    print(sc_mean_over_PCs)
else:
    # pd.concat raises ValueError on an empty list, so handle the case where
    # no PCA variant produced scores; the names are still defined (empty).
    print('No PCA scores were collected; scores_all_PCA.csv not written')
    scores_pca = pd.DataFrame()
    sc_sum_over_PCs = pd.Series(dtype=float)
    sc_mean_over_PCs = pd.Series(dtype=float)
print(sc_mean_over_PCs)

In [None]:
# Banner marking the end of the PCA stage.
print("#################### PCA DONE ####################")