In [1]:
import pandas as pd
import numpy as np
import random as rd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.decomposition import PCA

In [2]:
# Row labels (gene1..gene100) and column labels for the two sample groups
# (wt1..wt5 and ko1..ko5).
genes = ['gene{}'.format(n) for n in range(1, 101)]
wt = ['wt{}'.format(n) for n in range(1, 6)]
ko = ['ko{}'.format(n) for n in range(1, 6)]

In [3]:
# Empty frame: one row per gene, one column per sample (wt columns first, then ko).
data = pd.DataFrame(index=genes, columns=wt + ko)

In [4]:
# Seed both generators used below so the simulated counts are the same on every
# Restart & Run All: the `random` module picks each Poisson mean, NumPy draws the counts.
rd.seed(42)
np.random.seed(42)

# For every gene, draw the wt samples from a Poisson distribution with one randomly
# chosen mean (an integer in [10, 1000)), and the ko samples from a Poisson distribution
# with a second, independently chosen mean. Selecting columns through the `wt`/`ko`
# lists (and sizing the draws from them) keeps this loop in step with the column labels
# instead of repeating 'wt1':'wt5' and size=5 by hand.
for gene in data.index:
    data.loc[gene, wt] = np.random.poisson(lam=rd.randrange(10, 1000), size=len(wt))
    data.loc[gene, ko] = np.random.poisson(lam=rd.randrange(10, 1000), size=len(ko))

In [5]:
# Preview the first five genes. A bare last expression lets the notebook render the
# frame with its rich table display instead of the plain-text print() output.
data.head()

       wt1  wt2  wt3  wt4  wt5  ko1  ko2  ko3   ko4  ko5
gene1  753  782  831  810  781  800  796  805   731  730
gene2  609  616  600  608  632  499  578  568   529  542
gene3  447  381  408  446  439  976  948  953  1010  993
gene4   56   49   64   56   57  653  630  623   667  634
gene5  105  106   96  115  124  240  261  220   234  229


In [6]:
# Report the frame's dimensions as (number of rows, number of columns).
print((len(data.index), len(data.columns)))

(100, 10)


In [7]:
# Transpose so each sample is a row and each gene is a column, then standardize every
# column (gene). The keyword arguments are scale()'s defaults, written out explicitly:
# center to mean 0 and scale to unit variance along axis 0.
scaled_data = preprocessing.scale(
    data.T,
    axis=0,
    with_mean=True,
    with_std=True,
)

In [8]:
# Fit a PCA (default settings) on the standardized samples, then project those same
# samples onto the fitted components. fit() returns the estimator, so the two calls chain.
pca = PCA()
pca_data = pca.fit(scaled_data).transform(scaled_data)

In [9]:
# Explained variance of each component as a percentage, rounded to one decimal place,
# plus matching axis labels PC1, PC2, ... (one per component).
per_var = np.round(100 * pca.explained_variance_ratio_, 1)
labels = ['PC{}'.format(rank) for rank, _ in enumerate(per_var, start=1)]

In [10]:
# Scree plot: one bar per principal component, with bar height equal to the percentage
# of variance that component explains (values in `per_var`, tick labels in `labels`).
plt.bar(x=range(1, len(per_var)+1), height=per_var, tick_label=labels)
# Axis label typo fixed: 'Explainded' -> 'Explained'.
plt.ylabel('Percentage of Explained Variance')
plt.xlabel('Principal Component')
plt.title('Scree Plot')
plt.show()

In [22]:
# NOTE(review): no figure is built in this cell, and its execution count (22) is out of
# sequence with the cells above — this looks like a leftover call that likely renders
# nothing on a fresh run; confirm and remove.
plt.show()