In [8]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_classif

In [2]:
# Read the sample-by-gene table from disk and preview the first rows.
dataset_csv = "ClusterDataset.csv"
gene_feats = pd.read_csv(dataset_csv)
gene_feats.head()

Unnamed: 0,Name,ENSG00000244734.3,ENSG00000210082.2,ENSG00000198804.2,ENSG00000198712.1,ENSG00000198938.2,ENSG00000188536.12,ENSG00000198899.2,ENSG00000198886.2,ENSG00000275896.5,...,ENSG00000176749.8,ENSG00000241973.10,ENSG00000118689.14,ENSG00000167680.15,ENSG00000148053.15,ENSG00000134291.11,ENSG00000183578.6,ENSG00000164091.11,ENSG00000173418.11,tissue
0,GTEX-1117F-0226-SM-5GZZ7,452.7,6310.0,10790.0,11720.0,19890.0,102.4,13880.0,12400.0,0.0,...,2.029,31.85,62.17,24.53,18.29,17.72,45.8,76.38,61.47,Adipose Tissue
1,GTEX-1117F-0426-SM-5EGHI,225.7,10200.0,33610.0,37500.0,62560.0,52.01,51690.0,34030.0,46.66,...,1.664,14.18,110.9,9.736,0.2841,8.708,0.7015,44.89,80.3,Muscle
2,GTEX-1117F-0526-SM-5EGHJ,269.6,6111.0,9689.0,12250.0,19450.0,62.95,16270.0,13820.0,159.2,...,1.641,30.01,73.15,33.44,16.64,33.61,77.17,62.28,46.12,Blood Vessel
3,GTEX-1117F-0626-SM-5N9CS,5272.0,11990.0,4269.0,10630.0,16440.0,1323.0,16350.0,11990.0,3.747,...,1.895,39.19,60.41,29.95,21.37,58.98,38.11,74.41,51.21,Blood Vessel
4,GTEX-1117F-0726-SM-5GIEN,1617.0,40440.0,56700.0,39220.0,64660.0,403.5,82550.0,69350.0,9.192,...,0.6289,15.62,20.16,14.0,4.511,23.49,5.675,31.22,22.02,Heart


### Remove sample name and label columns. Normalize data using standard scaler

In [35]:
# Tissue label of every sample as a plain Python list (the next cell repeats this assignment).
labels = gene_feats["tissue"].tolist()

In [None]:
# Tissue label of every sample; used later as the grouping variable for ANOVA.
labels = gene_feats["tissue"].to_list()

# Pick the expression columns by name rather than by position. The loaded frame
# carries a leading "Unnamed: 0" column (visible in the head() output), so the
# positional slice [1:-1] would keep the string-valued "Name" column and the
# scaler would fail on it. Absent columns are ignored, so this also works if
# the CSV is read without that extra column.
non_gene_cols = ["Unnamed: 0", "Name", "tissue"]
gene_names = gene_feats.columns.drop(non_gene_cols, errors="ignore")

# Convert the gene columns to a numpy array and standardize each gene
# (zero mean, unit variance across samples).
gene_array = gene_feats[gene_names].to_numpy()
norm_genes = StandardScaler().fit_transform(gene_array)

### Based on ClusterAssignment 

There are no particularly strong clusters from internal metrics

Use 10 clusters, since we know there are 10 tissues, and evaluate whether any genes are driving the clusters.

In [None]:
# Fit K-means with 10 clusters on the standardized expression matrix and
# keep the cluster index assigned to each sample.
km = KMeans(
    n_clusters=10,
    n_init="auto",
    random_state=0,
    algorithm="elkan",
)
labels_km = km.fit_predict(norm_genes)

### Use one-way ANOVA for each gene across the 10 K-means clusters

One-way ANOVA was chosen because it separates between-group from within-group variation. There are many groups (either the K-means clusters or the tissue categories) and gene expression values are continuous variables.



If a gene has a large F value, then it varies a lot across clusters. 

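Sorting by F value is equivalent to sorting by p-value here: every gene is tested against the same grouping (same degrees of freedom), so a larger F always corresponds to a smaller p-value.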

A large F value would therefore suggest the gene is a driver of cluster separation.

In [19]:
# One-way ANOVA per gene, grouping samples by their K-means cluster.
f_vals, p_vals = f_classif(norm_genes, labels_km)

# Label each F statistic with its gene ID so the ranking is readable.
f_series = pd.Series(data=f_vals, index=gene_names)

# Keep the genes with the largest F statistics.
top_n = 20
ranked_by_f = f_series.sort_values(ascending=False)
top_genes_clusters = ranked_by_f.head(top_n)
top_genes_clusters

ENSG00000244734.3     5.778444e+08
ENSG00000210082.2     1.473324e+08
ENSG00000198804.2     8.912685e+07
ENSG00000198712.1     7.237725e+07
ENSG00000198938.2     6.118706e+07
ENSG00000198899.2     5.953502e+07
ENSG00000188536.12    5.532686e+07
ENSG00000198886.2     5.136573e+07
ENSG00000171401.14    3.591510e+07
ENSG00000186395.7     2.897677e+07
ENSG00000198888.2     2.856604e+07
ENSG00000163220.10    2.704641e+07
ENSG00000275896.5     2.626163e+07
ENSG00000198763.3     2.420453e+07
ENSG00000198727.2     2.117964e+07
ENSG00000259384.6     1.921000e+07
ENSG00000204983.13    1.852833e+07
ENSG00000211459.2     1.770776e+07
ENSG00000143632.14    1.703461e+07
ENSG00000228253.1     1.607809e+07
dtype: float64

### Get Gene descriptions for interpretation of findings

In [None]:
# Download data here: https://gtexportal.org/home/downloads/adult-gtex/bulk_tissue_expression
# Only the gene ID ("Name") and its "Description" are used to annotate the top
# genes, so restrict the read to those two columns instead of loading every
# sample column of the expression file into memory.
Gene = pd.read_csv(
    "GTEx_Analysis_20170605_v8_RNASeQCv1.1.9_gene_tpm.gct",
    sep="\t",
    skiprows=2,  # the column header row is the third line of the file
    usecols=["Name", "Description"],
    engine="c",
)

In [27]:
# Lookup table: gene ID ("Name") -> gene description.
get_description = {
    gene_id: description
    for gene_id, description in zip(Gene["Name"], Gene["Description"])
}

In [30]:
# Map each top-ranked gene ID to its description, preserving the ranking order.
t20genes = {
    gene_id: get_description[gene_id]
    for gene_id in top_genes_clusters.index
}
t20genes

{'ENSG00000244734.3': 'HBB',
 'ENSG00000210082.2': 'MT-RNR2',
 'ENSG00000198804.2': 'MT-CO1',
 'ENSG00000198712.1': 'MT-CO2',
 'ENSG00000198938.2': 'MT-CO3',
 'ENSG00000198899.2': 'MT-ATP6',
 'ENSG00000188536.12': 'HBA2',
 'ENSG00000198886.2': 'MT-ND4',
 'ENSG00000171401.14': 'KRT13',
 'ENSG00000186395.7': 'KRT10',
 'ENSG00000198888.2': 'MT-ND1',
 'ENSG00000163220.10': 'S100A9',
 'ENSG00000275896.5': 'PRSS2',
 'ENSG00000198763.3': 'MT-ND2',
 'ENSG00000198727.2': 'MT-CYB',
 'ENSG00000259384.6': 'GH1',
 'ENSG00000204983.13': 'PRSS1',
 'ENSG00000211459.2': 'MT-RNR1',
 'ENSG00000143632.14': 'ACTA1',
 'ENSG00000228253.1': 'MT-ATP8'}

### Interpretation

Top gene is hemoglobin beta which makes sense especially since this likely clusters blood tissue.

Hemoglobin alpha 2 is also in the top genes, likely for a similar reason.

The majority of top genes are mitochondrial related which may be related to differences in metabolic demand between tissues. For example, red blood cells do not have mitochondria so lacking this expression could drive that group. 

Another possible explanation is that this simply reflects differences in how stressed or degraded each sample was when it was processed. I would expect more degraded samples to have a higher fraction of mitochondrial reads.


### Use one-way ANOVA for each gene across tissue labels

In [32]:
# One-way ANOVA per gene, this time grouping samples by their known tissue label.
# NOTE: this reuses (and overwrites) f_vals, p_vals, f_series and
# top_genes_clusters from the K-means based ranking.
f_vals, p_vals = f_classif(norm_genes, labels)

# Label each F statistic with its gene ID so the ranking is readable.
f_series = pd.Series(data=f_vals, index=gene_names)

# Keep the genes with the largest F statistics.
top_n = 20
ranked_by_f = f_series.sort_values(ascending=False)
top_genes_clusters = ranked_by_f.head(top_n)
top_genes_clusters

ENSG00000167476.10    8154.952866
ENSG00000138347.15    6755.844628
ENSG00000169862.18    6404.067451
ENSG00000079393.20    5758.511665
ENSG00000058056.8     5641.304562
ENSG00000178104.19    5529.093124
ENSG00000104369.4     5303.005845
ENSG00000154358.20    5281.136270
ENSG00000143028.8     5276.027456
ENSG00000133454.15    5081.811567
ENSG00000133315.10    4977.701651
ENSG00000164776.9     4925.772189
ENSG00000163126.14    4786.690601
ENSG00000229444.1     4774.072717
ENSG00000132692.18    4711.212088
ENSG00000174939.10    4438.099903
ENSG00000143164.15    4391.094053
ENSG00000087258.14    4290.132432
ENSG00000163380.15    4090.750569
ENSG00000081248.10    4075.215437
dtype: float64

In [33]:
# Map each top-ranked gene ID (tissue-label ANOVA) to its description.
t20genes_tissue = {
    gene_id: get_description[gene_id]
    for gene_id in top_genes_clusters.index
}
t20genes_tissue

{'ENSG00000167476.10': 'JSRP1',
 'ENSG00000138347.15': 'MYPN',
 'ENSG00000169862.18': 'CTNND2',
 'ENSG00000079393.20': 'DUSP13',
 'ENSG00000058056.8': 'USP13',
 'ENSG00000178104.19': 'PDE4DIP',
 'ENSG00000104369.4': 'JPH1',
 'ENSG00000154358.20': 'OBSCN',
 'ENSG00000143028.8': 'SYPL2',
 'ENSG00000133454.15': 'MYO18B',
 'ENSG00000133315.10': 'MACROD1',
 'ENSG00000164776.9': 'PHKG1',
 'ENSG00000163126.14': 'ANKRD23',
 'ENSG00000229444.1': 'RP11-184I16.4',
 'ENSG00000132692.18': 'BCAN',
 'ENSG00000174939.10': 'ASPHD1',
 'ENSG00000143164.15': 'DCAF6',
 'ENSG00000087258.14': 'GNAO1',
 'ENSG00000163380.15': 'LMOD3',
 'ENSG00000081248.10': 'CACNA1S'}

### Interpretation

Spot-checking on GeneCards, these are generally genes associated either with skeletal muscle, such as JSRP1 (sarcoplasmic reticulum in skeletal muscle) and MYPN (Z-disc/I-band sarcomeric protein), or with neurons, such as CTNND2 (synaptic protein in neurons) and BCAN (component of the CNS extracellular matrix).