In [None]:
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
import os

from plotnine import *

## Overview
* select 5'UTRs longer than 80 nt
* count reads aligned to these UTRs (pysam)
* plot utr reads -bcm vs utr reads + bcm
* select UTRs with increased number of reads upon addition of BCM (clustering?)
* compare selected UTRs with genes upregulated in the stationary phase as discovered by DESeq2
* compare selected UTRs with small RNA binding sites (pybedtools?)

### Sample table and barcodes

In [None]:
# Map every sample title to the barcodes of its replicate libraries.
samples = {
    's9':      ['ATCACG', 'ACAGTG'],
    's9+bcm':  ['CGATGT', 'GCCAAT'],
    's17':     ['TTAGGC', 'GATCAG'],
    's17+bcm': ['TGACCA', 'TAGCTT'],
    's19':     ['CAGATC', 'GGCTAC'],
    's19+bcm': ['ACTTGA', 'CTTGTA'],
}

# Flat list of all barcodes, in the order in which they appear in `samples`.
barcodes = [
    barcode
    for replicate_barcodes in samples.values()
    for barcode in replicate_barcodes
]

### Load counts for genes, calculate counts in UTRs longer than 80 nt

Gene counts were obtained using the `htseq` program against the standard NC_000913 .GFF file. The way I calculate reads in UTRs here is not strand-specific, so the numbers can be confounded if there is a transcript going in the opposite direction. We can solve this later if needed.

In [None]:
# Read the tab-separated counts table and display it as the cell output.
counts_path = '../../data/dfm.csv'
dfm = pd.read_csv(counts_path, sep='\t')
dfm

### Normalize counts for feature length, log-transform, and take means for replicates

Pseudo-counts (+1) are added during UTR reads counting to make sure we can log-transform the data.

In [None]:
id_vars = ['TSS','TU_name','coord_5','coord_3','gene', 'UTR_length']
value_vars = ['s9','s17','s19','s9+bcm','s17+bcm','s19+bcm']

dfn = dfm.copy()


def norm_orf(barcode, rec):
    """Return the `barcode` read count divided by the first-gene length.

    The length is taken as |first_gene_5 - first_gene_3|.

    `rec` may be a single row, in which case a Python float is returned, or
    the whole table, in which case a Series with one value per row is
    returned.
    """
    density = rec[barcode] / abs(rec['first_gene_5'] - rec['first_gene_3'])
    return density if isinstance(density, pd.Series) else float(density)


def norm_utr(barcode, rec):
    """Return the `utr_<barcode>` read count divided by `UTR_length`.

    `rec` may be a single row, in which case a Python float is returned, or
    the whole table, in which case a Series with one value per row is
    returned.
    """
    density = rec['utr_{0}'.format(barcode)] / rec['UTR_length']
    return density if isinstance(density, pd.Series) else float(density)


# Normalize the counts of every barcode by feature length. The helpers are
# called on the whole table so each column is computed in one vectorized
# operation instead of a Python-level call per row.
for barcode in barcodes:
    dfn['orf_{0}'.format(barcode)] = norm_orf(barcode, dfn)
    # The raw utr_<barcode> column is replaced by its length-normalized version.
    dfn['utr_{0}'.format(barcode)] = norm_utr(barcode, dfn)


df = dfn[id_vars].copy()
# Take means across replicates according to the samples dict, then
# log10-transform. Columns are added as orf_<sample>, utr_<sample> per sample.
# NOTE(review): log10 of a zero mean is -inf; confirm that the ORF counts
# cannot be zero (the pseudo-counts are described for UTR counts only).
for sample, bcs in samples.items():
    for feature in ('orf', 'utr'):
        replicate_cols = ['{0}_{1}'.format(feature, b) for b in bcs]
        df['{0}_{1}'.format(feature, sample)] = np.log10(dfn[replicate_cols].mean(axis=1))
df

### Plot wild type with vs without BCM

Two clusters are apparent. We are after the UTRs that are upregulated by the addition of BCM: the cloud of points in the left part of the plot along the y=0 line and, more generally, the points lying significantly above the y=x line.

BTW, the point size is the length of UTR. No (apparent) correlation here.

In [None]:
# UTR signal of s9 (x axis) against s9+bcm (y axis), with the y = x reference line.
# NOTE(review): the constant size=0.5 passed to geom_point appears to take
# precedence over the size='UTR_length' mapping - confirm whether the point
# size is meant to vary with UTR length.
utr_s9_vs_s9_bcm = (
    ggplot(df, aes(x='utr_s9', y='utr_s9+bcm', size='UTR_length'))
    + geom_point(size=0.5, alpha=0.1)
    + geom_abline(slope=1, intercept=0, size=.5, color='#586e75')
)
utr_s9_vs_s9_bcm

In [None]:
# UTR signal of s9 (x axis) against s19 (y axis), with the y = x reference line.
# NOTE(review): the constant size=0.5 passed to geom_point appears to take
# precedence over the size='UTR_length' mapping - confirm whether the point
# size is meant to vary with UTR length.
utr_s9_vs_s19 = (
    ggplot(df, aes(x='utr_s9', y='utr_s19', size='UTR_length'))
    + geom_point(size=0.5, alpha=0.1)
    + geom_abline(slope=1, intercept=0, size=0.5, color='#586e75')
)
utr_s9_vs_s19

### Clustering

Now we need a way to split the points the way we want. Let's try a bunch of clustering algorithms from `scikit-learn`.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import kneighbors_graph
from sklearn import cluster
from sklearn import mixture

# One fixed seed shared by the randomly initialized algorithms, so that a
# re-run of the notebook produces the same cluster labels and plot colors.
RANDOM_STATE = 0

# Feature matrix: the two UTR columns being compared, standardized per column.
X = df[['utr_s9', 'utr_s9+bcm']].to_numpy()
X = StandardScaler().fit_transform(X)

# Kernel bandwidth for MeanShift, estimated from the data.
bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
# Symmetrized 20-nearest-neighbour graph used as the connectivity constraint
# of the two agglomerative models.
connectivity = kneighbors_graph(X, n_neighbors=20)
connectivity = 0.05 * (connectivity + connectivity.T)

gmm = mixture.GaussianMixture(n_components=2, covariance_type='full', random_state=RANDOM_STATE)

ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=2, batch_size=200, random_state=RANDOM_STATE)
kmeans = cluster.KMeans(n_clusters=2, random_state=RANDOM_STATE)
ward = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward', connectivity=connectivity)
spectral = cluster.SpectralClustering(n_clusters=2, n_neighbors=20, eigen_solver='arpack',
                                      affinity='nearest_neighbors', random_state=RANDOM_STATE)
dbscan = cluster.DBSCAN(eps=.5)
affinity_propagation = cluster.AffinityPropagation(damping=.95, preference=-200)
# NOTE(review): newer scikit-learn releases rename the `affinity` keyword of
# AgglomerativeClustering to `metric` - confirm against the installed version.
average_linkage = cluster.AgglomerativeClustering(linkage='average', affinity='cityblock', n_clusters=2, connectivity=connectivity)

# Fit every algorithm on the same matrix and draw one labelled scatter plot each.
for name, alg in [
                    ('MiniBatchKMeans', two_means),
                    ('KMeans', kmeans),
                    ('AffinityPropagation', affinity_propagation),
                    ('MeanShift', ms),
                    ('GMM', gmm),
                    ('SpectralClustering', spectral),
                    ('Ward', ward),
                    ('AgglomerativeClustering', average_linkage),
                    ('DBSCAN', dbscan)
                ]:
    alg.fit(X)
    # Estimators without a `labels_` attribute after fitting are asked to
    # predict the labels instead.
    if hasattr(alg, 'labels_'):
        df['label'] = alg.labels_.astype(np.int32)
    else:
        df['label'] = alg.predict(X)

    # `df['label']` is overwritten on every iteration; after the loop it holds
    # the labels of the last algorithm in the list.
    p = ggplot(df, aes(x='utr_s9', y='utr_s9+bcm', color='label')) \
        + geom_point(size=0.5, alpha=0.5) \
        + ggtitle(name) \
        + geom_abline(slope=1, intercept=0, size=0.5, color='#586e75')
    print(p)

In [None]:
# Convert the whole table to a NumPy array. `DataFrame.as_matrix` was removed
# in pandas 1.0, and without a call it would only bind the method, not the data.
X = df.to_numpy()