# PML Project


Update Date: 2022-01-08

Author: MAO Xiqing

Content: Task 2.1(baseline)

##2.1 The Baseline: A site-indep model 

### Get sequence

In [None]:
%%capture
!wget https://sid.erda.dk/share_redirect/a5PTfl88w0/BLAT_ECOLX_1_b0.5_labeled.fasta    
!wget https://sid.erda.dk/share_redirect/a5PTfl88w0/BLAT_ECOLX_Ranganathan2015.csv
!pip install biopython

import helper
import numpy as np
import torch
from torch.optim import Adam
from torch import nn
from tqdm import tqdm
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from sklearn.manifold import TSNE
import pandas as pd
from collections import Counter

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [40]:
seqs, labels, weights, phyla_lookup_table, phyla_idx, dataset=helper.get_baseline_data('BLAT_ECOLX_1_b0.5_labeled.fasta',calc_weights=True)
experimental_data = helper.read_experimental_data("BLAT_ECOLX_Ranganathan2015.csv", dataset)
dataloader_weighted = torch.utils.data.DataLoader(dataset, batch_size=16, sampler=torch.utils.data.sampler.WeightedRandomSampler(weights, num_samples=len(dataset)))

### Baseline

In [8]:
from collections import Counter
def my_log(data):
  if data > 0:
    result = np.log(data)
  else:
    result = 0
  return result

class base1ine(object):
    def __init__(self, pseudo_count = 1):
        self.pseudo_count = pseudo_count
        self.freqs = []

    def get_freqs(self, seq_data):
        for position in range(seq_data.shape[1]):
            freq_aa_in_position = {}
            aa_in_position = seq_data[:, position]
            count_aa = Counter(aa_in_position)
            num_aa = len(np.unique(aa_in_position))
            for i in range(23):
                freq_aa_in_position[i] = (count_aa[i] + self.pseudo_count) / (seq_data.shape[0] + num_aa * self.pseudo_count)
            self.freqs.append(freq_aa_in_position)

    def get_P_of_seqs(self, seq_data):   
        P_of_seqs = []
        for seq in seq_data:
            P_of_seq= 0
            for i, aa in enumerate(seq):
                P_of_seq += my_log(self.freqs[i][aa])
            P_of_seqs.append(P_of_seq)
            P_of_seqs = np.array(P_of_seqs)
        return P_of_seqs

In [9]:
baseline = base1ine(pseudo_count=1)

### Result

In [70]:
baseline.get_freqs(seqs)

In [63]:
import copy
raw_sequence = [seqs[0]]
log_P_of_wt = baseline.get_P_of_seqs(raw_sequence)
experiment_value = []
predicted_value = []
for (position, mutant_from), row in experimental_data.iterrows():
    assert helper.aa1_to_index[mutant_from] == raw_sequence[0][position]
    for mutant_to, exp_value in row.iteritems():
        if mutant_to != mutant_from:
            new_sequence = copy.deepcopy(raw_sequence)
            new_sequence[0][position] = helper.aa1_to_index[mutant_to]
            experiment_value.append(exp_value)
            log_P_of_mt = baseline.get_P_of_seqs(new_sequence)
            predicted_value.append(-(log_P_of_wt - log_P_of_mt))

In [64]:
from scipy.stats import spearmanr
spearmanr(experiment_value, predicted_value)

SpearmanrResult(correlation=0.6061777186788756, pvalue=0.0)

### Weighted result

In [74]:
weighed_seqs=[]
for i in dataloader_weighted:
  weighed_seqs.append(i[0][0].cpu().detach().numpy())
weighed_seqs = np.array(weighed_seqs)

In [81]:
baseline_weighted = base1ine(pseudo_count=1)
baseline_weighted.get_freqs(weighed_seqs)

In [84]:
import copy
raw_sequence = [seqs[0]]
log_P_of_wt = baseline_weighted.get_P_of_seqs(raw_sequence)
experiment_value = []
predicted_value = []
for (position, mutant_from), row in experimental_data.iterrows():
    assert helper.aa1_to_index[mutant_from] == raw_sequence[0][position]
    for mutant_to, exp_value in row.iteritems():
        if mutant_to != mutant_from:
            new_sequence = copy.deepcopy(raw_sequence)
            new_sequence[0][position] = helper.aa1_to_index[mutant_to]
            experiment_value.append(exp_value)
            log_P_of_mt = baseline_weighted.get_P_of_seqs(new_sequence)
            predicted_value.append(-(log_P_of_wt - log_P_of_mt))

In [85]:
from scipy.stats import spearmanr
spearmanr(experiment_value, predicted_value)

SpearmanrResult(correlation=0.5960609586606334, pvalue=0.0)