# Protein Family Classification

## Task
    
Use your ProtVec embedding from homework 5 to perform protein family classification with an RNN.

The article describing the original research can be found here: http://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0141287&type=printable

* use the 1000 most frequent families for classification
* validate your results on a train-test split
* reduce the dimensionality of the protein space using Stochastic Neighbor Embedding and visualize the two most frequent classes
* compare your RNN results with an SVM
* visualization and metrics are up to you

In [33]:
import os
import pickle
import numpy as np
import pandas as pd

from collections import Counter

## Data Reading

In [2]:
# Metadata and sequences are tab-separated tables stored under ../seminar05/data.
data_df = pd.read_csv('../seminar05/data/family_classification_metadata.tab', sep='\t')
seq_df = pd.read_csv('../seminar05/data/family_classification_sequences.tab', sep='\t')
# Read with no header and the default comma separator: each line of the embedding
# file ends up as one tab-joined string in column 0 (split later in create_codone2vec).
vec_df = pd.read_csv('protVec_100d_3grams.csv', header=None)

In [36]:
# Row counts of the two tables; create_data pairs them by row index, so they are expected to match.
len(data_df), len(seq_df)

(324018, 324018)

In [3]:
# Preview the first rows of the metadata table.
data_df.head()

Unnamed: 0,SwissProtAccessionID,LongID,ProteinName,FamilyID,FamilyDescription
0,Q6GZX4,001R_FRG3G,Putative transcription factor 001R,Pox_VLTF3,Poxvirus Late Transcription Factor VLTF3 like
1,Q6GZX3,002L_FRG3G,Uncharacterized protein 002L,DUF230,Poxvirus proteins of unknown function
2,Q6GZX0,005R_FRG3G,Uncharacterized protein 005R,US22,US22 like
3,Q91G88,006L_IIV6,Putative KilA-N domain-containing protein 006L,DUF3627,Protein of unknown function (DUF3627)
4,Q197F3,007R_IIV3,Uncharacterized protein 007R,DUF2738,Protein of unknown function (DUF2738)


In [4]:
# Per-column summary (count / unique / top / freq) of the metadata table.
data_df.describe()

Unnamed: 0,SwissProtAccessionID,LongID,ProteinName,FamilyID,FamilyDescription
count,324018,324018,324018,324018,324018
unique,287308,295671,56951,7027,6967
top,Q1X881,POLG_JAEVJ,UvrABC system protein B,MMR_HSR1,50S ribosome-binding GTPase
freq,16,12,1500,3084,3084


In [5]:
# Preview the first rows of the sequence table.
seq_df.head()

Unnamed: 0,Sequences
0,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...
1,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...
2,MQNPLPEVMSPEHDKRTTTPMSKEANKFIRELDKKPGDLAVVSDFV...
3,MDSLNEVCYEQIKGTFYKGLFGDFPLIVDKKTGCFNATKLCVLGGK...
4,MEAKNITIDNTTYNFFKFYNINQPLTNLKYLNSERLCFSNAVMGKI...


In [6]:
# Preview the raw embedding rows (one tab-joined string per row in column 0).
vec_df.head()

Unnamed: 0,0
0,AAA\t-0.17406\t-0.095756\t0.059515\t0.039673\t...
1,ALA\t-0.114085\t-0.093288\t0.1558\t-0.037351\t...
2,LLL\t-0.075594\t-0.100834\t-0.046616\t-0.20898...
3,LAA\t-0.137546\t-0.135425\t0.121566\t-0.038295...
4,AAL\t-0.156112\t-0.133524\t0.114426\t-0.020264...


## Data Processing

In [57]:
# Map each of the 1000 most frequent families to its frequency rank (0 = most frequent).
top1k_family2num = {
    family: rank
    for rank, (family, _count) in enumerate(Counter(data_df['FamilyID']).most_common(1000))
}
# Positional row indices of the proteins whose family is among the selected ones.
top1k_inds = np.array([
    position
    for position, family in enumerate(data_df['FamilyID'])
    if family in top1k_family2num
])
len(top1k_family2num), len(top1k_inds), list(top1k_family2num.items())[:4]

(1000,
 261149,
 [('MMR_HSR1', 0), ('Helicase_C', 1), ('ATP-synt_ab', 2), ('7tm_1', 3)])

In [59]:
def make_codones(sseq):
    """Split a sequence into consecutive, non-overlapping chunks of three items.

    Trailing items that do not fill a complete triplet are dropped.

    Args:
        sseq: Sliceable sequence (e.g. a string of amino-acid letters).

    Returns:
        list: The triplets in their original order; empty when ``sseq`` holds
        fewer than three items.
    """
    # Largest multiple of 3 that fits into the sequence.
    usable_length = len(sseq) - len(sseq) % 3
    return [sseq[start:start + 3] for start in range(0, usable_length, 3)]


def seq_to3(seq):
    """Build the three shifted triplet splittings of ``seq``.

    The sequence is split into triplets starting at offsets 0, 1 and 2.

    Args:
        seq: Sliceable sequence (e.g. a protein string).

    Returns:
        list: Three lists of triplets, one per starting offset, in offset order.
    """
    frames = []
    for offset in (0, 1, 2):
        frames.append(make_codones(seq[offset:]))
    return frames


def create_codone2vec():
    """Build a lookup from 3-gram to its embedding vector.

    Reads the module-level ``vec_df``; column 0 of every row is expected to be a
    tab-separated string whose first field is the 3-gram and whose remaining
    fields are the vector components.

    Returns:
        dict: Maps each 3-gram string to a list of floats.
    """
    codone2vec = {}
    for _, record in vec_df.iterrows():
        # First field is the key, everything after it is the numeric vector.
        codone, *components = record[0].split('\t')
        codone2vec[codone] = [float(component) for component in components]
    return codone2vec


def create_data():
    """Turn the selected protein sequences into embedded samples with labels.

    For every row of ``seq_df`` picked by ``top1k_inds``, each of the three
    splittings returned by ``seq_to3`` becomes one sample: an array holding the
    embedding of every triplet. Triplets missing from the embedding table fall
    back to the ``'<unk>'`` vector.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a list of numpy arrays (one per
        splitting) and ``y`` is a numpy array with the matching ``FamilyID``
        value from ``data_df`` for each sample.
    """
    codone2vec = create_codone2vec()

    def get_vec(x):
        # Unknown triplets share the dedicated '<unk>' embedding.
        if x not in codone2vec:
            return codone2vec['<unk>']
        return codone2vec[x]

    family_ids = data_df['FamilyID']
    samples = []
    labels = []
    selected_sequences = seq_df.iloc[top1k_inds]
    for row_label, record in selected_sequences.iterrows():
        # Each row is expected to hold exactly one value: the sequence string.
        (sequence,) = record
        family = family_ids[row_label]
        for triplets in seq_to3(sequence):
            samples.append(np.array([get_vec(triplet) for triplet in triplets]))
            labels.append(family)

    return samples, np.array(labels)

In [60]:
def read_or_create(read_path, producer):
    """Load a cached object from ``read_path`` or build it and cache it there.

    Args:
        read_path: Path of the pickle cache file (string or path-like).
        producer: Zero-argument callable that builds the object on a cache miss.

    Returns:
        The unpickled object if ``read_path`` exists, otherwise the freshly
        produced one (which is also written to ``read_path``).

    Raises:
        OSError: If the cache directory or file cannot be created or written.
    """
    if os.path.isfile(read_path):
        print('reading', read_path)
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # cache files this notebook wrote itself.
        with open(read_path, 'rb') as fp:
            return pickle.load(fp)

    # Create the target directory before the (potentially expensive) producer
    # runs, so a missing or unwritable directory fails fast instead of
    # discarding the computed result at save time.
    parent_dir = os.path.dirname(read_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    result = producer()

    print('saving', read_path)
    # Write to a temporary sibling and rename it into place: an interrupted
    # dump then never leaves a truncated file that a later run would try to load.
    tmp_path = os.fspath(read_path) + '.tmp'
    with open(tmp_path, 'wb') as fp:
        pickle.dump(result, fp)
    os.replace(tmp_path, read_path)
    return result

In [None]:
# Build the embedded samples once and reuse the pickled copy on later runs.
raw_X, raw_y = read_or_create(
    read_path='data/raw_data.pickle',
    producer=create_data,
)

## Model

In [None]:
class FamilyClassifierRNNModel:
    """Placeholder for the RNN family classifier; no behavior is implemented yet."""