In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import networkx
import h5py
#import obonet
from Bio import SeqIO
import re
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import gc
import pickle
import psutil
import torch
import joblib
from transformers import T5EncoderModel, T5Tokenizer
#from tape import ProteinBertModel, UniRepModel, TAPETokenizer

Get the protein sequences from the FASTA files

In [None]:
# Parse the training FASTA once, collecting record IDs and sequences in parallel lists.
# NOTE: `id` shadows the Python builtin; the name is kept because the next cell reads it.
train_fasta_path = "/kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta"
id, seq = [], []

for record in SeqIO.parse(train_fasta_path, "fasta"):
    id.append(record.id)
    seq.append(str(record.seq))

print(len(id))

In [None]:
# Pair every protein ID with its sequence, then drop the source lists to free memory.
seq_df = pd.DataFrame({"id": id, "sequence": seq})
del id, seq
seq_df.head()

In [None]:
# Run a garbage-collection pass after deleting the raw id/sequence lists.
gc.collect()

In [None]:
# Order the proteins by ID and renumber the rows from 0 without keeping the old index.
seq_df = seq_df.sort_values("id").reset_index(drop=True)
seq_df.head()

**Extract other features**

In [None]:
# One dict of amino-acid fractions per sequence; the DataFrame turns each key into a column.
lst = [ProteinAnalysis(sequence).get_amino_acids_percent() for sequence in seq_df["sequence"]]
X = pd.DataFrame(lst)
X.head()

In [None]:
# The per-sequence dicts are no longer needed once X has been built from them.
del lst

In [None]:
# Load the tab-separated training annotations.
train_terms_path = "/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv"
train_terms = pd.read_table(train_terms_path)
train_terms.head()

In [None]:
# Select the most frequently occurring terms as the prediction targets.
num_label = 1500

# value_counts() orders terms from most to least frequent, so the head is the top-N.
freqCount = train_terms['term'].value_counts()
print(freqCount)
considered_one = freqCount.index[:num_label].tolist()

In [None]:
# Build the multi-label target matrix: one row per protein, one column per selected term.
train_size = len(seq_df)
Y = np.zeros((train_size, num_label))
train_protein = pd.Series(seq_df["id"])
# Restrict the annotations to the selected terms before scanning them per column.
train_terms_smaller = train_terms[train_terms["term"].isin(considered_one)]
for col in tqdm(range(Y.shape[1])):
    term_rows = train_terms_smaller['term'] == considered_one[col]
    annotated_ids = set(train_terms_smaller.loc[term_rows, 'EntryID'])
    # 1.0 where the protein carries this term, 0.0 otherwise.
    Y[:, col] = train_protein.isin(annotated_ids).astype(float)
Y

In [None]:
# Sanity check: expect (number of training proteins, num_label).
Y.shape

In [None]:
# Convert the feature DataFrame to a plain NumPy array for the classifiers.
# np.asarray returns an existing ndarray unchanged, so re-running this cell is safe;
# `X = X.values` raised AttributeError on a second run because X was already an array.
X = np.asarray(X)
X.shape

In [None]:
# The raw annotation table and the frequency counts are not needed after Y is built.
del train_terms, freqCount

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the proteins for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=101,
)

In [None]:
from sklearn.metrics import f1_score

**Binary Relevance Naive Bayes**

In [None]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

In [None]:
# Binary relevance wrapper around a Gaussian Naive Bayes base classifier.
br_gnb = BinaryRelevance(classifier=GaussianNB())

In [None]:
# Fit the binary-relevance model on the training split.
br_gnb.fit(X_train, Y_train)
# Author's timing notes: ~25 min to train when using embeddings
# ~1 min to train when using amino-acid occurrence features

In [None]:
# Predict hard labels for the held-out split; .toarray() is called on the result below.
pred_br_gnb = br_gnb.predict(X_test)
# Author's timing notes: ~19 min to predict when using embeddings
# ~1 min to predict when using amino-acid occurrence features

In [None]:
# Densify the predictions once, then report micro- and macro-averaged F1.
pred_br_gnb_dense = pred_br_gnb.toarray()
for averaging in ("micro", "macro"):
    print(f1_score(Y_test, pred_br_gnb_dense, average=averaging))
del pred_br_gnb_dense

**Label Powerset Naive Bayes**

In [None]:
from skmultilearn.problem_transform import LabelPowerset

In [None]:
# Label powerset wrapper around a Gaussian Naive Bayes base classifier.
lp_gnb = LabelPowerset(classifier=GaussianNB())

In [None]:
# Fit the label-powerset model on the same training split used for br_gnb.
lp_gnb.fit(X_train, Y_train)

In [None]:
# Predict labels for the held-out split with the label-powerset model.
pred_lp_gnb = lp_gnb.predict(X_test)

In [None]:
# Densify the predictions once, then report micro- and macro-averaged F1.
pred_lp_gnb_dense = pred_lp_gnb.toarray()
for averaging in ("micro", "macro"):
    print(f1_score(Y_test, pred_lp_gnb_dense, average=averaging))
del pred_lp_gnb_dense

In [None]:
# Training-side data is no longer needed once the models are fitted and scored.
del X, Y, X_train, Y_train, seq_df

**Predict on the test data**


In [None]:
# Parse the test FASTA, collecting record IDs and sequences in parallel lists.
test_fasta_path = "/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset.fasta"
test_id, test_seq = [], []

for record in SeqIO.parse(test_fasta_path, "fasta"):
    test_id.append(record.id)
    test_seq.append(str(record.seq))

print(len(test_id))

In [None]:
# Pair every test protein ID with its sequence, then drop the source lists.
test_df = pd.DataFrame({"id": test_id, "sequence": test_seq})
del test_id, test_seq
test_df.head()

In [None]:
# Order the test proteins by ID and renumber the rows from 0 without keeping the old index.
test_df = test_df.sort_values("id").reset_index(drop=True)
test_df.head()

In [None]:
# Keep the sorted test IDs as a plain list; test_df itself is deleted a few cells below.
id_lst = test_df["id"].tolist()

Build the test feature matrix, then delete objects that are no longer needed and write intermediate results to disk to avoid running out of RAM.

In [None]:
# Same amino-acid fraction features as for training, converted straight to a NumPy array.
test_lst = [ProteinAnalysis(sequence).get_amino_acids_percent() for sequence in test_df["sequence"]]
test = pd.DataFrame(test_lst).values

In [None]:
# Only the feature array `test` and the ID list `id_lst` are needed from here on.
del test_df, test_lst

In [None]:
# Sanity check on the test feature matrix; the row count drives the hard-coded chunking below.
test.shape

In [None]:
# Split the test matrix into chunks of 30000 rows so each prediction fits in memory.
test_sub1, test_sub2, test_sub3, test_sub4 = (
    test[start:start + 30000] for start in range(0, 120000, 30000)
)
# The final chunk takes whatever rows remain after the first 120000.
test_sub5 = test[120000:]
del test

In [None]:
# Score the first chunk of test proteins.
# predict_proba (not predict) is used so the stored values are per-term probabilities,
# consistent with chunks 3-5 and with the later `prob >= 0.6` filter; predict would
# store hard 0/1 labels, which would be written to the submission as probability 1.0.
prob_1 = br_gnb.predict_proba(test_sub1)
del test_sub1
# Persist the result to disk so it can be dropped from RAM until it is needed again.
with open('prob_1.pickle', 'wb') as f:
    pickle.dump(prob_1, f)
del prob_1

In [None]:
# Run a garbage-collection pass after dropping the first chunk and its predictions.
gc.collect()

In [None]:
# Score the second chunk of test proteins.
# predict_proba (not predict) is used so the stored values are per-term probabilities,
# consistent with chunks 3-5 and with the later `prob >= 0.6` filter; predict would
# store hard 0/1 labels, which would be written to the submission as probability 1.0.
prob_2 = br_gnb.predict_proba(test_sub2)
del test_sub2
# Persist the result to disk so it can be dropped from RAM until it is needed again.
with open('prob_2.pickle', 'wb') as f:
    pickle.dump(prob_2, f)
del prob_2

In [None]:
# Run a garbage-collection pass after dropping the second chunk and its predictions.
gc.collect()

In [None]:
# Score the third chunk of test proteins with per-term probabilities.
prob_3 = br_gnb.predict_proba(test_sub3)
del test_sub3
# Park the result on disk and release it from memory.
with open('prob_3.pickle', 'wb') as out_file:
    pickle.dump(prob_3, out_file)
del prob_3

In [None]:
# Run a garbage-collection pass after dropping the third chunk and its predictions.
gc.collect()

In [None]:
# Score the fourth chunk of test proteins with per-term probabilities.
prob_4 = br_gnb.predict_proba(test_sub4)
del test_sub4
# Park the result on disk and release it from memory.
with open('prob_4.pickle', 'wb') as out_file:
    pickle.dump(prob_4, out_file)
del prob_4

In [None]:
# Run a garbage-collection pass after dropping the fourth chunk and its predictions.
gc.collect()

In [None]:
# Score the final chunk of test proteins with per-term probabilities.
prob_5 = br_gnb.predict_proba(test_sub5)
del test_sub5
# Park the result on disk and release it from memory.
with open('prob_5.pickle', 'wb') as out_file:
    pickle.dump(prob_5, out_file)
del prob_5

In [None]:
# Run a garbage-collection pass after dropping the final chunk and its predictions.
gc.collect()

Now combine the predicted probabilities with their protein IDs and term labels.

In [None]:
# Start an empty frame with the three output columns: protein id, term, probability.
final_df = pd.DataFrame(columns = ["id", "terms", "prob"])

In [None]:
# Repeat each of the first 30000 test IDs 1500 times (once per term) so the rows line up
# with the row-major flattening of the chunk-1 prediction matrix.
final_df["id"] = [id_lst[i] for i in range(30000) for _ in range(1500)]

In [None]:
# Tile the list of selected terms once per protein in this chunk (30000 proteins).
final_df["terms"] = considered_one * 30000

In [None]:
# Reload the chunk-1 predictions that were written to disk earlier in this notebook.
with open('prob_1.pickle', 'rb') as in_file:
    prob_1 = pickle.load(in_file)
prob_1.shape

In [None]:
# Densify the sparse result and flatten it row by row (protein-major, term-minor).
prob_1 = prob_1.toarray().ravel()

In [None]:
# Attach the flattened scores; their order matches the id/terms columns built above.
final_df["prob"] = prob_1

In [None]:
# Drop the standalone reference; the values are now held in final_df["prob"].
del prob_1

In [None]:
# Keep only (protein, term) pairs scored at 0.6 or above to shrink the frame.
final_df = final_df[final_df["prob"] >= 0.6]

In [None]:
# Number of rows that survived the threshold for chunk 1.
final_df.shape

In [None]:
# Run a garbage-collection pass after filtering chunk 1.
gc.collect()

In [None]:
# Scratch frame for chunk 2, with the same columns as final_df.
temp_df = pd.DataFrame(columns = ["id", "terms", "prob"])

In [None]:
# Repeat each test ID in positions 30000-59999 1500 times (once per term).
temp_df["id"] = [id_lst[i] for i in range(30000, 60000) for _ in range(1500)]

In [None]:
# Tile the list of selected terms once per protein in this chunk (30000 proteins).
temp_df["terms"] = considered_one * 30000

In [None]:
# Reload the chunk-2 predictions that were written to disk earlier in this notebook.
with open('prob_2.pickle', 'rb') as in_file:
    prob_2 = pickle.load(in_file)

In [None]:
# Densify the sparse result and flatten it row by row (protein-major, term-minor).
prob_2 = prob_2.toarray().ravel()

In [None]:
# Attach the flattened scores; their order matches the id/terms columns built above.
temp_df["prob"] = prob_2

In [None]:
# Drop the standalone reference; the values are now held in temp_df["prob"].
del prob_2

In [None]:
# Keep only (protein, term) pairs scored at 0.6 or above, as for chunk 1.
temp_df = temp_df[temp_df["prob"] >= 0.6]

In [None]:
# Number of rows that survived the threshold for chunk 2.
temp_df.shape

In [None]:
# Append chunk 2 to the running result and renumber the rows from 0.
final_df = pd.concat([final_df, temp_df], ignore_index=True)
del temp_df

In [None]:
# Run a garbage-collection pass after merging chunk 2.
gc.collect()

In [None]:
# Scratch frame for chunk 3, with the same columns as final_df.
temp_df = pd.DataFrame(columns = ["id", "terms", "prob"])

In [None]:
# Repeat each test ID in positions 60000-89999 1500 times (once per term).
temp_df["id"] = [id_lst[i] for i in range(60000, 90000) for _ in range(1500)]

In [None]:
# Tile the list of selected terms once per protein in this chunk (30000 proteins).
temp_df["terms"] = considered_one * 30000

In [None]:
# Reload the chunk-3 predictions that were written to disk earlier in this notebook.
with open('prob_3.pickle', 'rb') as in_file:
    prob_3 = pickle.load(in_file)

In [None]:
# Densify the sparse result and flatten it row by row (protein-major, term-minor).
prob_3 = prob_3.toarray().ravel()

In [None]:
# Attach the flattened scores; their order matches the id/terms columns built above.
temp_df["prob"] = prob_3

In [None]:
# Drop the standalone reference; the values are now held in temp_df["prob"].
del prob_3

In [None]:
# Keep only (protein, term) pairs scored at 0.6 or above, as for the earlier chunks.
temp_df = temp_df[temp_df["prob"] >= 0.6]

In [None]:
# Number of rows that survived the threshold for chunk 3.
temp_df.shape

In [None]:
# Append chunk 3 to the running result and renumber the rows from 0.
final_df = pd.concat([final_df, temp_df], ignore_index=True)
del temp_df

In [None]:
# Run a garbage-collection pass after merging chunk 3.
gc.collect()

In [None]:
# Scratch frame for chunk 4, with the same columns as final_df.
temp_df = pd.DataFrame(columns = ["id", "terms", "prob"])

In [None]:
# Repeat each test ID in positions 90000-119999 1500 times (once per term).
temp_df["id"] = [id_lst[i] for i in range(90000, 120000) for _ in range(1500)]

In [None]:
# Tile the list of selected terms once per protein in this chunk (30000 proteins).
temp_df["terms"] = considered_one * 30000

In [None]:
# Reload the chunk-4 predictions that were written to disk earlier in this notebook.
with open('prob_4.pickle', 'rb') as in_file:
    prob_4 = pickle.load(in_file)

In [None]:
# Densify the sparse result and flatten it row by row (protein-major, term-minor).
prob_4 = prob_4.toarray().ravel()

In [None]:
# Attach the flattened scores; their order matches the id/terms columns built above.
temp_df["prob"] = prob_4


In [None]:
# Drop the standalone reference; the values are now held in temp_df["prob"].
del prob_4

In [None]:
# Keep only (protein, term) pairs scored at 0.6 or above, as for the earlier chunks.
temp_df = temp_df[temp_df["prob"] >= 0.6]

In [None]:
# Number of rows that survived the threshold for chunk 4.
temp_df.shape

In [None]:
# Append chunk 4 to the running result and renumber the rows from 0.
final_df = pd.concat([final_df, temp_df], ignore_index=True)
del temp_df

In [None]:
# Run a garbage-collection pass after merging chunk 4.
gc.collect()

In [None]:
# Scratch frame for the final chunk, with the same columns as final_df.
temp_df = pd.DataFrame(columns = ["id", "terms", "prob"])

In [None]:
# Repeat each remaining test ID (positions 120000-141864) 1500 times (once per term).
# NOTE(review): 141865 is hard-coded; presumably the total test-set size - confirm it equals len(id_lst).
temp_df["id"] = [id_lst[i] for i in range(120000, 141865) for _ in range(1500)]

In [None]:
# Tile the list of selected terms once per protein in the final chunk (141865 - 120000 = 21865).
temp_df["terms"] = considered_one * 21865

In [None]:
# Reload the final-chunk predictions that were written to disk earlier in this notebook.
with open('prob_5.pickle', 'rb') as in_file:
    prob_5 = pickle.load(in_file)

In [None]:
# Densify the sparse result and flatten it row by row (protein-major, term-minor).
prob_5 = prob_5.toarray().ravel()

In [None]:
# Attach the flattened scores; their order matches the id/terms columns built above.
temp_df["prob"] = prob_5

In [None]:
# Drop the standalone reference; the values are now held in temp_df["prob"].
del prob_5

In [None]:
# Keep only (protein, term) pairs scored at 0.6 or above, as for the earlier chunks.
temp_df = temp_df[temp_df["prob"] >= 0.6]

In [None]:
# Number of rows that survived the threshold for the final chunk.
temp_df.shape

In [None]:
# Append the final chunk to the running result and renumber the rows from 0.
final_df = pd.concat([final_df, temp_df], ignore_index=True)
del temp_df

In [None]:
# Total number of (protein, term, probability) rows that will be written to the submission.
final_df.shape

In [None]:
# Write the submission as tab-separated rows of: protein id, term, probability.
# header=False: the competition's submission format is headerless, so the default
# header line ("id", "terms", "prob") would otherwise be read as a data row.
final_df.to_csv("/kaggle/working/submission.tsv", index = False, header = False, sep = "\t")