In [281]:
# set the directory to where the data is
import os

os.chdir(r"D:\Gene_Project")

In [282]:
# pandas for dealing with the data
import pandas as pd
# setting for seeing the entire string
pd.options.display.max_colwidth = None
pd.set_option('display.max_rows', 500)

In [283]:
# load the data - new data that was provided
data = pd.read_csv(r"pul_seq_low_high_substr_year_corrected.tsv", sep = "\t").dropna().sample(frac = 1.0)

In [284]:
data.head()

Unnamed: 0,PULid,sig_gene_seq,low_level_substr,high_level_substr,Pub_year
337,PUL0340,"HTH_psq,GH5_2,3.A.3,HMA,Aminotran_1_2,HTH_3,Trans_reg_C,HisKA|HATPase_c,MerR,Sigma70_r4_2,HTH_3,HTH_3,HTH_3","carboxymethylcellulose,xylan,beta-glucan,lichenan",multiple_substrates,2017
91,PUL0092,"GH20,GH2,GH20,GH33,8.A.46,1.B.14","O-glycan,N-glycan",multiple_substrates,2019
79,PUL0080,"CE4,GT4,GT2","capsule polysaccharide,outer core capsule polysaccharide",multiple_substrates,2019
39,PUL0040,"GH9,GH9|CBM3,GH5_17,GH9,PL11,GH5_1",cellulose,cellulose,200319922002
515,PUL0518,"Fer4,GT2,GT2,GT4,GT4,9.B.146",capsule polysaccharide,capsule polysaccharide,2010


In [285]:
from sklearn.model_selection import train_test_split

In [286]:
# get the multiple substrates data
multiple_substrates = data[data["high_level_substr"] == "multiple_substrates"].reset_index(drop = True)

In [287]:
multiple_substrates.head()

Unnamed: 0,PULid,sig_gene_seq,low_level_substr,high_level_substr,Pub_year
0,PUL0340,"HTH_psq,GH5_2,3.A.3,HMA,Aminotran_1_2,HTH_3,Trans_reg_C,HisKA|HATPase_c,MerR,Sigma70_r4_2,HTH_3,HTH_3,HTH_3","carboxymethylcellulose,xylan,beta-glucan,lichenan",multiple_substrates,2017
1,PUL0092,"GH20,GH2,GH20,GH33,8.A.46,1.B.14","O-glycan,N-glycan",multiple_substrates,2019
2,PUL0080,"CE4,GT4,GT2","capsule polysaccharide,outer core capsule polysaccharide",multiple_substrates,2019
3,PUL0442,"GH18|GH18,GH18|CBM6,GH16|CBM6,GerE,FecR,1.B.14","glucomannan,galactomannan,glucose",multiple_substrates,2017
4,PUL0438,"1.B.14,8.A.46,GH89,DUF24,4.D.3,GH158","glucomannan,glucose",multiple_substrates,2017


In [288]:
# get the low level frequencies
multiple_substrates_low_level = multiple_substrates["low_level_substr"]

In [289]:
multiple_substrates_low_level = [gene for seq in multiple_substrates_low_level_df for gene in str(seq).split(",")]

In [290]:
from collections import Counter

In [291]:
freq_counts = Counter(multiple_substrates_low_level)

In [292]:
freq_counts=dict(sorted(freq_counts.items(),key= lambda x:x[1], reverse = True))

In [293]:
freq_counts

{'O-glycan': 28,
 'N-glycan': 23,
 'xylan': 22,
 'beta-glucan': 17,
 'lichenan': 12,
 'glucomannan': 12,
 'sucrose': 11,
 'kestose': 11,
 'galactomannan': 10,
 'cellobiose': 9,
 'mucin': 8,
 'nystose': 8,
 'glucose': 7,
 'melibiose': 7,
 'host glycan': 7,
 'capsule polysaccharide': 6,
 'arabinan': 6,
 'carboxymethylcellulose': 6,
 'pectin': 6,
 'lactose': 6,
 'maltose': 4,
 'outer core capsule polysaccharide': 4,
 'raffinose': 4,
 'fructan': 4,
 'arabinoxylan': 4,
 'agar': 4,
 'levan': 4,
 'rhamnogalacturonan': 4,
 'inulin': 4,
 'cellulose': 3,
 'laminarin': 3,
 'alginate': 3,
 'fructooligosaccharide': 3,
 'exopolysaccharide': 3,
 'xylobiose': 3,
 'porphyran': 3,
 'starch': 3,
 'galactose': 2,
 'xylose': 2,
 'stachyose': 2,
 'ulvan': 2,
 'arabinose': 2,
 'maltotriose': 2,
 'lipopolysaccharide': 2,
 'maltodextrin': 2,
 'N-acetyl-D-galactosamine': 2,
 'xyloglucan': 2,
 'O-antigen': 2,
 'alpha-glucan': 2,
 'glucosamine': 2,
 'sophorose': 1,
 'laminaribiose': 1,
 'beta-glucoside': 1,
 'raf

In [294]:
keep_these = [gene for gene, freq in freq_counts.items() if freq >= 10]

In [295]:
catch = []

for low_level_class in multiple_substrates["low_level_substr"]: 
    split_list = low_level_class.split(",")
    split_list = [inner_class if inner_class in keep_these else 'others'  for inner_class in split_list]
#     new_low_level = ",".join(split_list)
    catch.append(split_list)

In [318]:
catch = [list(np.unique(list1)) for list1 in catch]

In [319]:
X_train, X_test, y_train, y_test = train_test_split(multiple_substrates["sig_gene_seq"],
                                                    catch, test_size=0.30, random_state=42)

In [320]:
one_hot = MultiLabelBinarizer()

In [321]:
one_hot.fit(y_train)

MultiLabelBinarizer()

In [322]:
one_hot.classes_

array(['N-glycan', 'O-glycan', 'beta-glucan', 'galactomannan',
       'glucomannan', 'kestose', 'lichenan', 'others', 'sucrose', 'xylan'],
      dtype=object)

In [323]:
len(one_hot.classes_)

10

In [324]:
labels_train = one_hot.transform(y_train)

In [325]:
labels_test = one_hot.transform(y_test)

In [326]:
from sklearn.feature_extraction.text import CountVectorizer

In [327]:
vectorizer = CountVectorizer(tokenizer=lambda x: str(x).replace("|", ",").split(','), lowercase = False)

In [328]:
vectorizer.fit(X_train.values)



CountVectorizer(lowercase=False,
                tokenizer=<function <lambda> at 0x0000019CC88B2670>)

In [329]:
X_train = vectorizer.transform(X_train.values)

In [330]:
X_test = vectorizer.transform(X_test.values)

In [331]:
X_train

<96x168 sparse matrix of type '<class 'numpy.int64'>'
	with 533 stored elements in Compressed Sparse Row format>

In [407]:
rf = RandomForestClassifier(n_jobs = 6)

In [384]:
# from imblearn.ensemble import BalancedRandomForestClassifier

In [377]:
# rf = BalancedRandomForestClassifier(n_jobs = 6)

In [408]:
rf.fit(X_train, labels_train)

RandomForestClassifier(n_jobs=6)

In [409]:
y_test_pred = rf.predict(X_test)

In [410]:
y_test

[['others'],
 ['others'],
 ['N-glycan', 'O-glycan'],
 ['others'],
 ['others'],
 ['others', 'xylan'],
 ['N-glycan', 'O-glycan'],
 ['others'],
 ['kestose', 'sucrose'],
 ['others'],
 ['beta-glucan', 'lichenan', 'xylan'],
 ['O-glycan', 'others'],
 ['kestose', 'others', 'sucrose'],
 ['others', 'xylan'],
 ['others', 'xylan'],
 ['N-glycan', 'others'],
 ['galactomannan', 'glucomannan', 'others'],
 ['others'],
 ['others'],
 ['others'],
 ['others'],
 ['glucomannan', 'others'],
 ['O-glycan', 'others'],
 ['beta-glucan', 'lichenan', 'xylan'],
 ['others', 'xylan'],
 ['others'],
 ['others'],
 ['N-glycan', 'O-glycan'],
 ['beta-glucan', 'lichenan', 'xylan'],
 ['N-glycan', 'O-glycan'],
 ['kestose', 'others', 'sucrose'],
 ['beta-glucan', 'lichenan', 'xylan'],
 ['others'],
 ['others'],
 ['others'],
 ['N-glycan', 'O-glycan'],
 ['others', 'xylan'],
 ['beta-glucan', 'lichenan', 'others', 'xylan'],
 ['N-glycan', 'O-glycan'],
 ['beta-glucan', 'others'],
 ['glucomannan', 'others'],
 ['O-glycan', 'others']]

In [411]:
y_test_pred_labels = [list(one_hot.classes_[list(np.where(labels ==1)[0])]) for labels in y_test_pred]

In [412]:
y_test_pred_labels

[['others'],
 ['others'],
 ['others'],
 ['others'],
 ['others'],
 ['others'],
 ['N-glycan', 'O-glycan'],
 ['others'],
 ['kestose', 'sucrose'],
 ['others'],
 ['beta-glucan', 'others'],
 ['others'],
 ['kestose', 'others', 'sucrose'],
 ['others', 'xylan'],
 ['others'],
 ['others'],
 ['others'],
 ['others'],
 ['others'],
 ['others'],
 ['others'],
 ['others'],
 ['others'],
 ['beta-glucan'],
 ['O-glycan', 'others'],
 ['others'],
 ['others'],
 ['N-glycan', 'O-glycan'],
 ['beta-glucan', 'others'],
 ['N-glycan', 'O-glycan'],
 ['others'],
 ['beta-glucan', 'glucomannan', 'others'],
 ['others'],
 ['others'],
 ['others'],
 ['N-glycan', 'O-glycan'],
 ['others'],
 ['others'],
 ['N-glycan', 'O-glycan'],
 ['others'],
 ['O-glycan', 'others'],
 ['O-glycan', 'others']]

In [413]:
accuracy_vectors = [len(set(n).intersection(y_test_pred_labels[i]))/len(n) for  i, n in enumerate(y_test)]

In [414]:
np.mean(accuracy_vectors)

0.7440476190476191

In [415]:
catch = []

for class_label in one_hot.classes_:
#     print(class_label)
    inner_actual = []
    inner_predicted = []
    for index in range(0, len(y_test)): 
        actual = class_label in y_test[index]
        pred = class_label in y_test_pred_labels[index]
        if actual == True: 
            inner_actual.append(actual)
            inner_predicted.append(pred)
    dict1 = {"Substrate": class_label, "accuracy": np.mean(np.array(inner_actual) == np.array(inner_predicted)), 
            "How_many": len(inner_actual)}
    catch.append(dict1)
#     print(np.mean(np.array(inner_actual) == np.array(inner_predicted)))
    
    

In [416]:
test_data_df = pd.DataFrame(catch)

In [417]:
test_data_df.sort_values("accuracy", ascending = False).reset_index(drop = True)

Unnamed: 0,Substrate,accuracy,How_many
0,others,1.0,31
1,N-glycan,0.714286,7
2,O-glycan,0.666667,9
3,beta-glucan,0.666667,6
4,kestose,0.666667,3
5,sucrose,0.666667,3
6,xylan,0.1,10
7,galactomannan,0.0,1
8,glucomannan,0.0,3
9,lichenan,0.0,5


In [148]:
# import tensorflow as tf

In [173]:
# input_layer = tf.keras.layers.Input(shape = (X_train.shape[1],))

In [151]:
# first_hidden = tf.keras.layers.Dense(48)

In [152]:
# first_hidden_output = first_hidden(input_layer)

In [174]:
# class_layer = tf.keras.layers.Dense(len(one_hot.classes_), activation = "sigmoid")

In [175]:
# class_output = class_layer(input_layer)

In [176]:
# model = tf.keras.models.Model(input_layer, class_output)

In [178]:
# model.summary()