In [13]:
# set the directory to where the data is
import os

os.chdir(r"D:\Gene_Project")

In [14]:
# pandas for dealing with the data
import pandas as pd
# setting for seeing the entire string
pd.options.display.max_colwidth = None
pd.set_option('display.max_rows', 500)

In [15]:
# load the data - new data that was provided
data = pd.read_csv(r"pul_seq_low_high_substr_year_corrected.tsv", sep = "\t").dropna().sample(frac = 1.0)

In [16]:
data.head()

Unnamed: 0,PULid,sig_gene_seq,low_level_substr,high_level_substr,Pub_year
562,PUL0565,"2.A.2,GH130,GH36,GH26,GH26,1.B.14,HTH_AraC","galactomannan,glucomannan",multiple_substrates,20112017
409,PUL0412,"GH28,2.A.1",polygalacturonic acid,pectin,2003
358,PUL0361,"8.A.9,3.A.1,3.A.1,3.A.1,GH13_20,GH4","starch,maltodextrin",multiple_substrates,2000
422,PUL0425,"PfkB,2.A.1,GH32,GH91,GH91,1.B.14,GH32","fructan,inulin",multiple_substrates,2011
459,PUL0462,"GT2,GT4,GT2",O-antigen,O-antigen,2014


In [17]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# get the multiple substrates data
multiple_substrates = data[data["high_level_substr"] == "multiple_substrates"].reset_index(drop = True)

In [20]:
multiple_substrates.head()

Unnamed: 0,PULid,sig_gene_seq,low_level_substr,high_level_substr,Pub_year
0,PUL0565,"2.A.2,GH130,GH36,GH26,GH26,1.B.14,HTH_AraC","galactomannan,glucomannan",multiple_substrates,20112017
1,PUL0361,"8.A.9,3.A.1,3.A.1,3.A.1,GH13_20,GH4","starch,maltodextrin",multiple_substrates,2000
2,PUL0425,"PfkB,2.A.1,GH32,GH91,GH91,1.B.14,GH32","fructan,inulin",multiple_substrates,2011
3,PUL0097,"1.B.14,8.A.46,GH33,GH20,GH2,GH20,GH20,GH92,GH2,CE9,3.A.1","O-glycan,N-glycan",multiple_substrates,2019
4,PUL0102,"GH20,GH29,8.A.46,1.B.14,GH33,MarR,HTH_3,CBM67|GH78,HTH_AraC","O-glycan,N-glycan",multiple_substrates,2019


In [21]:
# get the low level frequencies
multiple_substrates_low_level = multiple_substrates["low_level_substr"]

In [22]:
multiple_substrates_low_level = [gene for seq in multiple_substrates_low_level for gene in str(seq).split(",")]

In [23]:
from collections import Counter

In [24]:
freq_counts = Counter(multiple_substrates_low_level)

In [25]:
freq_counts=dict(sorted(freq_counts.items(),key= lambda x:x[1], reverse = True))

In [26]:
freq_counts

{'O-glycan': 28,
 'N-glycan': 23,
 'xylan': 22,
 'beta-glucan': 17,
 'glucomannan': 12,
 'lichenan': 12,
 'sucrose': 11,
 'kestose': 11,
 'galactomannan': 10,
 'cellobiose': 9,
 'mucin': 8,
 'nystose': 8,
 'melibiose': 7,
 'glucose': 7,
 'host glycan': 7,
 'carboxymethylcellulose': 6,
 'lactose': 6,
 'arabinan': 6,
 'pectin': 6,
 'capsule polysaccharide': 5,
 'fructan': 4,
 'inulin': 4,
 'raffinose': 4,
 'arabinoxylan': 4,
 'levan': 4,
 'rhamnogalacturonan': 4,
 'maltose': 4,
 'agar': 4,
 'starch': 3,
 'alginate': 3,
 'outer core capsule polysaccharide': 3,
 'exopolysaccharide': 3,
 'xylobiose': 3,
 'cellulose': 3,
 'laminarin': 3,
 'fructooligosaccharide': 3,
 'porphyran': 3,
 'maltodextrin': 2,
 'ulvan': 2,
 'O-antigen': 2,
 'N-acetyl-D-galactosamine': 2,
 'lipopolysaccharide': 2,
 'glucosamine': 2,
 'galactose': 2,
 'maltotriose': 2,
 'xyloglucan': 2,
 'stachyose': 2,
 'alpha-glucan': 2,
 'xylose': 2,
 'arabinose': 2,
 'isomaltotriose': 1,
 'mannose': 1,
 'glycosaminoglycan': 1,
 'c

In [29]:
# get the class distributions
set(data["high_level_substr"].value_counts().keys()).intersection(freq_counts)

{'N-glycan',
 'O-antigen',
 'O-glycan',
 'alpha-glucan',
 'capsule polysaccharide',
 'cellulose',
 'chitin',
 'exopolysaccharide',
 'fructan',
 'glycosaminoglycan',
 'hemicellulose',
 'host glycan',
 'human milk oligosaccharide',
 'pectin',
 'plant polysaccharide',
 'xylan',
 'xyloglucan'}

In [None]:
keep_these = [gene for gene, freq in freq_counts.items() if freq >= 10]

In [None]:
catch = []

for low_level_class in multiple_substrates["low_level_substr"]: 
    split_list = low_level_class.split(",")
    split_list = [inner_class if inner_class in keep_these else 'others'  for inner_class in split_list]
#     new_low_level = ",".join(split_list)
    catch.append(split_list)

In [None]:
catch = [list(np.unique(list1)) for list1 in catch]

In [None]:
vectorizer = CountVectorizer(tokenizer=lambda x: str(x).replace("|", ",").split(','), lowercase = False)

In [319]:
X_train, X_test, y_train, y_test = train_test_split(multiple_substrates["sig_gene_seq"],
                                                    catch, test_size=0.30, random_state=42)

In [320]:
one_hot = MultiLabelBinarizer()

In [321]:
one_hot.fit(y_train)

MultiLabelBinarizer()

In [322]:
one_hot.classes_

array(['N-glycan', 'O-glycan', 'beta-glucan', 'galactomannan',
       'glucomannan', 'kestose', 'lichenan', 'others', 'sucrose', 'xylan'],
      dtype=object)

In [323]:
len(one_hot.classes_)

10

In [324]:
labels_train = one_hot.transform(y_train)

In [325]:
labels_test = one_hot.transform(y_test)

In [326]:
from sklearn.feature_extraction.text import CountVectorizer

In [327]:
vectorizer = CountVectorizer(tokenizer=lambda x: str(x).replace("|", ",").split(','), lowercase = False)

In [328]:
vectorizer.fit(X_train.values)



CountVectorizer(lowercase=False,
                tokenizer=<function <lambda> at 0x0000019CC88B2670>)

In [329]:
X_train = vectorizer.transform(X_train.values)

In [330]:
X_test = vectorizer.transform(X_test.values)

In [331]:
X_train

<96x168 sparse matrix of type '<class 'numpy.int64'>'
	with 533 stored elements in Compressed Sparse Row format>

In [407]:
rf = RandomForestClassifier(n_jobs = 6)

In [384]:
# from imblearn.ensemble import BalancedRandomForestClassifier

In [377]:
# rf = BalancedRandomForestClassifier(n_jobs = 6)

In [408]:
rf.fit(X_train, labels_train)

RandomForestClassifier(n_jobs=6)

In [409]:
y_test_pred = rf.predict(X_test)

In [410]:
y_test

[['others'],
 ['others'],
 ['N-glycan', 'O-glycan'],
 ['others'],
 ['others'],
 ['others', 'xylan'],
 ['N-glycan', 'O-glycan'],
 ['others'],
 ['kestose', 'sucrose'],
 ['others'],
 ['beta-glucan', 'lichenan', 'xylan'],
 ['O-glycan', 'others'],
 ['kestose', 'others', 'sucrose'],
 ['others', 'xylan'],
 ['others', 'xylan'],
 ['N-glycan', 'others'],
 ['galactomannan', 'glucomannan', 'others'],
 ['others'],
 ['others'],
 ['others'],
 ['others'],
 ['glucomannan', 'others'],
 ['O-glycan', 'others'],
 ['beta-glucan', 'lichenan', 'xylan'],
 ['others', 'xylan'],
 ['others'],
 ['others'],
 ['N-glycan', 'O-glycan'],
 ['beta-glucan', 'lichenan', 'xylan'],
 ['N-glycan', 'O-glycan'],
 ['kestose', 'others', 'sucrose'],
 ['beta-glucan', 'lichenan', 'xylan'],
 ['others'],
 ['others'],
 ['others'],
 ['N-glycan', 'O-glycan'],
 ['others', 'xylan'],
 ['beta-glucan', 'lichenan', 'others', 'xylan'],
 ['N-glycan', 'O-glycan'],
 ['beta-glucan', 'others'],
 ['glucomannan', 'others'],
 ['O-glycan', 'others']]

In [411]:
y_test_pred_labels = [list(one_hot.classes_[list(np.where(labels ==1)[0])]) for labels in y_test_pred]

In [412]:
y_test_pred_labels

[['others'],
 ['others'],
 ['others'],
 ['others'],
 ['others'],
 ['others'],
 ['N-glycan', 'O-glycan'],
 ['others'],
 ['kestose', 'sucrose'],
 ['others'],
 ['beta-glucan', 'others'],
 ['others'],
 ['kestose', 'others', 'sucrose'],
 ['others', 'xylan'],
 ['others'],
 ['others'],
 ['others'],
 ['others'],
 ['others'],
 ['others'],
 ['others'],
 ['others'],
 ['others'],
 ['beta-glucan'],
 ['O-glycan', 'others'],
 ['others'],
 ['others'],
 ['N-glycan', 'O-glycan'],
 ['beta-glucan', 'others'],
 ['N-glycan', 'O-glycan'],
 ['others'],
 ['beta-glucan', 'glucomannan', 'others'],
 ['others'],
 ['others'],
 ['others'],
 ['N-glycan', 'O-glycan'],
 ['others'],
 ['others'],
 ['N-glycan', 'O-glycan'],
 ['others'],
 ['O-glycan', 'others'],
 ['O-glycan', 'others']]

In [413]:
accuracy_vectors = [len(set(n).intersection(y_test_pred_labels[i]))/len(n) for  i, n in enumerate(y_test)]

In [414]:
np.mean(accuracy_vectors)

0.7440476190476191

In [415]:
catch = []

for class_label in one_hot.classes_:
#     print(class_label)
    inner_actual = []
    inner_predicted = []
    for index in range(0, len(y_test)): 
        actual = class_label in y_test[index]
        pred = class_label in y_test_pred_labels[index]
        if actual == True: 
            inner_actual.append(actual)
            inner_predicted.append(pred)
    dict1 = {"Substrate": class_label, "accuracy": np.mean(np.array(inner_actual) == np.array(inner_predicted)), 
            "How_many": len(inner_actual)}
    catch.append(dict1)
#     print(np.mean(np.array(inner_actual) == np.array(inner_predicted)))
    
    

In [416]:
test_data_df = pd.DataFrame(catch)

In [417]:
test_data_df.sort_values("accuracy", ascending = False).reset_index(drop = True)

Unnamed: 0,Substrate,accuracy,How_many
0,others,1.0,31
1,N-glycan,0.714286,7
2,O-glycan,0.666667,9
3,beta-glucan,0.666667,6
4,kestose,0.666667,3
5,sucrose,0.666667,3
6,xylan,0.1,10
7,galactomannan,0.0,1
8,glucomannan,0.0,3
9,lichenan,0.0,5
