In [1]:
import pandas as pd

In [2]:
# set the directory to where the data is
import os

os.chdir(r"D:\Gene_Project")

In [3]:
unsupervised_data = pd.read_csv(r"all_unsupervised_genes.csv")

In [4]:
unsupervised_data.head()

Unnamed: 0,sequence
0,"1.A.72,MerR,GH23"
1,"3.A.1,CE4"
2,"3.A.23,9.A.5,MCPsignal,2.A.21,2.A.22,TetR_N,3...."
3,"CE4,8.A.5,3.A.1,3.A.1,3.A.1"
4,"GT51,Peripla_BP_2,3.A.1,3.A.1,9.B.169"


In [5]:
unsupervised_data.shape

(240622, 1)

In [6]:
# load the data - new data that was provided
supervised_data = pd.read_csv(r"pul_seq_low_high_substr_year_corrected.csv")

In [7]:
supervised_strings = supervised_data["sig_gene_seq"]

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
le = LabelEncoder()

In [10]:
supervised_data["high_level_substr"].value_counts()

multiple_substrates           139
mono/di/trisaccharide          78
capsule polysaccharide         60
algal glycans                  40
pectin                         38
xylan                          38
O-antigen                      37
galactan                       36
alpha-glucan                   23
beta-mannan                    19
cellulose                      18
chitin                         15
glycosaminoglycan              14
beta-glucan                    13
N-glycan                       12
exopolysaccharide              10
-                               9
fructan                         8
alpha-mannan                    8
host glycan                     7
xyloglucan                      7
plant polysaccharide            6
glycoprotein                    6
human milk oligosaccharide      5
hemicellulose                   5
O-glycan                        3
Name: high_level_substr, dtype: int64

In [11]:
to_keep = supervised_data["high_level_substr"].value_counts().keys()[:5].tolist()

In [12]:
supervised_data["high_level_substr"] = [label if label in to_keep else "others" for label in supervised_data["high_level_substr"]]

In [13]:
le.fit(supervised_data["high_level_substr"])

LabelEncoder()

In [14]:
supervised_data["high_level_substr"]  = le.transform(supervised_data["high_level_substr"])

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(supervised_data[["sig_gene_seq"]],
                                                    supervised_data[["high_level_substr"]], 
                                                   test_size=0.4, 
                                                   stratify = supervised_data["high_level_substr"], 
                                                   random_state = 42)

In [17]:
from sklearn.model_selection import PredefinedSplit

In [18]:
import numpy as np

In [19]:
unsup_data = unsupervised_data[["sequence"]].sample(frac = 1.0)

In [20]:
unsup_data.shape

(240622, 1)

In [21]:
X_train = np.vstack((X_train, unsup_data))

In [22]:
y_train = np.vstack((y_train, np.repeat(-1, len(unsup_data)).reshape(-1,1)))

In [23]:
X_train_test = np.vstack((X_train, X_test))

In [24]:
y_train_test = np.vstack((y_train, 
           y_test))

In [25]:
test_fold = np.hstack((np.repeat(-1, X_train.shape[0]), 
           np.repeat(0, X_test.shape[0])))

In [26]:
pds = PredefinedSplit(test_fold)

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
from sklearn.semi_supervised import SelfTrainingClassifier

In [31]:
# SelfTraining Pipeline
st_pipeline = Pipeline(
    [
        ("vect", CountVectorizer(tokenizer=lambda x: str(x).replace("|", ",").split(','),
                                              lowercase = False)),
        ("clf", SelfTrainingClassifier(RandomForestClassifier(n_jobs = 7), verbose=True, threshold = 0.95))
    ]
)

In [32]:
from sklearn.model_selection import cross_val_score

In [33]:
cross_val_score(st_pipeline, X_train_test, y_train_test, cv = pds, n_jobs = 7)[0]

0.6068702290076335

In [34]:
from sklearn.semi_supervised import LabelSpreading, LabelPropagation

In [35]:
from sklearn.preprocessing import FunctionTransformer

In [36]:
# SelfTraining Pipeline
st_pipeline = Pipeline(
    [
        ("vect", CountVectorizer(tokenizer=lambda x: str(x).replace("|", ",").split(','),
                                              lowercase = False)),
         ("todense", FunctionTransformer(lambda x: x.todense())),
        ("clf", LabelPropagation( kernel = "knn", n_jobs = 7))
    ]
)

In [None]:
cross_val_score(st_pipeline, X_train_test, y_train_test, cv = pds, n_jobs = 7)[0]