In [1]:
import os
import urllib.request as request
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pycaret.classification import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer

In [2]:
# filename, headers = request.urlretrieve(
#                 url="https://github.com/krishnaik06/DNA-Sequencing-Classifier/raw/master/human_data.txt",
#                 filename="human_data.txt"
#             )

In [34]:
df = pd.read_table("artifacts/data_ingestion/human_data.txt")
df.head()

Unnamed: 0,sequence,class
0,ATGCCCCAACTAAATACTACCGTATGGCCCACCATAATTACCCCCA...,4
1,ATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAG...,4
2,ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...,3
3,ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...,3
4,ATGCAACAGCATTTTGAATTTGAATACCAGACCAAAGTGGATGGTG...,3


In [35]:
# function to convert sequence strings into k-mer words, default size = 6 (hexamer words)
def getKmers(sequence, size=6):
    return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]

In [36]:
df['kmers'] = df.apply(lambda x: getKmers(x['sequence']), axis=1)
df = df.drop('sequence', axis=1)
df.head()

Unnamed: 0,class,kmers
0,4,"[atgccc, tgcccc, gcccca, ccccaa, cccaac, ccaac..."
1,4,"[atgaac, tgaacg, gaacga, aacgaa, acgaaa, cgaaa..."
2,3,"[atgtgt, tgtgtg, gtgtgg, tgtggc, gtggca, tggca..."
3,3,"[atgtgt, tgtgtg, gtgtgg, tgtggc, gtggca, tggca..."
4,3,"[atgcaa, tgcaac, gcaaca, caacag, aacagc, acagc..."


In [37]:
X_data = list(df['kmers'])
for item in range(len(X_data)):
    X_data[item] = ' '.join(X_data[item])
    
y_data = df.iloc[:, 0].values

In [38]:
data = pd.DataFrame({'sequence': X_data, 'class': y_data})

In [44]:
type(data)

pandas.core.frame.DataFrame

In [43]:
nr = (5,5)
cv = CountVectorizer(ngram_range=nr)
X = cv.fit_transform(list(data['sequence']))

In [7]:
import scipy as sp

sp.sparse.save_npz('X_data.npz', X)

In [None]:
df['class'].value_counts().sort_index().plot.bar()

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_data, test_size = 0.20, random_state=42)

In [14]:
print(y_train[-5])

5


In [15]:
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None
)

In [16]:
clf.fit(X_train, y=y_train)

In [17]:
y_pred = clf.predict(X_test)

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# alpha=0.1
# classifier = MultinomialNB(alpha=0.1)
# classifier.fit(X_train, y_train)

clf = LogisticRegression()
clf.fit(X_train, y_train)

In [19]:
y_pred = clf.predict(X_test)

In [20]:
print(y_pred[2])
print(y_test[2])

4
4


In [21]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
print("Confusion matrix\n")
print(pd.crosstab(pd.Series(y_test_cv, name='Actual'), pd.Series(y_pred, name='Predicted')))
def get_metrics(y_test, y_predicted):
    accuracy = accuracy_score(y_test, y_predicted)
    precision = precision_score(y_test, y_predicted, average='weighted')
    recall = recall_score(y_test, y_predicted, average='weighted')
    f1 = f1_score(y_test, y_predicted, average='weighted')
    return accuracy, precision, recall, f1
accuracy, precision, recall, f1 = get_metrics(y_test_cv, y_pred)
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1))

Confusion matrix

Predicted   0    1   2    3    4   5    6
Actual                                   
0          86    0   0    0    0   0   16
1           0  100   0    0    0   0    6
2           0    0  73    0    0   0    5
3           0    0   0  117    0   0    8
4           0    0   0    0  134   0   15
5           0    0   0    0    0  45    6
6           0    0   0    0    0   0  265
accuracy = 0.936 
precision = 0.947 
recall = 0.936 
f1 = 0.937


In [6]:
from dnaseq.utils.common import *
from pathlib import Path
import yaml
from box import ConfigBox
from ensure import ensure_annotations
from box.exceptions import BoxValueError

In [31]:
with open(Path('params.yaml')) as f:
    p = yaml.full_load(f.read())
    p = ConfigBox(p)

In [34]:
p = read_yaml(Path('params.yaml'))

In [37]:
params1 = [d[1] for d in list(p.items())]

In [39]:
params1[0]['classifier'] = clf1
params1[1]['classifier'] = clf2

ConfigBox({'preprocessor__ngram_range': [(3, 3), (4, 4), (5, 5), (6, 6)], 'classifier__penalty': ['l2', 'elasticnet'], 'classifier__C': [0.1, 1, 10], 'classifier__solver': ['lbfgs', 'liblinear']})

In [5]:
train_data = pd.read_csv('artifacts/data_transformation/train.csv')
train_x = train_data.drop(['class'], axis=1).values.astype('U')
train_y = train_data['class']

test_data = pd.read_csv('artifacts/data_transformation/test.csv')
test_x = test_data.drop(['class'], axis=1).values.astype('U')
test_y = test_data['class']

# train_x = list(train_x['sequence'])
# train_y = list(train_y)

# nr = (5,5)
# cv = CountVectorizer(ngram_range=nr)
# train_x = cv.fit_transform(train_x['sequence'].values.astype('U'))

In [56]:
train_x

array([['aaaagg aaagga aaggag aggagg ggaggc gaggcg aggcgt ggcgtg gcgtgc cgtgca gtgcag tgcagc gcagcg cagcga agcgag gcgagc cgagca gagcat agcatg gcatga catgaa atgaac tgaacc gaaccg aaccgg accggc ccggct cggctg ggctgg gctggg ctgggc tgggct gggctg ggctgc gctgct ctgctg tgctgg gctggc ctggcc tggccc ggcccg gcccgc cccgca ccgcaa cgcaag gcaagg caagga aaggag aggagc ggagcc gagcct agcctg gcctgc cctgcc ctgcca tgccaa gccaag ccaagg caaggt aaggtc aggtca ggtcag gtcagg tcaggc caggca aggcag ggcagg gcaggg cagggg aggggc ggggca gggcag ggcagt gcagtg cagtgg agtggg gtgggc tgggca gggcag ggcaga gcagag cagagg agagga gaggag aggagt ggagtc gagtct agtctt gtcttg tcttgg cttgga ttggac tggact ggactt gacttg acttgg cttggg ttgggg tggggg gggggc ggggcc gggcca ggccac gccacc ccaccc caccct accctt cccttc ccttca cttcag ttcagg tcaggt caggtc aggtcc ggtccc gtccca tcccag cccagg ccaggc caggca aggcat ggcatc gcatcc catcca atccag tccagg ccaggc caggca aggcac ggcacc gcaccc cacccc acccca ccccag cccagc ccagca cagcag agcagc gcagcg cagcga agcgaa gcga

In [6]:
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [7]:
clf1 = LogisticRegression()
clf2 = MultinomialNB()
vec = CountVectorizer()

In [8]:
params1 = [ConfigBox({
    'preprocessor__ngram_range': [(3,3), (4,4)],
    'classifier__C': [0.1, 1],
    'classifier': [clf1]
}),
          ConfigBox({
    'preprocessor__ngram_range': [(3,3), (5,5)],
    'classifier__alpha': [0.1, 1.0],
    'classifier': [clf2]
})]

In [9]:
print(params1)

[ConfigBox({'preprocessor__ngram_range': [(3, 3), (4, 4)], 'classifier__C': [0.1, 1], 'classifier': [LogisticRegression()]}), ConfigBox({'preprocessor__ngram_range': [(3, 3), (5, 5)], 'classifier__alpha': [0.1, 1.0], 'classifier': [MultinomialNB()]})]


In [None]:
outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)

pipeline = Pipeline(
    [
        ('preprocessor', vec),
        ('classifier', clf1),
    ]
)

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=params1,
    scoring='accuracy',
    cv=outer_cv,
    n_jobs=1,
    verbose=3,
    return_train_score=True
)

grid.fit(train_x.ravel(), train_y.ravel())

params = grid.best_params_
model = params['classifier']

In [68]:
vec = grid.best_estimator_['preprocessor']
model = grid.best_estimator_['classifier']

In [88]:
best = grid.best_estimator_

In [89]:
best

In [69]:
train_x = vec.fit_transform(train_x.ravel())
test_x = vec.transform(test_x.ravel())

In [70]:
model.fit(train_x, train_y)

In [71]:
pred = model.predict(test_x)

In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score

# cm = confusion_matrix(test_y, pred)

In [73]:
cm

array([[117,   0,   0,   0,   1,   1,   3],
       [  0, 125,   0,   0,   0,   0,   2],
       [  0,   0,  86,   0,   0,   0,   0],
       [  0,   0,   0, 175,   4,   0,   0],
       [  1,   0,   0,   0, 176,   0,   3],
       [  0,   0,   0,   0,   0,  55,   0],
       [  1,   0,   0,   0,   7,   0, 338]])

In [65]:
accuracy_score(test_y, pred)

0.9698630136986301

In [74]:
cl = MultinomialNB()

In [2]:
import joblib

# joblib.dump(best, os.path.join('', 'model.joblib'))

In [3]:
estimator = joblib.load('model.joblib')

In [9]:
test_x = estimator['preprocessor'].transform(test_x.ravel())

In [10]:
pred = estimator['classifier'].predict(test_x)

In [18]:
cm = confusion_matrix(test_y, pred)
acc = accuracy_score(test_y, pred)

In [21]:
from dnaseq.utils.common import save_json
from pathlib import Path

In [27]:
with open('cm.txt', 'w') as f:
    f.write(str(cm))