In [None]:
!pip install biopython

from google.colab import drive
drive.mount("/content/drive")

from Bio import SeqIO
import operator
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the tab-separated human DNA dataset on the mounted Drive.
human_data_path = '/content/drive/My Drive/ncbi_dataset/dna_sequencing_project/data/human.txt'

# Parse the file into a DataFrame and leave it as the last expression so the
# notebook renders it.
data = pd.read_table(human_data_path)
data

Unnamed: 0,sequence,class
0,ATGCCCCAACTAAATACTACCGTATGGCCCACCATAATTACCCCCA...,4
1,ATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAG...,4
2,ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...,3
3,ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...,3
4,ATGCAACAGCATTTTGAATTTGAATACCAGACCAAAGTGGATGGTG...,3
...,...,...
4375,ATGGAAGATTTGGAGGAAACATTATTTGAAGAATTTGAAAACTATT...,0
4376,ATGCAGTCCTTTCGGGAGCAAAGCAGTTACCACGGAAACCAGCAAA...,6
4377,ATGCAGTCCTTTCGGGAGCAAAGCAGTTACCACGGAAACCAGCAAA...,6
4378,ATGGGGCACCTGGTTTGCTGTCTGTGTGGCAAGTGGGCCAGTTACC...,6


In [None]:
def string_to_arr(seq_string):
    """Convert a DNA sequence string into a NumPy array of single characters.

    The sequence is lower-cased and every character other than ``a``, ``c``,
    ``g`` or ``t`` is replaced by ``n``.

    Parameters
    ----------
    seq_string : str
        Raw nucleotide sequence in any letter case.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``<U1`` holding one character per position.
    """
    normalized = re.sub('[^acgt]', 'n', seq_string.lower())
    # Pin the dtype: without it an empty sequence produces a float64 array,
    # because NumPy cannot infer a string dtype from an empty list.
    return np.array(list(normalized), dtype='U1')



def Kmers(seq, size=6):
    """Return every overlapping k-mer of ``seq`` as a lower-case string.

    Parameters
    ----------
    seq : str
        Nucleotide sequence in any letter case.
    size : int, default 6
        Length of each k-mer; must be at least 1.

    Returns
    -------
    list of str
        The ``len(seq) - size + 1`` windows of width ``size`` in order, or an
        empty list when ``seq`` is shorter than ``size``.

    Raises
    ------
    ValueError
        If ``size`` is smaller than 1 (such windows would be empty or
        misaligned slices rather than k-mers).
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    # Lower-case once up front instead of once per window.
    lowered = seq.lower()
    return [lowered[start:start + size] for start in range(len(lowered) - size + 1)]

In [None]:
# Build the k-mer list straight from the 'sequence' column; a row-wise
# DataFrame.apply(axis=1) would construct a Series for every row just to read
# this one field.
data['words'] = data['sequence'].apply(Kmers)
# The raw sequence is not needed once it has been split into k-mer "words".
data = data.drop(columns='sequence')

In [None]:
# Display the frame after the k-mer conversion ('sequence' dropped, 'words' added).
data

Unnamed: 0,class,words
0,4,"[atgccc, tgcccc, gcccca, ccccaa, cccaac, ccaac..."
1,4,"[atgaac, tgaacg, gaacga, aacgaa, acgaaa, cgaaa..."
2,3,"[atgtgt, tgtgtg, gtgtgg, tgtggc, gtggca, tggca..."
3,3,"[atgtgt, tgtgtg, gtgtgg, tgtggc, gtggca, tggca..."
4,3,"[atgcaa, tgcaac, gcaaca, caacag, aacagc, acagc..."
...,...,...
4375,0,"[atggaa, tggaag, ggaaga, gaagat, aagatt, agatt..."
4376,6,"[atgcag, tgcagt, gcagtc, cagtcc, agtcct, gtcct..."
4377,6,"[atgcag, tgcagt, gcagtc, cagtcc, agtcct, gtcct..."
4378,6,"[atgggg, tggggc, ggggca, gggcac, ggcacc, gcacc..."


In [None]:
# CountVectorizer takes one string per document, so join each gene's k-mers
# into a single space-separated "sentence".
data_texts = [' '.join(kmers) for kmers in data['words']]


# Select the labels by column name rather than by position, so the result does
# not depend on the column order left behind by earlier cells.
y = data['class'].values

In [None]:
# Inspect the label array extracted from the frame.
y

array([4, 4, 3, ..., 6, 6, 6])

In [None]:
# Each feature is a run of exactly four consecutive k-mer "words".
word_ngram_span = (4, 4)
cv = CountVectorizer(ngram_range=word_ngram_span)
X = cv.fit_transform(data_texts)
X.shape  # (number of genes, number of distinct 4-word n-grams)

(4380, 232414)

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Multinomial Naive Bayes over the n-gram count features; alpha is the
# additive smoothing parameter.
classifier = MultinomialNB(alpha=0.01)
# fit() is the cell's last expression, so the notebook echoes the fitted estimator.
classifier.fit(X_train, y_train)

MultinomialNB(alpha=0.01)

In [None]:
# Report only the dimensions of the held-out matrix; printing the sparse matrix
# itself floods the cell output with raw (row, column) count entries.
print(X_test.shape)

# Predict a class label for every held-out sample.
predicted = classifier.predict(X_test)

def findclass(number):
    """Translate a numeric gene-family code into its name.

    The name is printed as well as returned. Codes outside 0-6 match nothing,
    so nothing is printed and ``None`` is returned.

    Parameters
    ----------
    number : int
        Class code produced by the classifier.

    Returns
    -------
    str or None
        The gene-family name for ``number``, or ``None`` for an unknown code.
    """
    family_names = (
        (0, "G protein coupled receptors"),
        (1, "Tyrosine kinase"),
        (2, "Tyrosine phosphatase"),
        (3, "Synthetase"),
        (4, "Synthase"),
        (5, "Ion channel"),
        (6, "Transcription factor"),
    )
    # Compare with == in ascending code order, so int-like values (e.g. NumPy
    # integers or whole floats) match exactly as they would in an if-chain.
    for code, out in family_names:
        if number == code:
            print(out)
            return out
    return None
    




# Class labels are nominal codes, so their arithmetic mean is meaningless (a
# mix of 0s and 6s would "average" to an unrelated class). Summarise the batch
# by its most frequent predicted label instead; for a single prediction this
# equals the prediction itself. Ties resolve to the smallest label because
# Series.mode() returns its modes sorted.
most_common_class = pd.Series(predicted).mode().iloc[0]
print(f'class is predicted to be {findclass(most_common_class)}')

print("Confusion matrix for predictions on human test DNA sequence\n")
# Rows are the true labels and columns the predicted labels.
print(pd.crosstab(pd.Series(y_test, name='Actual'), pd.Series(predicted, name='Predicted')))



def get_metrics(y_test, y_predicted):
    """Score predictions against the true labels.

    Precision, recall and F1 use ``average='weighted'``.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_predicted : array-like
        Labels predicted by the classifier.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``.
    """
    averaging = {'average': 'weighted'}
    return (
        accuracy_score(y_test, y_predicted),
        precision_score(y_test, y_predicted, **averaging),
        recall_score(y_test, y_predicted, **averaging),
        f1_score(y_test, y_predicted, **averaging),
    )
# Score the held-out predictions, keep the individual values under their own
# names, and print them to three decimal places.
scores = get_metrics(y_test, predicted)
accuracy, precision, recall, f1 = scores
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % scores)

  (0, 20674)	1
  (0, 16862)	1
  (0, 174595)	1
  (0, 8654)	1
  (0, 211302)	1
  (0, 210929)	1
  (0, 136593)	1
  (0, 211949)	1
  (0, 152089)	1
  (0, 225870)	1
  (0, 10690)	1
  (0, 171124)	1
  (0, 202495)	1
  (0, 34622)	1
  (0, 34465)	1
  (0, 196610)	1
  (0, 78840)	1
  (0, 36634)	1
  (0, 208524)	1
  (0, 76560)	1
  (0, 148135)	1
  (0, 172331)	1
  (0, 221518)	1
  (0, 189079)	1
  (0, 91388)	1
  :	:
  (875, 41690)	1
  (875, 163947)	1
  (875, 172401)	1
  (875, 97741)	1
  (875, 96816)	1
  (875, 152205)	1
  (875, 139843)	1
  (875, 91905)	1
  (875, 142779)	1
  (875, 103253)	1
  (875, 179907)	1
  (875, 76003)	1
  (875, 121506)	1
  (875, 171371)	1
  (875, 222914)	1
  (875, 194566)	1
  (875, 119958)	1
  (875, 115192)	1
  (875, 118708)	1
  (875, 8935)	1
  (875, 34975)	1
  (875, 138243)	1
  (875, 157017)	1
  (875, 158733)	1
  (875, 165786)	1
Synthase
class is predicted to be Synthase
Confusion matrix for predictions on human test DNA sequence

Predicted   0    1   2    3    4   5    6
Actual           

In [None]:
#prediction on new data