# DNA Sequencing

### Load Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
# Load the tab-delimited dataset into a DataFrame and preview its first rows.
DATA_PATH = "human.txt"
df = pd.read_table(DATA_PATH)
df.head()

In [None]:
# Print a summary of the frame: columns, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

### Visualization

In [None]:
# Human-readable gene-family names; the integer class label is the list index.
gene_family = [
    'G protein coupled receptors',
    'Tyrosine kinase',
    'Tyrosine phosphatase',
    'Synthetase',
    'Synthase',
    'Ion channel',
    'Transcription factor',
]

# Number of samples per class, keyed by family name instead of numeric label.
class_totals = df['class'].value_counts().to_dict()
counts = {gene_family[label]: total for label, total in class_totals.items()}
print(counts)

In [None]:
# Bar chart of how many samples fall into each class.
# The column is passed via ``x=`` because seaborn's only positional parameter
# is ``data``; a positional Series is not interpreted as the x variable.
sns.countplot(x=df['class'], label='Count')

In [None]:
# value_counts() gives how often each distinct sequence occurs; the count plot
# then shows how many sequences share each occurrence count.
# Passed via ``x=`` because seaborn's only positional parameter is ``data``.
sns.countplot(x=df['sequence'].value_counts(), label='Count')

### Preprocessing

In [None]:
def k_mer_counting(sequence, size=6):
    """Return every overlapping window of ``size`` characters, lowercased.

    Windows start at each position from 0 up to ``len(sequence) - size``.
    A sequence shorter than ``size`` yields an empty list.
    """
    window_count = len(sequence) - size + 1
    k_mers = []
    for start in range(window_count):
        k_mers.append(sequence[start:start + size].lower())
    return k_mers

In [None]:
# Replace each raw sequence with its list of overlapping k-mers.
# Series.apply calls the function once per sequence value, avoiding the
# per-row object that a row-wise DataFrame.apply(axis=1) constructs, and it
# also works when the frame has no rows.
df['k_mer_sequence'] = df['sequence'].apply(k_mer_counting)
# The raw string column is no longer needed once the k-mers exist.
df = df.drop(columns=['sequence'])

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Join every k-mer list into one space-separated string per sample, so that a
# text vectorizer can treat each k-mer as a word.
X = np.array([' '.join(k_mers) for k_mers in df['k_mer_sequence'].values])

# Target labels: the class column of each sample.
y = df['class']

In [None]:
# Count-based text features: ngram_range=(4,4) keeps only n-grams of exactly
# four consecutive tokens, so each feature is a run of four adjacent words
# from the space-joined strings in X.
# NOTE(review): the vocabulary is fitted on the full dataset, before the
# train/test split — consider fitting on the training portion only.
cv = CountVectorizer(ngram_range=(4,4))
# X is rebound to the count matrix returned by the vectorizer.
X = cv.fit_transform(X)

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified by class — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

### Model Training and Evaluation

In [None]:
# Candidate classifiers; each is trained and evaluated on the same split.
models = [
    KNeighborsClassifier(),
    # Seeded so the forest's results are repeatable, like the seeded split.
    RandomForestClassifier(random_state=42),
    # With the default lbfgs solver a multinomial model is already fitted for
    # multiclass targets; the explicit ``multi_class`` argument is deprecated
    # in recent scikit-learn releases, so it is omitted.
    LogisticRegression(),
    MultinomialNB(alpha=0.1),
]

for m in models:
    # Fit on the training portion and predict the held-out samples.
    m.fit(X_train, y_train)
    name = m.__class__.__name__
    y_pred = m.predict(X_test)

    # Overall accuracy followed by the per-class precision/recall/F1 report.
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy of {name}: {acc}")
    print(classification_report(y_test, y_pred))

    # Confusion matrix as an annotated heatmap (rows: actual, columns: predicted).
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(14, 10))
    # ``linewidths`` is seaborn's documented parameter for the cell borders;
    # ``fmt='d'`` renders the integer counts without decimals.
    sns.heatmap(cm, cmap='Blues', linecolor='black', linewidths=1, annot=True, fmt='d')
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

In [None]:
# Credits: a bare string literal used as a note naming the project that
# inspired this notebook.
'''
Inspiration
1. https://github.com/krishnaik06/DNA-Sequencing-Classifier
'''