# Multiclass classification
In this notebook we will make a multiclass classification, the starting paper is as follows: [A Deep Learning Approach for Viral DNA Sequence Classification using Genetic Algorithm](https://www.researchgate.net/publication/363276607_A_Deep_Learning_Approach_for_Viral_DNA_Sequence_Classification_using_Genetic_Algorithm)

We will propose different models for the task on a different dataset


#### Definition of common import and functions


In [1]:
from json import encoder

import pandas as pd

import keras

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

from tqdm import tqdm
from matplotlib import pyplot as plt

df_path = 'Dataset/MulticlassDatasets/'
epochs = 20

## Models

In [35]:
def get_model(dimension, num_classes):
    inputs = keras.layers.Input(shape=dimension)
    x = keras.layers.LSTM(1024)(inputs)
    x = keras.layers.Dropout(0.2)(x)
    x = keras.layers.Dense(256)(x)
    x = keras.layers.Dropout(0.2)(x)
    x = keras.layers.Dense(num_classes)(x)
    x = keras.layers.Activation('softmax')(x)
    model = keras.models.Model(inputs=inputs, outputs=x)
    return model
    

### Unbalanced data set loading

In [4]:
df = pd.read_parquet(df_path+'UnbalancedDataset.parquet')

df.head()

Unnamed: 0,Accession,Release Date,Species,Genus,Family,Molecule Type,Length,Sequence Type,Host,Collection Date,Sequence
78,NC_077680.1,2023-05-06T00:00:00Z,Tomato mottle leaf curl virus,Begomovirus,Geminiviridae,ssDNA(+/-),2630.0,RefSeq,Solanum lycopersicum,2008.0,ACCGGATGGCCGCGCGGGTTTTTTTGACCCGCTCCGTGATGTATTT...
81,NC_077711.1,2023-05-06T00:00:00Z,Sida micrantha mosaic virus,Begomovirus,Geminiviridae,ssDNA(+/-),2659.0,RefSeq,,,ACCGGATGGCCGCGCGATTTTCCCCCCCCTCACGTGGCGCTCTGGT...
82,NC_077712.1,2023-05-06T00:00:00Z,Sida micrantha mosaic virus,Begomovirus,Geminiviridae,ssDNA(+/-),2629.0,RefSeq,,,ACCGGATGGCCGCGCGATTTTCCCCCCAAAACGTGGCGCTCTGGTG...
83,NC_077719.1,2023-05-06T00:00:00Z,Cotton yellow mosaic virus,Begomovirus,Geminiviridae,ssDNA(+/-),2766.0,RefSeq,Gossypium raimondii,2014.0,ACCGGATGGCCGCGCGCCCGCTTTATGTGGTCCCCCCTTGTGGTCC...
84,NC_077720.1,2023-05-06T00:00:00Z,Cotton yellow mosaic virus,Begomovirus,Geminiviridae,ssDNA(+/-),2716.0,RefSeq,Gossypium raimondii,2014.0,ACCGGATGGCCGCGCGCCCCCTTTTATGTGGCCCACACACAGGATA...


In [5]:
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,Accession,Release Date,Species,Genus,Family,Molecule Type,Length,Sequence Type,Host,Collection Date,Sequence
0,NC_034547.1,2017-05-15T00:00:00Z,Bat associated cyclovirus 1,Cyclovirus,Circoviridae,ssDNA(+/-),1844.0,RefSeq,Chiroptera,2009-09,ATGGAGAACAAGACCATAAGACGCTTCATCTTCACCTGGAACAATT...
1,NC_008299.1,2006-08-25T00:00:00Z,Pedilanthus leaf curl virus,Begomovirus,Geminiviridae,ssDNA(+/-),2759.0,RefSeq,Solanum lycopersicum,,ACCGGATGGCCGCGCTTTTTTTATGGCCCCCACAGAGCACTAACTG...
2,NC_030460.1,2016-07-12T00:00:00Z,Circovirus-like genome DCCV-10,,Circoviridae,ssDNA(+/-),2320.0,RefSeq,,2010-01,CGGGGTATCTCACCCCAGGGTGTGCCAACACCCTGGCCGGCGTATT...
3,NC_077729.1,2023-05-06T00:00:00Z,Maize rough dwarf virus,Fijivirus,Spinareoviridae,dsRNA,3813.0,RefSeq,Zea mays,2015,AAGTTTTTTACCGGACCCTAAGGATTCACTCAAGATAAAGGACGAA...
4,NC_004100.1,2002-08-01T00:00:00Z,Macroptilium yellow mosaic Florida virus,Begomovirus,Geminiviridae,ssDNA(+/-),2605.0,RefSeq,,,ACCGGATGGCCGCGCCCCGCCCCCCCCTTTCCGTACTCTCGTCCCC...


In [6]:
labels = df['Family']
X = df['Sequence']
X

0       ATGGAGAACAAGACCATAAGACGCTTCATCTTCACCTGGAACAATT...
1       ACCGGATGGCCGCGCTTTTTTTATGGCCCCCACAGAGCACTAACTG...
2       CGGGGTATCTCACCCCAGGGTGTGCCAACACCCTGGCCGGCGTATT...
3       AAGTTTTTTACCGGACCCTAAGGATTCACTCAAGATAAAGGACGAA...
4       ACCGGATGGCCGCGCCCCGCCCCCCCCTTTCCGTACTCTCGTCCCC...
                              ...                        
1944    TCGACATGGCGTACTTGGCACTTCCCGTGCCCGAAGGACCCGACTC...
1945    ACCGGATGGCCGCCCGAAATTTCGTGGTGGTCCCCCCCTGTCGGCC...
1946    ACCGGATGGCCGCGCCCGAAAAAGCAGGTGGACCCCACAATGACCG...
1947    TTACACAAAGAACCCCTTGAATTATCAAAACATGTCCTTGTCTAAA...
1948    ACCGGATGGCCGCGCGATTTTTTTGGTGGGCCTTACCATTAACACT...
Name: Sequence, Length: 1949, dtype: object

I see the maximum length of the sequence so that I can make the dataset containing sequence and target

In [7]:
max_len = []
for el in X:
    max_len.append(len(el))
max_len = max(max_len)
max_len

3999

In [8]:
cols =[]

for col in range(max_len):
    cols.append('nucleotide_' + str(col))
cols

['nucleotide_0',
 'nucleotide_1',
 'nucleotide_2',
 'nucleotide_3',
 'nucleotide_4',
 'nucleotide_5',
 'nucleotide_6',
 'nucleotide_7',
 'nucleotide_8',
 'nucleotide_9',
 'nucleotide_10',
 'nucleotide_11',
 'nucleotide_12',
 'nucleotide_13',
 'nucleotide_14',
 'nucleotide_15',
 'nucleotide_16',
 'nucleotide_17',
 'nucleotide_18',
 'nucleotide_19',
 'nucleotide_20',
 'nucleotide_21',
 'nucleotide_22',
 'nucleotide_23',
 'nucleotide_24',
 'nucleotide_25',
 'nucleotide_26',
 'nucleotide_27',
 'nucleotide_28',
 'nucleotide_29',
 'nucleotide_30',
 'nucleotide_31',
 'nucleotide_32',
 'nucleotide_33',
 'nucleotide_34',
 'nucleotide_35',
 'nucleotide_36',
 'nucleotide_37',
 'nucleotide_38',
 'nucleotide_39',
 'nucleotide_40',
 'nucleotide_41',
 'nucleotide_42',
 'nucleotide_43',
 'nucleotide_44',
 'nucleotide_45',
 'nucleotide_46',
 'nucleotide_47',
 'nucleotide_48',
 'nucleotide_49',
 'nucleotide_50',
 'nucleotide_51',
 'nucleotide_52',
 'nucleotide_53',
 'nucleotide_54',
 'nucleotide_55',
 '

In [9]:
sequence_df = pd.DataFrame(columns = cols)

for row in tqdm(X):
    sequence = {}
    row = list(row)
    for index, i in enumerate(cols):
        if index < len(row):
            sequence[i] = row[index]
        else:
            sequence[i] = -1
            
    sequence_df.loc[len(sequence_df)] = sequence

sequence_df.head()

100%|██████████| 1949/1949 [05:49<00:00,  5.58it/s]


Unnamed: 0,nucleotide_0,nucleotide_1,nucleotide_2,nucleotide_3,nucleotide_4,nucleotide_5,nucleotide_6,nucleotide_7,nucleotide_8,nucleotide_9,...,nucleotide_3989,nucleotide_3990,nucleotide_3991,nucleotide_3992,nucleotide_3993,nucleotide_3994,nucleotide_3995,nucleotide_3996,nucleotide_3997,nucleotide_3998
0,A,T,G,G,A,G,A,A,C,A,...,,,,,,,,,,
1,A,C,C,G,G,A,T,G,G,C,...,,,,,,,,,,
2,C,G,G,G,G,T,A,T,C,T,...,,,,,,,,,,
3,A,A,G,T,T,T,T,T,T,A,...,,,,,,,,,,
4,A,C,C,G,G,A,T,G,G,C,...,,,,,,,,,,


## Convert to numerical and fit the model

In [None]:
n_class = len(labels.unique())
n_class

In [None]:
encoder_x = OrdinalEncoder()
encoder_x.set_params(encoded_missing_value=-1)

encoder_y = OrdinalEncoder()

X = sequence_df.to_numpy()
y = labels.to_numpy()

# Fit the encoder for the labels and the training set
encoder_x.fit(X)
encoder_y.fit(y)

# param for the model
batch_size = 32

# TODO VERIFY THIS SHAPE
dim = (X.shape[0],1,X.shape[1])
dimension = (X.shape[0],X.shape[1])

dim, dimension

In [55]:
model = get_model(dimension, num_classes=n_class-1)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [58]:
# Stratified K-Fold cross validation
skf = StratifiedKFold(n_splits=10, shuffle=True)

n_fold = 0
best_accuracy = 0
best_model = None
best_pred = None
gt_pred = None

for train_index, test_index in skf.split(X, y):
    # Get the train data and split it in train and val
    X_f, y_f = X[train_index],y[train_index]
    X_train, X_val, y_train, y_val = train_test_split(X_f, y_f, test_size=0.2, random_state=42)
    
    # encoding all the value in a numeric form
    X_train = encoder_x.transform(X_train)
    X_val = encoder_x.transform(X_val)
    X_test = encoder_x.transform(X[test_index])
    
    y_train = encoder_y.transform(y_train)
    y_val = encoder_y.transform(y_val)
    y_test = encoder_y.transform(y[test_index])
    
    # fit the model
    history = model.fit(X_train, y_train, epochs=epochs, validation_data=(X_val, y_val), verbose=1)
    
    y_pred = model.predict(X_test)
    y_pred = encoder_y.inverse_transform(y_pred)
    
    acc = accuracy_score(y[test_index], y_pred)
    if acc > best_accuracy:
        best_accuracy = acc
        best_model = model
        best_pred = y_pred
        gt_pred = y[test_index]
    
    # plot the fit metrics
    fig, ax = plt.subplots(figsize=(1, 2))
    # summarize history for accuracy
    ax[0].plot(history.history['acc'])
    ax[0].plot(history.history['val_acc'])
    ax[0].legend(['train', 'val'])
    ax[1].plot(history.history['loss'])
    ax[1].plot(history.history['val_loss'])
    ax[1].legend(['train', 'val'])
    fig.title(f'Fold number {n_fold}')
    n_fold += 1

ValueError: Unrecognized data type: x=tensor([[ 0.,  3.,  2.,  ..., -1., -1., -1.],
        [ 0.,  1.,  1.,  ..., -1., -1., -1.],
        [ 1.,  2.,  2.,  ..., -1., -1., -1.],
        ...,
        [ 0.,  1.,  1.,  ..., -1., -1., -1.],
        [ 0.,  1.,  1.,  ..., -1., -1., -1.],
        [ 0.,  0.,  2.,  ..., -1., -1., -1.]]) (of type <class 'torch.Tensor'>)

In [None]:
print(classification_report(y_true=gt_pred, y_pred=best_pred))

In [None]:
print(confusion_matrix(y_true=gt_pred, y_pred=best_pred))

## Explainability of the model