# Problem Description

1-Data

The data consists of labelled amino acid sequences. Each record has a unique ID, the amino acid sequence, the organism it came from, and the label. You must predict the label for the test set. Each label is one of 20 classes. There are ten organisms: 8 in the training set and 2 in the test set. Sequences above a set length have been excluded from this dataset.

2-Objective

Create a model that classifies amino acid sequences.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time, logging, gc ,os

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from prettytable import PrettyTable
from IPython.display import Image
from sklearn.preprocessing import LabelEncoder
from keras.regularizers import l2
from keras.constraints import max_norm
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, Dropout, Flatten, Activation
from keras.layers import Conv1D, Add, MaxPooling1D, BatchNormalization
from keras.layers import Embedding, Bidirectional, GlobalMaxPooling1D,LSTM
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Activation, Dense,LSTM
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split

In [None]:
# Folder holding the competition files, and the three CSVs read from it
# (training set, test set, sample submission).
DIR = '../input/instadeep-enzyme-classification-challenge/'
train, test, sub = (
    pd.read_csv(DIR + csv_name)
    for csv_name in ('Train (4).csv', 'Test (3).csv', 'SampleSubmission (3).csv')
)

# EDA

In [None]:
# Report the number of rows in each split.
print('Train size: ', len(train))
print('Test size: ', len(test))

In [None]:
# Preview the first three training rows.
train.head(3)

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# Check whether there are null values: count of nulls per column.
train.isnull().sum()

In [None]:
# Remove redundant samples: rows that share both SEQUENCE and LABEL are
# collapsed to their first occurrence. The index is not reset afterwards.
train=train.drop_duplicates(subset=['SEQUENCE', 'LABEL'], keep='first')

In [None]:
# Example of a protein sequence: the training row whose index label is 0.
train['SEQUENCE'][0]

In [None]:
# Summary statistics of the training sequence lengths.
train['SEQUENCE'].map(len).describe()

In [None]:
# Store each sequence's length in a `seq_count` column, for both train and test.
for frame in (train, test):
    frame['seq_count'] = frame['SEQUENCE'].map(len)

In [None]:
def plot_seq_count(df, data_name):
  """Draw the distribution of sequence lengths for one dataset.

  Args:
    df: frame providing a ``seq_count`` column.
    data_name: dataset label inserted into the plot title.
  """
  lengths = df['seq_count'].values
  sns.distplot(lengths)
  plt.title(f'Sequence char count: {data_name}')
  plt.grid(True)

In [None]:
# Sequence-length distributions for train and test, side by side.
for panel, (frame, split_name) in enumerate([(train, 'Train'), (test, 'Test')], start=1):
    plt.subplot(1, 2, panel)
    plot_seq_count(frame, split_name)

# Widen the subplot area so the two panels are readable.
plt.subplots_adjust(right=3.0)
plt.show()

In [None]:
def get_code_freq(df, data_name):
  """Count how often each amino-acid code occurs across all sequences.

  Also prints the dataset name and the number of distinct codes found.

  Args:
    df: iterable of sequence strings (e.g. the ``SEQUENCE`` column).
    data_name: dataset label used in the printed summary.

  Returns:
    DataFrame with columns ``Code`` and ``Freq``, sorted by ``Freq`` in
    descending order and re-indexed from 0.
  """
  # Count characters directly; no need to join with spaces and strip them again.
  codes_dict = Counter()
  for sequence in df:
    codes_dict.update(sequence)

  # Whitespace is not a code. Drop it when present, but do not fail when it is
  # absent (the unconditional pop raised KeyError for single-character input).
  codes_dict.pop(' ', None)

  print(f'Codes: {data_name}')
  print(f'Total unique codes: {len(codes_dict)}')
  freq = pd.DataFrame({'Code': list(codes_dict.keys()), 'Freq': list(codes_dict.values())})
  return freq.sort_values('Freq', ascending=False).reset_index()[['Code', 'Freq']]

def plot_code_freq(df, data_name):
  """Bar chart of code frequencies, titled with the dataset name.

  Args:
    df: frame with ``Code`` and ``Freq`` columns.
    data_name: dataset label inserted into the plot title.
  """
  chart_title = f'Code frequency: {data_name}'
  plt.title(chart_title)
  sns.barplot(data=df, x='Code', y='Freq')

In [None]:
# Per-code frequency table for the training sequences, shown as the cell output.
train_code_freq = get_code_freq(train['SEQUENCE'], 'Train')
train_code_freq

In [None]:
# Per-code frequency table for the test sequences, shown as the cell output.
test_code_freq = get_code_freq(test['SEQUENCE'], 'Test')
test_code_freq

In [None]:
# Code-frequency bar charts for train and test, side by side.
for panel, (freq_table, split_name) in enumerate(
        [(train_code_freq, 'Train'), (test_code_freq, 'Test')], start=1):
    plt.subplot(1, 2, panel)
    plot_code_freq(freq_table, split_name)

# Widen the subplot area so the two panels are readable.
plt.subplots_adjust(right=3.0)
plt.show()

* Amino acids (X, U, B, Z) are present in very small quantities.
* Amino acids (B, Z) are present only in the training set.

In [None]:
# Bar chart of the number of training sequences per label.
fig = plt.figure(figsize=(8,6))
per_label_counts = train.groupby('LABEL').SEQUENCE.count()
per_label_counts.plot.bar(ylim=0)
plt.show()

* The classes are imbalanced.

In [None]:
# One-letter codes of the 20 amino acids used for encoding.
# The codes B, X, U and Z are deliberately left out.
codes = list('ACDEFGHIKLMNPQRSTVWY')


def create_dict(codes):
  """Map each code to a 1-based integer id, following the order of ``codes``."""
  return {code: position for position, code in enumerate(codes, start=1)}


char_dict = create_dict(codes)

print(char_dict)
print("Dict Length:", len(char_dict))

* We will not consider amino acids that are present in very small quantities.

In [None]:
def integer_encoding(data):
  """
  Convert every sequence in ``data`` to an array of integer ids.

  - Codes present in ``char_dict`` are replaced by their id.
  - Any other character is encoded as 0.

  Returns a list with one numpy array per sequence, in the order of
  ``data.values``.
  """
  lookup = char_dict.get
  return [np.array([lookup(code, 0) for code in row]) for row in data.values]

In [None]:
# Encode the class labels as integers. `le` is kept so predictions can be
# mapped back to the original label names later.
le = LabelEncoder()
train['LABEL'] = le.fit_transform(train['LABEL'])

In [None]:
# Hold out 20% of the training data for validation, stratified by label,
# with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train['SEQUENCE'],
    train['LABEL'],
    test_size=0.2,
    random_state=42,
    stratify=train['LABEL'],
)

In [None]:
# Integer-encode the sequences of all three splits.
X_train, X_val, test_data = (
    integer_encoding(sequences)
    for sequences in (X_train, X_val, test['SEQUENCE'])
)

In [None]:
# Keep only the first 150 amino acids of each sequence; shorter sequences are
# padded at the end so every row has the same length.
max_length = 150
pad_options = dict(maxlen=max_length, padding='post', truncating='post')
train_pad = pad_sequences(X_train, **pad_options)
val_pad = pad_sequences(X_val, **pad_options)
test_pad = pad_sequences(test_data, **pad_options)

train_pad.shape, val_pad.shape, test_pad.shape

In [None]:
# One-hot encode the labels. The width is pinned to the number of classes the
# label encoder was fitted on, so the train and validation matrices always have
# the same number of columns even if one split lacks the highest class id.
n_classes = len(le.classes_)
y_train = to_categorical(y_train, num_classes=n_classes)
y_val = to_categorical(y_val, num_classes=n_classes)
y_train.shape, y_val.shape

In [None]:
# Detect hardware, return appropriate distribution strategy
# Prints the TensorFlow version, limits TF logging to ERROR, then tries to set up
# a TPU and falls back to MirroredStrategy when the resolver raises ValueError.
print(tf.version.VERSION)
tf.get_logger().setLevel(logging.ERROR)
try: # detect TPU
    # Pre-set so `tpu` is still defined (as None) if the resolver below raises;
    # the batch-size cell branches on it.
    tpu = None
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError: # detect GPU(s) and enable mixed precision
    strategy = tf.distribute.MirroredStrategy() # works on GPU and multi-GPU
    # NOTE(review): the `mixed_precision.experimental` API is version-dependent;
    # confirm it exists in the installed TensorFlow.
    policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16')
    tf.config.optimizer.set_jit(True) # XLA compilation
    tf.keras.mixed_precision.experimental.set_policy(policy)
    print('Mixed precision enabled')
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Batch size scaled by replica count: 128 per replica when a TPU was detected,
# 64 per replica otherwise.
BATCH_SIZE = (128 if tpu else 64) * strategy.num_replicas_in_sync
BATCH_SIZE

In [None]:
# Run a garbage-collection pass before building the model.
gc.collect()

In [None]:
# Build and compile the classifier inside the distribution strategy's scope:
# embedding -> two stacked bidirectional LSTMs -> softmax over the classes.
with strategy.scope():
    model = tf.keras.Sequential([
        # Vocabulary size is the number of known codes plus one, because id 0
        # is used for characters that are not in `char_dict`.
        tf.keras.layers.Embedding(len(char_dict) + 1, 10, input_length=max_length),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, dropout=0.1, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, dropout=0.1)),
        # One output unit per class known to the label encoder. The softmax is
        # forced to float32 so it stays numerically stable when the hardware
        # setup has enabled the mixed_float16 policy.
        tf.keras.layers.Dense(len(le.classes_), activation='softmax', dtype='float32')])

    model.compile(
        loss=tf.keras.losses.CategoricalCrossentropy(),
        # `learning_rate` replaces the deprecated `lr` alias.
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        metrics=['accuracy'])

model.summary()

In [None]:
# Stop when validation accuracy has not improved for 3 epochs. When stopping
# triggers, roll the model back to its best epoch so the evaluation below does
# not run on the weights from the later, worse epochs.
es = EarlyStopping(monitor='val_accuracy', mode='max', patience=3, verbose=1,
                   restore_best_weights=True)
# NOTE(review): batch_size is fixed at 1024 here and the BATCH_SIZE computed in
# an earlier cell is not used. Confirm which value is intended.
history = model.fit(train_pad, y_train, epochs=20, batch_size=1024,
                    validation_data=(val_pad, y_val), callbacks=[es])

In [None]:
def plot_hist(hist):
    """Plot training and validation accuracy per epoch.

    The figure is written to 'loss.png' in the working directory and then shown.

    Args:
        hist: object whose ``history`` mapping holds the "accuracy" and
            "val_accuracy" series (as returned by ``model.fit``).
    """
    for metric_name in ("accuracy", "val_accuracy"):
        plt.plot(hist.history[metric_name])
    plt.title("model accuracy")
    plt.xlabel("epoch")
    plt.ylabel("accuracy")
    plt.legend(["train", "validation"], loc="upper left")
    plt.savefig('loss.png')
    plt.show()

In [None]:
# Plot the accuracy curves recorded during training.
plot_hist(history)

# Evaluating Model on Validation Set

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Number of validation samples. `y_val` is used because `valid_labels` is only
# created in the following cell, so referencing it here raises NameError when
# the notebook is run top to bottom.
len(y_val)

In [None]:
# Predict on the validation split, then map both the predicted and the true
# class ids back to the original label names for the report.
pred_valid_y = model.predict(val_pad, verbose=True)
# No hard-coded length slice: predictions and labels both come from the same
# validation split, so they already have the same length.
pred_valid_y_labels = le.inverse_transform(np.argmax(pred_valid_y, axis=-1))
valid_labels = le.inverse_transform(np.argmax(y_val, axis=-1))
print(classification_report(valid_labels, pred_valid_y_labels))

In [None]:
# Confusion matrix of true vs. predicted validation labels
# (both variables are defined in the previous cell).
print(confusion_matrix(valid_labels, pred_valid_y_labels ))

<p style='font-size:25px;font-weight:bold'>If you find this kernel helpful, please upvote it to help others see it 😊</p>