In [4]:
#IMPORTING THE REQUIRED LIBRARIES

import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import collections
import csv
from sklearn.svm import SVC

#ENCODING AMINO ACIDS
#Declaring custom encoder
label_encoder = LabelEncoder()
AA_sequence_loop = {char: i  for i, char in enumerate('ACDEFGHIKLMNPQRSTVWY')}

#Loading training data
train_data=pd.read_csv("train.csv")

#Applying Term Frequency-Inverse Document Frequency to dataset
tfidf_list=[]

#Looping training data sequence
for data in train_data["Sequence"]:
  AA_characters=len(data)
  AA_occurence = collections.Counter(data)
  #Creating TF-IDF like scores
  tf_idf_scores = {letter: count  for letter, count in AA_occurence.items()}
    #Appending score dictionary to list
  tfidf_list.append(tf_idf_scores)

#Finalising the encoding sequence based on TF_IDF like scores
final_list=[]
for idf in tfidf_list:
  #Creating a list with 21 zeroes
  temp_list=[0]*21
  for i in idf:
    #Adding count to index with corresponding labels
    temp_list[AA_sequence_loop[i]]=idf[i]
  final_list.append(temp_list)

#Converting sequences and labels as numpy arrays
Sequences=np.array(final_list)
Labels=np.array(train_data["Label"])

#Applying SMOTE to the given dataset
smote=SMOTE(random_state=42)
X_sample,y_resample=smote.fit_resample(Sequences,Labels)

#Splitting the data into training and testing samples
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_resample, test_size=0.3, random_state=42)

#Declaring the classifier, ie RF classifier with the various hyperparameters
rf_classifier = RandomForestClassifier(n_estimators=60, max_depth=None, min_samples_split=2)

#Fitting the training data with the testing data
rf_classifier.fit(X_train, y_train)

#Predicting the output values for the testing sequences
y_pred = rf_classifier.predict(X_test)

# Calculating the accuracy of out model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy of the model : {accuracy*100}")

#Reading the original test dataset
test_data=pd.read_csv("test.csv")

#Applying the TF_IDF on test data
test_tfidf_list=[]
for data in test_data["Sequence"]:
  AA_characters=len(data)
  AA_occurence = collections.Counter(data)
    #Creating TF_IDF Scores
  tf_idf_scores = {letter: count  for letter, count in AA_occurence.items()}
   #Appending score dictionary to list
  test_tfidf_list.append(tf_idf_scores)

#Converting TF-IDF-like scores into an actual test sequence
test_final_list=[]
for idf in test_tfidf_list:
  #Creating a list with 21 zeroes
  temp_list=[0]*21
  for i in idf:
 #using dictionary to find its corresponding index
    temp_list[AA_sequence_loop[i]]=idf[i]
  test_final_list.append(temp_list)

#Using the trained Random Forest classifier to predict labels for the test data
final_pred=rf_classifier.predict(test_final_list)

#Saving the results to csv file
with open('./output/final_result.csv', mode='w', newline='') as file:
  writer = csv.writer(file)
  writer.writerow(['ID', 'Label'])
  for i in range(0,len(test_data["ID"])):
    writer.writerow([test_data["ID"][i],final_pred[i]])

Test Accuracy of the model : 90.84194977843427
