In [1]:
import pandas as pd
import numpy as np

In [26]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics, svm, preprocessing

In [3]:
ls1 = []
ls2 = []
# Create consistent Field labels
colnames = ['Index', 'Fold', 'Label', 'Protein', 'PreVector']

# Create datasets on the basis of standardised input format and column names
ds1 = pd.read_csv('n-data.csv', names=colnames, header=None)
ds2 = pd.read_csv('g_data.csv', names=colnames, header=None)

for i in ds1['Protein']:
    ls1.append(" ".join(i))
    
for j in ds2['Protein']:
    ls2.append(" ".join(j))

ds1['PreVector'] = ls1
ds2['PreVector'] = ls2

In [4]:
# Input dataset classification
train, test = ds1, ds2
print(train.head())

   Index   Fold    Label                                            Protein  \
0      1  Fold1  >P76264  MNITATVLLAFGMSMDAFAASIGKGATLHKPKFSEALRTGLIFGAV...   
1      1  Fold1  >P17201  MGRLNRFRLGKDGRREQASLSRRGFLVTSLGAGVMFGFARPSSANQ...   
2      1  Fold1  >P0ABB4  MATGKIVQVIGAVVDVEFPQDAVPRVYDALEVQNGNERLVLEVQQQ...   
3      1  Fold1  >P76169  MIKTTLLFFATALCEIIGCFLPWLWLKRNASIWLLLPAGISLALFV...   
4      1  Fold1  >P08550  MVWIDYAIIAVIAFSSLVSLIRGFVREALSLVTWGCAFFVASHYYT...   

                                           PreVector  
0  M N I T A T V L L A F G M S M D A F A A S I G ...  
1  M G R L N R F R L G K D G R R E Q A S L S R R ...  
2  M A T G K I V Q V I G A V V D V E F P Q D A V ...  
3  M I K T T L L F F A T A L C E I I G C F L P W ...  
4  M V W I D Y A I I A V I A F S S L V S L I R G ...  


In [5]:
# N-gram length limits - cutoff
ngramin, ngramax = 3, 4
holderTest = []
# Maintain dataframe for feature extraction
masterTest = pd.DataFrame()
postr = []

In [6]:
# Loop for each protein in the file
for str in test.Protein: 
    # Resizable empty dictionary for the substring
    multigram = dict()
    for i in range(ngramin, ngramax):
        # Set recursive bounds between k-i and k for permuted moving window
        for k in range(i, len(str)):
            word = str[k-i:k]
            # Check for duplicates before appending
            if(word not in postr):
                postr.append(word)
            # Generate pseudo-frozen sets for the dictionary
            if(word in multigram):
                multigram[word] += 1
            else:
                multigram[word] = 1
    holderTest.append(multigram)

In [7]:
# Reset limits for cutoff
ngramin, ngramax = 3, 4
# Reset substrings
holderTrain = []
# Secondary dataframe for training data
masterTrain = pd.DataFrame()

In [8]:
# Repeat process for training data
for str in train.Protein:
    multigram = dict()
    for i in range(ngramin, ngramax):
        for k in range(i, len(str)):
            word = str[k-i:k]
            if(word not in postr):
                postr.append(word)
            if(word in multigram):
                multigram[word] += 1
            else:
                multigram[word] = 1
    holderTrain.append(multigram)

In [9]:
# Create full length dataframe with all aggregated features as columns
masterTest = pd.DataFrame(columns = postr)
# Vectorised string input read from library of test data
for vector in holderTest:
    # Start inserting data into dataframe as transposed eigenvalues
    masterTest = masterTest.append(pd.DataFrame.from_dict(vector, orient='index').T)
masterTest.head()

Unnamed: 0,MSG,SGE,GEV,EVL,VLS,LSQ,SQN,QNE,NEI,EID,...,YHC,MMC,CCP,CVC,WCW,RVU,VUH,UHG,MCW,CMN
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
0,,,,,1.0,,,,,,...,,,,,,,,,,
0,,,,,,,,,1.0,,...,,,,,,,,,,
0,,,,,,,,,,,...,,,,,,,,,,
0,,,,,1.0,,,,,,...,,,,,,,,,,


In [10]:
# Repeat process for training data
masterTrain = pd.DataFrame(columns = postr)
for vector in holderTrain:
    masterTrain = masterTrain.append(pd.DataFrame.from_dict(vector, orient='index').T)
masterTrain.head()

Unnamed: 0,MSG,SGE,GEV,EVL,VLS,LSQ,SQN,QNE,NEI,EID,...,YHC,MMC,CCP,CVC,WCW,RVU,VUH,UHG,MCW,CMN
0,,,,,,,,,,,...,,,,,,,,,,
0,,,,,,,,,,,...,,,,,,,,,,
0,,,,,1.0,1.0,,,,,...,,,,,,,,,,
0,,,,,,,,,,,...,,,,,,,,,,
0,,,,,,,,,,,...,,,,,,,,,,


In [65]:
# Create working datasets for preventing overwrites
workTrain = masterTrain.copy(deep=True)
workTest = masterTest.copy(deep=True)

In [66]:
# Housekeeping (data preprocessing) and label join
workTrain = workTrain.reset_index().drop('index', axis=1)
workTrain.insert(loc=0, column = 'Label', value=train.Fold)
workTrain = workTrain.reset_index().drop('index', axis=1).set_index('Label')

In [67]:
# More housekeeping
workTest = workTest.reset_index().drop('index', axis=1)
workTest.insert(loc=0, column = 'Label', value=test.Fold)
workTest = workTest.reset_index().drop('index', axis=1).set_index('Label')

In [68]:
# Null treatment
workTest = workTest.fillna(0)
workTest.reset_index().Label.nunique()
workTrain = workTrain.fillna(0)
workTrain.reset_index().Label.nunique()

8

In [69]:
# Test Labels
workTest.reset_index().Label.nunique()

4

### SVM

In [70]:
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import precision_recall_fscore_support

In [71]:
# Use real data this time instead of train-test split
X_train, X_test = workTrain.reset_index().drop('Label', axis=1), workTest.reset_index().drop('Label', axis=1)
y_train, y_test = workTrain.reset_index().Label, workTest.reset_index().Label

# Create an SVM Classifier instance
clf = svm.SVC(kernel='linear') # Linear Kernel

# Train the model using the instantiated datasets
clf.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.627151051625239


In [75]:
res = []
for l in [0,1,2,3]:
     prec, recall, _, _ = precision_recall_fscore_support(y_pred, y_test, pos_label=True, average=None, zero_division=0)
     res.append([l, recall[0], recall[1]])

pd.DataFrame(res,columns = ['class','sensitivity','specificity'])

Unnamed: 0,class,sensitivity,specificity
0,0,0.819549,0.205882
1,1,0.819549,0.205882
2,2,0.819549,0.205882
3,3,0.819549,0.205882


In [76]:
matthews_corrcoef(y_pred, y_test)

0.47989508271555675

In [77]:
X_train, X_test = workTrain.reset_index().drop('Label', axis=1), workTest.reset_index().drop('Label', axis=1)
y_train, y_test = workTrain.reset_index().Label, workTest.reset_index().Label

# Radial Basis function Kernel
clf = svm.SVC(kernel='rbf')

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5869980879541109


In [78]:
res = []
for l in [0,1,2,3]:
     prec, recall, _, _ = precision_recall_fscore_support(y_pred, y_test, pos_label=True, average=None, zero_division=0)
     res.append([l, recall[0], recall[1]])

pd.DataFrame(res,columns = ['class','sensitivity','specificity'])

Unnamed: 0,class,sensitivity,specificity
0,0,0.923077,0.0
1,1,0.923077,0.0
2,2,0.923077,0.0
3,3,0.923077,0.0


In [79]:
matthews_corrcoef(y_pred, y_test)

0.4234807060008344

In [80]:
X_train, X_test = workTrain.reset_index().drop('Label', axis=1), workTest.reset_index().drop('Label', axis=1)
y_train, y_test = workTrain.reset_index().Label, workTest.reset_index().Label

# Sigmoid Kernel
clf = svm.SVC(kernel='sigmoid')

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.621414913957935


In [81]:
res = []
for l in [0,1,2,3]:
     prec, recall, _, _ = precision_recall_fscore_support(y_pred, y_test, pos_label=True, average=None, zero_division=0)
     res.append([l, recall[0], recall[1]])

pd.DataFrame(res,columns = ['class','sensitivity','specificity'])

Unnamed: 0,class,sensitivity,specificity
0,0,0.912281,0.2
1,1,0.912281,0.2
2,2,0.912281,0.2
3,3,0.912281,0.2


In [82]:
matthews_corrcoef(y_pred, y_test)

0.46501909255047874

In [83]:
X_train, X_test = workTrain.reset_index().drop('Label', axis=1), workTest.reset_index().drop('Label', axis=1)
y_train, y_test = workTrain.reset_index().Label, workTest.reset_index().Label

# Polynomial Kernel
clf = svm.SVC(kernel='poly')

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.34608030592734224


In [84]:
res = []
for l in [0,1,2,3]:
     prec, recall, _, _ = precision_recall_fscore_support(y_pred, y_test, pos_label=True, average=None, zero_division=0)
     res.append([l, recall[0], recall[1]])

pd.DataFrame(res,columns = ['class','sensitivity','specificity'])

Unnamed: 0,class,sensitivity,specificity
0,0,0.341365,0.0
1,1,0.341365,0.0
2,2,0.341365,0.0
3,3,0.341365,0.0


In [86]:
matthews_corrcoef(y_pred, y_test)

0.060170738855724416

In [None]:
# Dump buffers for future reference
workTrain.to_csv('FinalisedTrain.csv')
workTest.to_csv('FinalisedTest.csv')