In [43]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import  precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve


In [44]:
df=pd.read_csv('./db.csv')
df=df.iloc[:111]

In [45]:
df=df.drop(['Entry','Reviewed', 'Entry Name', 'Gene Names', 'Gene Ontology (biological process)', 'Gene Ontology (molecular function)'], axis=1)

In [46]:
df

Unnamed: 0,Protein names,Organism,Length,Sequence
0,Truncated endolysin,Staphylococcus phage 812,284,MAKTQAEINKRLDAYAKGTVDSPYRVKKATSYDPSFGVMEAGAIDA...
1,Putative holin,Staphylococcus phage S24-1,140,MNEVKLRFTDTEAFHMFIYAGDLKLLYFLFVLMIVDIVTGFAKAIK...
2,Putative holin,Staphylococcus phage S13',140,MNEVKLRFTDTEAFHMFIYAGDLKLLYFLFVLMIVDIVTGFAKAIK...
3,Putative holin,Staphylococcus phage S25-3,167,MANETKQPKVVGGINLSTRTKSKTFWVAIISAVALFANQITGAFGL...
4,Putative holin,Staphylococcus phage S25-4,167,MANETKQPKVVGGINLSTRTKSKTFWVAIISAVALFANQITGAFGL...
...,...,...,...,...
106,Bifunctional endolysin,Staphylococcus phage SpT152,629,MALPKTGKPTAKQVADWAISLIGSGVDVDGYYGRQCWDLSNYIFNR...
107,Bifunctional endolysin,Staphylococcus phage vB_SpsM_WIS42,629,MALPKTGKPTAKQVVDWAINLIGSGVDVDGYYGRQCWDLPNYIFNR...
108,Bi-functional endolysin,Staphylococcus phage Sebago,624,MGLPNPKTRKPTASEVAEWAKSNIGKRINIDNYRGSQCWDTPNYIF...
109,Bacteriophage phi 11 holin homologue (ORF3),Staphylococcus phage phi11 (Bacteriophage phi-11),57,MEGNFKNVKKFIYEGEEYTKVYAGNIQVWKKPSSFVIKPLPKNKYP...


In [47]:
for name in df['Protein names']:
    df['Protein names'] = np.where(df['Protein names'].str.contains('endolysin'), 'endolysin', df['Protein names'])
    df['Protein names'] = np.where(df['Protein names'].str.contains('Endolysin'), 'endolysin', df['Protein names'])
    df['Protein names'] = np.where(df['Protein names'].str.contains('Holin'), 'holin', df['Protein names'])
    df['Protein names'] = np.where(df['Protein names'].str.contains('holin'), 'holin', df['Protein names'])


        

In [48]:
proteins=df

In [49]:
df

Unnamed: 0,Protein names,Organism,Length,Sequence
0,endolysin,Staphylococcus phage 812,284,MAKTQAEINKRLDAYAKGTVDSPYRVKKATSYDPSFGVMEAGAIDA...
1,holin,Staphylococcus phage S24-1,140,MNEVKLRFTDTEAFHMFIYAGDLKLLYFLFVLMIVDIVTGFAKAIK...
2,holin,Staphylococcus phage S13',140,MNEVKLRFTDTEAFHMFIYAGDLKLLYFLFVLMIVDIVTGFAKAIK...
3,holin,Staphylococcus phage S25-3,167,MANETKQPKVVGGINLSTRTKSKTFWVAIISAVALFANQITGAFGL...
4,holin,Staphylococcus phage S25-4,167,MANETKQPKVVGGINLSTRTKSKTFWVAIISAVALFANQITGAFGL...
...,...,...,...,...
106,endolysin,Staphylococcus phage SpT152,629,MALPKTGKPTAKQVADWAISLIGSGVDVDGYYGRQCWDLSNYIFNR...
107,endolysin,Staphylococcus phage vB_SpsM_WIS42,629,MALPKTGKPTAKQVVDWAINLIGSGVDVDGYYGRQCWDLPNYIFNR...
108,endolysin,Staphylococcus phage Sebago,624,MGLPNPKTRKPTASEVAEWAKSNIGKRINIDNYRGSQCWDTPNYIF...
109,holin,Staphylococcus phage phi11 (Bacteriophage phi-11),57,MEGNFKNVKKFIYEGEEYTKVYAGNIQVWKKPSSFVIKPLPKNKYP...


In [50]:
df['Protein names'].unique()

array(['endolysin', 'holin'], dtype=object)

In [51]:
import itertools
sequences=df['Sequence']
def calculate_ngram_frequencies(sequence, n):
    ngram_freq = {}
    for i in range(len(sequence) - n + 1):
        ngram = sequence[i:i+n]
        ngram_freq[ngram] = ngram_freq.get(ngram, 0) + 1
    return ngram_freq

# Defining the function to create feature vectors
def create_feature_vector(row, n, feature_vector):
    sequence = row['Sequence']
    ngram_freq = calculate_ngram_frequencies(sequence, n)
    # Initializing the feature vector with zeros
    vector = feature_vector.copy()
    # Updating the feature vector with the frequency of each n-gram
    for ngram, freq in ngram_freq.items():
        vector[ngram] = freq
    return pd.Series(vector)

data = {'Sequence': df['Sequence'].tolist()}  # Extracting the 'Sequence' column as a list

# Converting the list of sequences to a dictionary with key 'Sequence'
data_dict = {'Sequence': data['Sequence']}

datF = pd.DataFrame(data_dict)
# Defining the value of n for n-grams
n = 3

# Generating the feature vector template
amino_acids = "ACDEFGHIKLMNPQRSTVWY"  # A string representing the 20 standard amino acids
feature_vector = {"".join(aa): 0 for aa in itertools.product(amino_acids, repeat=n)}

# Applying the function to create feature vectors for each row
feature_vectors = datF.apply(lambda row: create_feature_vector(row, n, feature_vector), axis=1)

# Combining sequence and feature vectors into a single DataFrame
result_df = pd.concat([datF['Sequence'], feature_vectors], axis=1)
df = pd.concat([df, result_df.drop(columns=['Sequence'])], axis=1, join='inner')



In [52]:
df

Unnamed: 0,Protein names,Organism,Length,Sequence,AAA,AAC,AAD,AAE,AAF,AAG,...,YYM,YYN,YYP,YYQ,YYR,YYS,YYT,YYV,YYW,YYY
0,endolysin,Staphylococcus phage 812,284,MAKTQAEINKRLDAYAKGTVDSPYRVKKATSYDPSFGVMEAGAIDA...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,holin,Staphylococcus phage S24-1,140,MNEVKLRFTDTEAFHMFIYAGDLKLLYFLFVLMIVDIVTGFAKAIK...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,holin,Staphylococcus phage S13',140,MNEVKLRFTDTEAFHMFIYAGDLKLLYFLFVLMIVDIVTGFAKAIK...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,holin,Staphylococcus phage S25-3,167,MANETKQPKVVGGINLSTRTKSKTFWVAIISAVALFANQITGAFGL...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,holin,Staphylococcus phage S25-4,167,MANETKQPKVVGGINLSTRTKSKTFWVAIISAVALFANQITGAFGL...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,endolysin,Staphylococcus phage SpT152,629,MALPKTGKPTAKQVADWAISLIGSGVDVDGYYGRQCWDLSNYIFNR...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
107,endolysin,Staphylococcus phage vB_SpsM_WIS42,629,MALPKTGKPTAKQVVDWAINLIGSGVDVDGYYGRQCWDLPNYIFNR...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
108,endolysin,Staphylococcus phage Sebago,624,MGLPNPKTRKPTASEVAEWAKSNIGKRINIDNYRGSQCWDTPNYIF...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
109,holin,Staphylococcus phage phi11 (Bacteriophage phi-11),57,MEGNFKNVKKFIYEGEEYTKVYAGNIQVWKKPSSFVIKPLPKNKYP...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
df=df.drop(['Sequence', 'Organism'], axis=1)

In [54]:
df

Unnamed: 0,Protein names,Length,AAA,AAC,AAD,AAE,AAF,AAG,AAH,AAI,...,YYM,YYN,YYP,YYQ,YYR,YYS,YYT,YYV,YYW,YYY
0,endolysin,284,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,holin,140,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,holin,140,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,holin,167,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,holin,167,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,endolysin,629,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
107,endolysin,629,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
108,endolysin,624,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
109,holin,57,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
enc=OneHotEncoder(sparse=False, handle_unknown='ignore')
df['Protein names']=enc.fit_transform(np.array(df['Protein names']).reshape(-1,1))



In [56]:
df['Protein names'].unique()

array([1., 0.])

In [57]:
df

Unnamed: 0,Protein names,Length,AAA,AAC,AAD,AAE,AAF,AAG,AAH,AAI,...,YYM,YYN,YYP,YYQ,YYR,YYS,YYT,YYV,YYW,YYY
0,1.0,284,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,140,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,140,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,167,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,167,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,1.0,629,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
107,1.0,629,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
108,1.0,624,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
109,0.0,57,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
x=df.drop('Protein names', axis=1)
y=df['Protein names']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=42)


In [59]:
x.shape

(111, 8001)

In [60]:
svc=SVC()
svc.fit(X_train, y_train)
y_pred=svc.predict(X_test)

In [61]:
print(accuracy_score(y_test,y_pred))

0.9333333333333333


In [62]:
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_pred)
pr_auc = auc(recall_vals, precision_vals)

# Print other metrics
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC AUC: {roc_auc:.2f}")
print(f"Precision-Recall AUC: {pr_auc:.2f}")


Precision: 0.93
Recall: 1.00
F1 Score: 0.96
ROC AUC: 0.62
Precision-Recall AUC: 0.97


In [63]:
prediction_sequence="MKTKKQALKWILNTIGQGIDWDKMYGFQCMDLVVAYLYYVTDGKIAMWGNAIDAPKNNFKGTAKVIKNYPAFRPEEGDIVVWSYGNFSTYGHIAVVIDGDPYGDLQYITVAEQNWNGLGLYKQEVTTKRIHNYDGVSHFIRPKFKKTVKKEDNTLTKEKNNKKTKGKKLKVSTQRINYTMDKRGYKPKFVVIHNDAGSSSAQQYEQGLKNAGYSRYAQGVAHAYASDGYVWEAISEDRIAWHTGDGTNPGTGNFEGYGIEVCQSLGDRNTFLKNEQTVFQFIAEKLQKWNLPANRNTIRLHNEFIQTECPHASAYYHAGMNTKVDAYTKERQLKIKDYFIKQIRAYMKGSTPKSTVVKSSKSSGSLPKKKGKKQTSKSNIGKTFDFNGLSTNVWGTKWYYENNTFTCNARQGIITRVGSPFTTAPQAGVLFYGQTVTYNQVAVNPKEPFVWISWITNNGTEVWMPIEVLDSNNKIIEQWGTFGW"
seq_length=len(prediction_sequence)
def prediction_feature_vector(sequence, n):
    ngram_freq = calculate_ngram_frequencies(sequence, n)
    vector = feature_vector.copy()
    # Updating the feature vector with the frequency of each n-gram
    for ngram, freq in ngram_freq.items():
        vector[ngram] = freq
    return pd.Series(vector)
pred_vec=(prediction_feature_vector(prediction_sequence, 3))


In [64]:
pred_vec=pd.DataFrame(np.array(pred_vec).reshape(1,-1))
length=pd.DataFrame(np.array([seq_length]))
pred_vec=pd.concat([pred_vec, length], axis="columns")

In [65]:
pred_vec.shape

(1, 8001)

In [66]:
result=svc.predict(pred_vec)
print(result)

[1.]




In [67]:
if result[0]==1:
    protein="endolysin"
else:
    protein="holin"
print(protein)

endolysin


In [68]:
a=proteins.loc[proteins['Protein names']==protein]

In [69]:
sequences_list=a['Sequence']

In [70]:
sequences_list=sequences_list.to_numpy()

In [71]:
sequences_list

array(['MAKTQAEINKRLDAYAKGTVDSPYRVKKATSYDPSFGVMEAGAIDADGYYHAQCQDLITDYVLWLTDNKVRTWGNAKDQIKQSYGTGFKIHENKPSTVPKKGWIAVFTSGSYEQWGHIGIVYDGGNTSTFTILEQNWNGYANKKPTKRVDNYYGLTHFIEIPVKAGTTVKKETAKKSASTPATRPVTGSWKKNQYGTWYKPENATFVNGNQPIVTRIGSPFLNAPVGGNLPAGATIVYDEVCIQAGHIWIGYNAYNGNRVYCPVRTCQGVPPNQIPGVAWGVFK',
       'MSASDAQFLKNEQAVFQFTAEKFKEWGLTPNRKTVRLHMEFVPTACPHRSMVLHTGFNPVTQGRPSQAIMNKLKDYFIKQIKNYMDKGTSSSTVVKDGKTSSASTPATRPVTGSWKKNQYGTWYKPENATFVNGNQPIVTRIGSPFLNAPVGGNLPAGATIVYDEVCIQAGHIWIGYNAYNGNRVYCPVRTCQGVPPNQIPGVAWGVFK',
       'MAKTQAEINKRLDAYAKGTVDSPYRVKKATSYDPSFGVMEAGAIDADGYYHAQCQDLITDYVLWLTDNKVRTWGNAKDQIKQSYGTGFKIHENKPSTVPKKGWIAVFTSGSYEQWGHIGIVYDGGNTSTFTILEQNWNGYANKKPTKRVDNYYGLTHFIEIPVKAGTTVKKETAKKSASKTPAPKKKATLKVSKNHINYTMDKRGKKPEGMVIHNDAGRSSGQQYENSLANAGYARYANGIAHYYGSEGYVWEAIDAKNQIAWHTGK',
       'MSASDAQFLKNEQAVFQFTAEKFKEWGLTPNRKTVRLHMEFVPTACPHRSMVLHTGFNPVTQGRPSQAIMNKLKDYFIKQIKNYMDKGTSSSTVVKDGKTSSASTPATRPVTGSWKKNQYGTWYKPENATFVNGNQPIVTRIGSPFLNAPVGGNLPAGATIVYDEVCIQAGHIWIGYNAYNGNRVYCPVRTCQGVPPNQ

In [72]:
def find_longest_common_substring(string1, string2):
    # Create a 2D array to store the length of the longest common suffixes
    len1 = len(string1)
    len2 = len(string2)
    max_length = 0
    end_index = len1
    lcs_suffix = [[0 for _ in range(len2 + 1)] for _ in range(len1 + 1)]
    
    # Build the LCSuffix table
    for i in range(1, len1 + 1):
        for j in range(1, len2 + 1):
            if string1[i - 1] == string2[j - 1]:
                lcs_suffix[i][j] = lcs_suffix[i - 1][j - 1] + 1
                if lcs_suffix[i][j] > max_length:
                    max_length = lcs_suffix[i][j]
                    end_index = i
            else:
                lcs_suffix[i][j] = 0
    
    # The longest common substring
    longest_common_substr = string1[end_index - max_length: end_index]
    
    return longest_common_substr
string1=sequences_list[0]
string2=sequences_list[1]
find_longest_common_substring(string1, string2)


'SASTPATRPVTGSWKKNQYGTWYKPENATFVNGNQPIVTRIGSPFLNAPVGGNLPAGATIVYDEVCIQAGHIWIGYNAYNGNRVYCPVRTCQGVPPNQIPGVAWGVFK'