In [4]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
data=pd.read_csv('./data.csv')

In [5]:
data

Unnamed: 0,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Gene Ontology IDs,Sequence
0,A0A7G9VYQ1,unreviewed,A0A7G9VYQ1_BPACA,Putative holin,Aristophanes_00042,Acinetobacter phage Aristophanes,127,GO:0016020,MNFIDQVYLVLVYTWSELSKNAHVLMGAILAVTISYLKTHKEKREQ...
1,A0A2H5BHG1,unreviewed,A0A2H5BHG1_BPSHA,Holin,SHab15497_00003,Acinetobacter phage SH-Ab 15497,87,GO:0016020,MRQHTSNQPIKLSRAVLEAVASGFTGVIAMFICYAIHLPWYWAGAF...
2,O48430,unreviewed,O48430_BPH19,Putative holin protein,S,Enterobacteria phage H19B (Bacteriophage H19B),68,GO:0001907; GO:0016020; GO:0140911,MEKITTGVSYTTSAVGTGYWLLQLLDKVSPSQWVAIGVLGSLLFGL...
3,Q9AZ07,unreviewed,Q9AZ07_BPHK6,Holin,S,Enterobacteria phage HK620 (Bacteriophage HK620),107,GO:0016020,MKMPEKNDLLAAILAAKEQGIGAILAFAMAYLRGRYNGGAFTKTVI...
4,Q9MCN5,unreviewed,Q9MCN5_BPHK7,Holin,70,Enterobacteria phage HK97 (Bacteriophage HK97),106,GO:0016020,MKMPEKMTVAAILAAKEQGIGAILAFAMAYLRGRYNGGAFTKTVID...
...,...,...,...,...,...,...,...,...,...
178,Q08JW9,unreviewed,Q08JW9_BPMR1,Endolysin,65 orf65,Staphylococcus phage phiMR11,481,GO:0008745; GO:0009253; GO:0042742; GO:0044659,MQAKLTKKEFIEWLKTSEGKQFNVDLWYGFQCFDYANAGWKVLFGL...
179,G9M969,unreviewed,G9M969_BPPSP,Putative endolysin,,Staphylococcus phage S13',253,,MKSQQQAKEWIYKHEGVGVDFDLAFGYQCADLAVAYIYYITDGKVR...
180,G9M948,unreviewed,G9M948_BPPS4,Putative endolysin,,Staphylococcus phage S24-1,250,,MKSQQQAKEWLYNHEGVGIDFDLAFGYQCADLAVAYIYYITDGKVR...
181,V5XX66,unreviewed,V5XX66_BPS25,Hypothetical endolysin,,Staphylococcus phage S25-3,495,GO:0008745; GO:0009253; GO:0031640; GO:0042742,MAKTQAEINKRLDAYAKGTVDSPYRVKKATSYDPSFGVMEAGAIDA...


In [6]:
import itertools
sequences=data['Sequence']
def calculate_ngram_frequencies(sequence, n):
    ngram_freq = {}
    for i in range(len(sequence) - n + 1):
        ngram = sequence[i:i+n]
        ngram_freq[ngram] = ngram_freq.get(ngram, 0) + 1
    return ngram_freq

# Defining the function to create feature vectors
def create_feature_vector(row, n, feature_vector):
    sequence = row['Sequence']
    ngram_freq = calculate_ngram_frequencies(sequence, n)
    # Initializing the feature vector with zeros
    vector = feature_vector.copy()
    # Updating the feature vector with the frequency of each n-gram
    for ngram, freq in ngram_freq.items():
        vector[ngram] = freq
    return pd.Series(vector)

df = {'Sequence': data['Sequence'].tolist()}  # Extracting the 'Sequence' column as a list

# Converting the list of sequences to a dictionary with key 'Sequence'
data_dict = {'Sequence': df['Sequence']}

datF = pd.DataFrame(data_dict)
# Defining the value of n for n-grams
n = 3

# Generating the feature vector template
amino_acids = "ACDEFGHIKLMNPQRSTVWY"  # A string representing the 20 standard amino acids
feature_vector = {"".join(aa): 0 for aa in itertools.product(amino_acids, repeat=n)}

# Applying the function to create feature vectors for each row
feature_vectors = datF.apply(lambda row: create_feature_vector(row, n, feature_vector), axis=1)

# Combining sequence and feature vectors into a single DataFrame
result_df = pd.concat([datF['Sequence'], feature_vectors], axis=1)
data = pd.concat([data, result_df.drop(columns=['Sequence'])], axis=1, join='inner')


In [7]:
data

Unnamed: 0,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Gene Ontology IDs,Sequence,AAA,...,YYM,YYN,YYP,YYQ,YYR,YYS,YYT,YYV,YYW,YYY
0,A0A7G9VYQ1,unreviewed,A0A7G9VYQ1_BPACA,Putative holin,Aristophanes_00042,Acinetobacter phage Aristophanes,127,GO:0016020,MNFIDQVYLVLVYTWSELSKNAHVLMGAILAVTISYLKTHKEKREQ...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A0A2H5BHG1,unreviewed,A0A2H5BHG1_BPSHA,Holin,SHab15497_00003,Acinetobacter phage SH-Ab 15497,87,GO:0016020,MRQHTSNQPIKLSRAVLEAVASGFTGVIAMFICYAIHLPWYWAGAF...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,O48430,unreviewed,O48430_BPH19,Putative holin protein,S,Enterobacteria phage H19B (Bacteriophage H19B),68,GO:0001907; GO:0016020; GO:0140911,MEKITTGVSYTTSAVGTGYWLLQLLDKVSPSQWVAIGVLGSLLFGL...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Q9AZ07,unreviewed,Q9AZ07_BPHK6,Holin,S,Enterobacteria phage HK620 (Bacteriophage HK620),107,GO:0016020,MKMPEKNDLLAAILAAKEQGIGAILAFAMAYLRGRYNGGAFTKTVI...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Q9MCN5,unreviewed,Q9MCN5_BPHK7,Holin,70,Enterobacteria phage HK97 (Bacteriophage HK97),106,GO:0016020,MKMPEKMTVAAILAAKEQGIGAILAFAMAYLRGRYNGGAFTKTVID...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,Q08JW9,unreviewed,Q08JW9_BPMR1,Endolysin,65 orf65,Staphylococcus phage phiMR11,481,GO:0008745; GO:0009253; GO:0042742; GO:0044659,MQAKLTKKEFIEWLKTSEGKQFNVDLWYGFQCFDYANAGWKVLFGL...,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
179,G9M969,unreviewed,G9M969_BPPSP,Putative endolysin,,Staphylococcus phage S13',253,,MKSQQQAKEWIYKHEGVGVDFDLAFGYQCADLAVAYIYYITDGKVR...,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
180,G9M948,unreviewed,G9M948_BPPS4,Putative endolysin,,Staphylococcus phage S24-1,250,,MKSQQQAKEWLYNHEGVGIDFDLAFGYQCADLAVAYIYYITDGKVR...,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
181,V5XX66,unreviewed,V5XX66_BPS25,Hypothetical endolysin,,Staphylococcus phage S25-3,495,GO:0008745; GO:0009253; GO:0031640; GO:0042742,MAKTQAEINKRLDAYAKGTVDSPYRVKKATSYDPSFGVMEAGAIDA...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
#code for One-Hot encoding
encoder = OneHotEncoder(handle_unknown='ignore')  # Ignore unknown characters
encoded_sequences = encoder.fit_transform(sequences.values.reshape(-1, 1))
encoded_features = encoded_sequences.toarray()


<bound method DataFrame.info of           Entry    Reviewed        Entry Name           Protein names  \
0    A0A7G9VYQ1  unreviewed  A0A7G9VYQ1_BPACA          Putative holin   
1    A0A2H5BHG1  unreviewed  A0A2H5BHG1_BPSHA                   Holin   
2        O48430  unreviewed      O48430_BPH19  Putative holin protein   
3        Q9AZ07  unreviewed      Q9AZ07_BPHK6                   Holin   
4        Q9MCN5  unreviewed      Q9MCN5_BPHK7                   Holin   
..          ...         ...               ...                     ...   
178      Q08JW9  unreviewed      Q08JW9_BPMR1               Endolysin   
179      G9M969  unreviewed      G9M969_BPPSP      Putative endolysin   
180      G9M948  unreviewed      G9M948_BPPS4      Putative endolysin   
181      V5XX66  unreviewed      V5XX66_BPS25  Hypothetical endolysin   
182      V5XWH3  unreviewed      V5XWH3_BPS24      Putative endolysin   

             Gene Names                                          Organism  \
0    Aristopha

In [15]:
data

Unnamed: 0,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Gene Ontology IDs,Sequence,AAA,...,YYM,YYN,YYP,YYQ,YYR,YYS,YYT,YYV,YYW,YYY
0,A0A7G9VYQ1,unreviewed,A0A7G9VYQ1_BPACA,Putative holin,Aristophanes_00042,Acinetobacter phage Aristophanes,127,GO:0016020,MNFIDQVYLVLVYTWSELSKNAHVLMGAILAVTISYLKTHKEKREQ...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A0A2H5BHG1,unreviewed,A0A2H5BHG1_BPSHA,Holin,SHab15497_00003,Acinetobacter phage SH-Ab 15497,87,GO:0016020,MRQHTSNQPIKLSRAVLEAVASGFTGVIAMFICYAIHLPWYWAGAF...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,O48430,unreviewed,O48430_BPH19,Putative holin protein,S,Enterobacteria phage H19B (Bacteriophage H19B),68,GO:0001907; GO:0016020; GO:0140911,MEKITTGVSYTTSAVGTGYWLLQLLDKVSPSQWVAIGVLGSLLFGL...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Q9AZ07,unreviewed,Q9AZ07_BPHK6,Holin,S,Enterobacteria phage HK620 (Bacteriophage HK620),107,GO:0016020,MKMPEKNDLLAAILAAKEQGIGAILAFAMAYLRGRYNGGAFTKTVI...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Q9MCN5,unreviewed,Q9MCN5_BPHK7,Holin,70,Enterobacteria phage HK97 (Bacteriophage HK97),106,GO:0016020,MKMPEKMTVAAILAAKEQGIGAILAFAMAYLRGRYNGGAFTKTVID...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,Q08JW9,unreviewed,Q08JW9_BPMR1,Endolysin,65 orf65,Staphylococcus phage phiMR11,481,GO:0008745; GO:0009253; GO:0042742; GO:0044659,MQAKLTKKEFIEWLKTSEGKQFNVDLWYGFQCFDYANAGWKVLFGL...,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
179,G9M969,unreviewed,G9M969_BPPSP,Putative endolysin,,Staphylococcus phage S13',253,,MKSQQQAKEWIYKHEGVGVDFDLAFGYQCADLAVAYIYYITDGKVR...,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
180,G9M948,unreviewed,G9M948_BPPS4,Putative endolysin,,Staphylococcus phage S24-1,250,,MKSQQQAKEWLYNHEGVGIDFDLAFGYQCADLAVAYIYYITDGKVR...,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
181,V5XX66,unreviewed,V5XX66_BPS25,Hypothetical endolysin,,Staphylococcus phage S25-3,495,GO:0008745; GO:0009253; GO:0031640; GO:0042742,MAKTQAEINKRLDAYAKGTVDSPYRVKKATSYDPSFGVMEAGAIDA...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
