In [4]:
from Bio import SeqIO
import gzip

# Reading the compressed FASTA file
with gzip.open('GCF_000007065.1_ASM706v1_lorfs.faa.gz', 'rt') as f:
    fasta_sequences = list(SeqIO.parse(f, 'fasta'))

In [5]:
fasta_sequences[0]

SeqRecord(seq=Seq('MKQFWVVKRDNRGLTFRILQVDFVLRNNYSGGFCIENNTSGGFCIEK'), id='NC_003901.1;1000016;1000159;+', name='NC_003901.1;1000016;1000159;+', description='NC_003901.1;1000016;1000159;+', dbxrefs=[])

In [6]:
import pandas as pd

# Creating a dataframe from  the faa file
df_lorf = pd.DataFrame(columns=['id', 'sequence'])
for fasta in fasta_sequences:
    name, sequence = fasta.id, str(fasta.seq)
    # add this row to the dataframe withouit appending
    df_lorf.loc[len(df_lorf)] = [name, sequence]

df_lorf.head()

Unnamed: 0,id,sequence
0,NC_003901.1;1000016;1000159;+,MKQFWVVKRDNRGLTFRILQVDFVLRNNYSGGFCIENNTSGGFCIEK
1,NC_003901.1;1000107;1000205;+,MDFVLRIILQVDFVLRNNYSGLLLLKRKKVKK
2,NC_003901.1;99632;100033;+,MHHIKGLAEFRPVFVRHGIERFDFRGPVDNEVMGLTCGLVNVVSDG...
3,NC_003901.1;1000225;1000383;+,MHQKTPDTLHLPSFALLTAHSALEEQTGFQSPLQIFQTFLQFLLFL...
4,NC_003901.1;1000400;1000597;+,MQFCQAFSPLKPCFFPEDFFRTDSPGVTCLDLMAALNITVFAKGLS...


In [7]:
# open 'GCF_000007065.1_ASM706v1.txt.gz' and create a dataframe from csv (ignore # lines)
parsed_data = []

try:
    with gzip.open('GCF_000007065.1_ASM706v1.txt.gz', 'rt') as file:
        # Skipping the initial header lines (first three lines)
        for _ in range(3):
            next(file)

        # Reading and parsing each line
        for line in file:
            # Splitting the line by spaces and adding it to the list
            parsed_data.append(line.split())

    # Creating a DataFrame from the parsed data
    data_manual_parsing = pd.DataFrame(parsed_data)

    # Displaying the first few rows of the manually parsed DataFrame
    display_manual_parsing = data_manual_parsing.head()
except Exception as e:
    display_manual_parsing = f"Error parsing file: {e}"

display_manual_parsing

# select the first 4 columns
df_hmmer = data_manual_parsing.iloc[:, :4]
df_hmmer.columns = ["target name", "accession", "tlen", "query name"]



In [8]:
#"target name", "accession", "tlen", "query name","accession_q"   "qlen_q"   "E-value_q"  "score_q"  "bias_q"   "#_d"  "of_d"  "c-Evalue_d"  "i-Evalue_d"  "score_d"  "bias_d"  from    to  from    to  from    to  acc description of target
# rename the 4 columns
# remove "seqid=", "Start=", and "End=" from the query name using .loc
df_hmmer.iloc[:, 3] = df_hmmer.iloc[:, 3].str.replace("Seqid=", "")
df_hmmer.iloc[:, 3] = df_hmmer.iloc[:, 3].str.replace("Start=", "")
df_hmmer.iloc[:, 3] = df_hmmer.iloc[:, 3].str.replace("End=", "")
df_hmmer.iloc[:, 3] = df_hmmer.iloc[:, 3].str.replace("Strand=", "")

df_hmmer.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value


Unnamed: 0,target name,accession,tlen,query name
0,DUF4011,PF13195.8,162,NC_003901.1;3082413;3088109;+
1,DUF4011,PF13195.8,162,NC_003901.1;3082413;3088109;+
2,MTES_1575,PF18741.3,96,NC_003901.1;3082413;3088109;+
3,AAA_12,PF13087.8,199,NC_003901.1;3082413;3088109;+
4,AAA_11,PF13086.8,261,NC_003901.1;3082413;3088109;+


In [9]:
# create a combined dataframe that looks like the df_lorf dataframe, but with the common "hmmer" where the value is true if that id also exist in the df_hmmer dataframe
df_combined = df_lorf.copy()
df_combined["hmmer"] = df_combined["id"].isin(df_hmmer["query name"])
df_combined.head()

Unnamed: 0,id,sequence,hmmer
0,NC_003901.1;1000016;1000159;+,MKQFWVVKRDNRGLTFRILQVDFVLRNNYSGGFCIENNTSGGFCIEK,True
1,NC_003901.1;1000107;1000205;+,MDFVLRIILQVDFVLRNNYSGLLLLKRKKVKK,False
2,NC_003901.1;99632;100033;+,MHHIKGLAEFRPVFVRHGIERFDFRGPVDNEVMGLTCGLVNVVSDG...,False
3,NC_003901.1;1000225;1000383;+,MHQKTPDTLHLPSFALLTAHSALEEQTGFQSPLQIFQTFLQFLLFL...,False
4,NC_003901.1;1000400;1000597;+,MQFCQAFSPLKPCFFPEDFFRTDSPGVTCLDLMAALNITVFAKGLS...,False


In [10]:
# sort by the amount of letters in the sequence
df_combined["len"] = df_combined["sequence"].str.len()
df_combined = df_combined.sort_values(by="len", ascending=False)
df_combined.head()

Unnamed: 0,id,sequence,hmmer,len
9703,NC_003901.1;3082413;3088109;+,MVDVVKELETLRQNLLDLSLRNNLLNYRHSSRRTISITGRTPEEVY...,True,1898
29375,NC_003901.1;3677242;3682422;-,MSYLTISSMLVNRMEKMKKEQSNPYSTGSGGANFETHVQAAFAVLM...,True,1726
6905,NC_003901.1;243509;248644;+,MRGLAFAEIDRTEASILCFEKALELMPEYAAAWCAMGTVAGKAERY...,True,1711
26122,NC_003901.1;2944005;2949026;-,MKVTKCNQIFGKCNQIFGGKTFNKIVGITAFVFLMLIGTAGASTFA...,True,1673
6275,NC_003901.1;2358577;2363595;+,MKHEFDALVKTISIFFAAFVLFSSFTLPVCAENEMVPGTALKNNST...,True,1672


In [11]:
# Extracting unique amino acids from the sequences
unique_amino_acids = set(''.join(df_combined['sequence']))

# Creating a mapping for each amino acid to an index
amino_acid_to_index = {amino_acid: i for i, amino_acid in enumerate(sorted(unique_amino_acids))}

# Function to integer encode a single sequence
def integer_encode_sequence(seq):
    return [amino_acid_to_index[amino_acid] for amino_acid in seq]

# Integer encoding all sequences
integer_encoded_sequences = [integer_encode_sequence(seq) for seq in df_combined['sequence']]

In [13]:
# Assuming integer_encoded_sequences is your list of sequences
max_sequence_length = max(len(seq) for seq in integer_encoded_sequences)

# Pad each sequence with zeros
padded_sequences = []
for seq in integer_encoded_sequences:
    # Calculate the number of zeros to add
    num_zeros_to_add = max_sequence_length - len(seq)
    # Extend the sequence with zeros and append it to padded_sequences
    padded_sequences.append(seq + [0] * num_zeros_to_add)

# Creating a DataFrame from the padded sequences
import pandas as pd
df_encoded = pd.DataFrame(padded_sequences)

# Displaying the first few rows of df_combined
print(df_encoded.head())

   0     1     2     3     4     5     6     7     8     9     ...  1888  \
0    10    17     2    17    17     8     3     9     3    16  ...    16   
1    10    15    19     9    16     7    15    15    10     9  ...     0   
2    10    14     5     9     0     4     0     3     7     2  ...     0   
3    10     8    17    16     8     1    11    13     7     4  ...     0   
4    10     8     6     3     4     2     0     9    17     8  ...     0   

   1889  1890  1891  1892  1893  1894  1895  1896  1897  
0    12    11     3    10     7    19     4    17     3  
1     0     0     0     0     0     0     0     0     0  
2     0     0     0     0     0     0     0     0     0  
3     0     0     0     0     0     0     0     0     0  
4     0     0     0     0     0     0     0     0     0  

[5 rows x 1898 columns]


In [14]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout

# Parameters
input_length = df_encoded.shape[1]  # Length of input sequences
num_amino_acids = len(unique_amino_acids)  # Number of unique amino acids
embedding_dim = 50  # Size of the embedding vector

# Model architecture
model = Sequential()

# Embedding layer
model.add(Embedding(input_dim=num_amino_acids, output_dim=embedding_dim, input_length=input_length))

# LSTM layer
# You can adjust the number of units and add more LSTM layers if needed
model.add(LSTM(units=100, return_sequences=False))

# Dropout layer for regularization (optional)
model.add(Dropout(0.5))

# Dense layer for output
model.add(Dense(1, activation='sigmoid'))  # 'sigmoid' for binary classification

# Compiling the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()

# Preparing the data
X = df_encoded.values  # Features
y = df_combined['hmmer'].values  # Target

# Splitting the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the model
model.fit(X_train, y_train, epochs=2, batch_size=32, validation_data=(X_test, y_test))


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1898, 50)          1000      
                                                                 
 lstm (LSTM)                 (None, 100)               60400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 61,501
Trainable params: 61,501
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1b72e262548>

In [None]:
# show predictions
y_pred = model.predict(X_test)
