In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import re
import joblib
from preprocess import pfeature_process
from seq_cleanup import clean_seq
import csv
import torch
import yaml
import subprocess

In [2]:
# Load the generated sequences from CSV into a DataFrame and display it.
seqs = pd.read_csv('../data/processed/generated_seqs.csv')
seqs


Unnamed: 0,sequences
0,INNTLVLLCNLFNIRYLDFCGYEKHWPYIVDGDFGYGCPRKLIPTT...
1,DENLGSEMERDCLPTDSKFNYECGSKEKQLIKMIKNCVKMNLSGYV...
2,DGAEFCRQAGGEMWSRPAVQSFPGLTTYPRNPTQFAARRNVAYLCR...
3,NIYNSDYRXPWYMEIYCRRGHRLGPRAQTRPAPCCGSTTAGPRVFP...
4,KSTEDLMDEVYKQIVPRECNNKEHXRTRGTPSQSTRVLQLDTQVQD...
...,...
2995,ASNTTVETMYEDDDHHWDREGPQPITRDYILSPPGEETTMSDVLAR...
2996,RHWMIDSWKMGVFIFPAYELNTSSNWFYPLRKQVYYKVRFWEAPNG...
2997,TGPVTQWRFTIVERHDSRKYLTYREDTTKAGTISIPQNWLYSQKQM...
2998,DEINEIEETVRQQEAQTMIVDMDSNEQSNVACGRDDENNLCQILIS...


In [3]:
# Sanity check: show the container type and the column labels, one per line.
print(type(seqs), seqs.columns, sep="\n")


<class 'pandas.core.frame.DataFrame'>
Index(['sequences'], dtype='object')


In [None]:
# Clean up the sequences with the project helper `clean_seq`. Per the original
# author's note it uppercases them, removes invalid characters, strips X from the
# beginning and end of each sequence, and drops sequences that still contain an
# internal X. (Behaviour lives in seq_cleanup.py -- confirm there.)
filtered_df = clean_seq(seqs)
filtered_df

Unnamed: 0,sequences
0,INNTLVLLCNLFNIRYLDFCGYEKHWPYIVDGDFGYGCPRKLIPTT...
1,DENLGSEMERDCLPTDSKFNYECGSKEKQLIKMIKNCVKMNLSGYV...
2,DGAEFCRQAGGEMWSRPAVQSFPGLTTYPRNPTQFAARRNVAYLCR...
3,DMIYRTDTIKPEGWLKWIRFKKCWKNKNCDASWKGHKIKLIDLVLR...
4,IGRLLRRMMRPIMKTCLNVGYHIPNSMEQKHCWDGLSLGWRNDIRM...
...,...
2380,KEWYKKHYLTTGNLTHDERNLWMNMMWLTEIIEKNENRCDIWEIGDCMV
2381,LSSLPWIMRLNLWTRHDAVQTGEMMNDQKPHWFNIWRHTWLGHKGQCHS
2382,DRYMEYMYGCEMYIENGDIIEMDKEEKSGEKNNFNCCVCYFDGMFWFFL
2383,TRYEIVHDYKNYPLFNDKGIIDCACPHPITSSKKHHHLSINHGQVASYY


In [None]:
# Serialise the cleaned DataFrame as FASTA: one record per row, with the row's
# zero-based position as the header and the first column's value as the sequence.
fasta_records = (
    f">{position}\n{sequence}\n"
    for position, sequence in enumerate(filtered_df.iloc[:, 0])
)
with open('../data/interim/filtered_seqs.fasta', 'w', encoding='UTF8') as f:
    f.writelines(fasta_records)

In [18]:
# Path given to CD-HIT as its `-o` output and later passed to pfeature_process.
cd_hit_path = "../data/interim/filtered_seqs_cd_hit.txt"

In [19]:
# Process sequences with CD-HIT using WSL on a Windows environment.
# Ensure that WSL is installed and CD-HIT is available in the WSL environment.
# Flags: -i input FASTA, -o output file, -c sequence identity threshold for clustering.
command = ["wsl", "cd-hit", "-i", "../data/interim/filtered_seqs.fasta", "-o", cd_hit_path, "-c", "0.99"]

# Run the command, capturing both streams as text so they can be shown below
result = subprocess.run(command, capture_output=True, text=True)

# Print the output and errors
# NOTE(review): messages emitted by wsl.exe itself can show up on STDERR with a
# space between every character -- this looks like a text-encoding mismatch; confirm.
print("STDOUT:", result.stdout)
print("STDERR:", result.stderr)

# Fail loudly (subprocess.CalledProcessError) if wsl / cd-hit exited with a
# non-zero status, so later cells never read a missing or stale output file.
# This runs after the prints so the diagnostics above are still visible.
result.check_returncode()

Program: CD-HIT, V4.8.1 (+OpenMP), Aug 20 2021, 08:39:56
Command: cd-hit -i ../data/interim/filtered_seqs.fasta -o
         ../data/interim/filtered_seqs_cd_hit.txt -c 0.99

Started: Thu Jun 26 17:05:13 2025
                            Output                              
----------------------------------------------------------------
total seq: 2384
longest and shortest : 50 and 13
Total letters: 119033
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 0M
Buffer          : 1 X 10M = 10M
Table           : 1 X 65M = 65M
Miscellaneous   : 0M
Total           : 76M

Table limit with the given memory limit:
Max number of representatives: 4000000
Max number of word counting entries: 90456802


comparing sequences from          0  to       2384
..
     2384  finished       2384  clusters

Approximated maximum memory consumption: 77M
writing new database
writing clustering information
program completed !

Total CPU time 0.24

STDERR: w s l :   A   l o c a 

In [None]:
# Run the project helper `pfeature_process` on the CD-HIT output file.
# NOTE(review): the second argument is presumably the CSV path the helper writes
# its result to -- confirm against preprocess.py.
processed_seqs = pfeature_process(cd_hit_path,'../data/processed/pre_processed_seqs_pfeature.csv')
processed_seqs