# This notebook uses the list of non_redundant sequences indentified by protein seq to generate a filtered set of data based on mergedTables.csv

### Setting up notebook

In [1]:
import csv
import pandas as pd

In [2]:
set_path = 'seq_list_90.txt'

### Generating updated mergedTables by filtering

In [None]:
# Retrieving set of filtered protein ids

with open(set_path, 'r') as file:
    seq_list = [line.strip() for line in file]

# Turn into a set for faster lookup
seq_set = set(seq_list)

# Iterate so RAM does not run out
iterator = pd.read_csv('unfilteredData/mergedTables.csv', chunksize=5000)


is_first_chunk = True

# Iterate by chunks
for chunk in iterator:

    rows = []
    
    # Iterate through each row in chunk
    for index, row in chunk.iterrows():

        # If protein_id is a representative sequence add to rows
        if row['protein_seq'] in seq_set:
            rows.append(row)
            seq_set.remove(row['protein_seq'])

    # convert list to pandas dataframe
    df = pd.DataFrame(rows, columns=['genome_id', 'protein_id', 'protein_seq', 'taxonomic_label','host_id','embedding'])

    
    if is_first_chunk:
            # For the first chunk, write in 'w' mode (overwrite) and include the header.
            df.to_csv("mergedTables90.csv", mode='w', header=True, index=False)
            is_first_chunk = False
    else:
            # For subsequent chunks, append in 'a' mode and omit the header.
            df.to_csv("mergedTables90.csv", mode='a', header=False, index=False)
    

    print(f"Processed and wrote a chunk of {len(chunk)} rows.")

print("COMPLETED!")



Processed and wrote a chunk of 5000 rows.
Processed and wrote a chunk of 5000 rows.
Processed and wrote a chunk of 5000 rows.
Processed and wrote a chunk of 5000 rows.
Processed and wrote a chunk of 5000 rows.
Processed and wrote a chunk of 5000 rows.
Processed and wrote a chunk of 5000 rows.
Processed and wrote a chunk of 5000 rows.
Processed and wrote a chunk of 5000 rows.
Processed and wrote a chunk of 5000 rows.
Processed and wrote a chunk of 5000 rows.
Processed and wrote a chunk of 5000 rows.
Processed and wrote a chunk of 5000 rows.
Processed and wrote a chunk of 5000 rows.
Processed and wrote a chunk of 5000 rows.
Processed and wrote a chunk of 5000 rows.


In [5]:
pd.read_csv('mergedTables80.csv', nrows=1000)


Unnamed: 0,genome_id,protein_id,protein_seq,taxonomic_label,host_id,embedding
0,GCF_009811375.1,WP_159277441.1,MQFEEGIAWGVQDGIGRIVLKRPERANSVSLASSHALARAIDEVLE...,Variovorax boronicumulans,enoyl-CoA hydratase/isomerase family protein,"[-0.0007136122, -0.15305841, 0.029148007, -0.0..."
1,GCF_900112575.1,WP_091493282.1,MGIRLKTIRIKGFRGFKNIEVDFQNTTVLVGTNNAGKTTLLKALQV...,Flavobacterium phragmitis,ATP-dependent endonuclease,"[0.059893496, 0.029295022, 0.04714787, 0.07690..."
2,GCF_022701015.1,WP_173598177.1,MKVAIMLFDGITALDAIGPYDVFAATLQWEVKFVAKQKGLVKLDSN...,Brevibacillus,MULTISPECIES: DJ-1/PfpI family protein,"[0.07244332, -0.079110704, 0.038540285, -0.005..."
3,GCF_029223485.1,WP_275781621.1,MVLTLSAVRWAGEVPKFYRDDCPVPIFAPNSMPTQEDNKVATALGF...,Streptomyces coacervatus,hypothetical protein,"[-0.05294113, 0.07054596, 0.046061877, -0.0093..."
4,GCF_039532985.1,WP_344363376.1,MGTGTEGFAEALRGLKERSGLSYGALAKRAHMSTSTLHRYCNGDAV...,Streptomyces gobitricini,helix-turn-helix domain-containing protein,"[0.0003043313, -0.09910503, -0.034521643, -0.0..."
...,...,...,...,...,...,...
995,GCF_039530525.1,WP_346162198.1,MKGLTARQQQVLLLAADGNTNVQIAARLQISSHTVAEVLTAAYRTL...,Streptomyces bangladeshensis,helix-turn-helix transcriptional regulator,"[0.022992581, -0.031461056, 0.03443404, -0.087..."
996,GCF_012225885.1,WP_167871710.1,MKKLSKIFYSHAALLLLALLLAGCTGSKDEESKQPSKHDAAHSDKN...,Bacillus tequilensis,hypothetical protein,"[-0.0100655835, 0.101813376, -0.07615985, 0.09..."
997,GCF_004135675.1,WP_165973918.1,MNKEKYQLHTKEISINIVQTEINSIRRKDILKTGIRIYKDGKIGVA...,Marinitoga lauensis,metallopeptidase TldD-related protein,"[0.07139584, 0.01657422, -0.02588144, 0.088972..."
998,GCF_001507595.1,WP_068343317.1,MCHVAVIRALSPLNRDANAAILLIDADPPAIPDAGAMGQDIYAGGF...,Ruegeria marisrubri,hypothetical protein,"[-0.023200467, 0.035209462, 0.038499985, -0.02..."
