In [1]:
# This file serves to help filter out data we don't want from the RefSeq Database
# and obtain a list of accessions that abide by some criteria. This criteria includes
# 1. organism is present in k2 database, so kraken2 is aware of the organism and able to classify it
# 2. accession is NOT present in k2 database, so kraken2 is not overfit for this particular sequence

In [2]:
# load in RefSeq as pandas dataframe
import pandas as pd
file = "/scratch/j/jparkin/wongkoji/assembly_summary.txt"
df = pd.read_csv(filepath_or_buffer=file,
            sep='\t',
            skiprows=1)

In [3]:
df

Unnamed: 0,#assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,isolate,...,replicon_count,scaffold_count,contig_count,annotation_provider,annotation_name,annotation_date,total_gene_count,protein_coding_gene_count,non_coding_gene_count,pubmed_id
0,GCF_036600855.1,PRJNA224116,SAMN38772065,JBAFXD000000000.1,na,7,7,Azorhizobium caulinodans,strain=CNM20190194,na,...,0,171,171,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,2024/02/14,5320,5174,58,na
1,GCF_036600915.1,PRJNA224116,SAMN38772067,JBAFXF000000000.1,na,7,7,Azorhizobium caulinodans,strain=CNM20220104,na,...,0,135,135,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,2024/02/14,5303,5169,58,na
2,GCF_036600875.1,PRJNA224116,SAMN38772066,JBAFXC000000000.1,na,7,7,Azorhizobium caulinodans,strain=CNM20190462,na,...,0,171,171,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,2024/02/15,5266,5119,58,na
3,GCF_036600895.1,PRJNA224116,SAMN38772064,JBAFXE000000000.1,na,7,7,Azorhizobium caulinodans,strain=CNM20190156,na,...,0,260,260,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,2024/02/14,5339,5187,58,na
4,GCF_900128725.1,PRJNA224116,SAMEA4556317,na,na,9,9,Buchnera aphidicola,strain=BCifornacula,2912,...,3,3,3,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,2023/04/26,416,377,37,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342084,GCF_037145975.1,PRJNA224116,SAMN37642828,JBBCWK000000000.1,na,3127886,3127886,Gordonia sp. MMO-8,strain=MMO-8,na,...,0,21,21,NCBI RefSeq,GCF_037145975.1-RS_2024_03_14,2024/03/14,4139,4053,65,na
342085,GCF_037137105.1,PRJNA224116,SAMN40276084,JBBBDV000000000.1,na,3128862,3128862,Serratia sp. TMDUHS_CL,strain=TMDUHS_CL,na,...,0,53,53,NCBI RefSeq,GCF_037137105.1-RS_2024_03_14,2024/03/14,5068,4926,98,na
342086,GCF_037137035.1,PRJNA224116,SAMN40288934,JBBBNW000000000.1,na,3128902,3128902,Nocardioides sp. CCNWLW239,strain=CCNWLW239,na,...,0,17,17,NCBI RefSeq,GCF_037137035.1-RS_2024_03_14,2024/03/14,5297,5226,54,na
342087,GCF_037136835.1,PRJNA224116,SAMN40269195,JBBBQR000000000.1,na,3128982,3128982,Pectobacterium sp. 1950-15,strain=1950-15,na,...,0,59,59,NCBI RefSeq,GCF_037136835.1-RS_2024_03_14,2024/03/14,4156,4028,82,na


In [4]:
# make a new genus column
df['genus'] = df['organism_name'].str.split().str[0]

In [5]:
genus_counts = df['genus'].value_counts()
genus_counts_between_10_and_20 = genus_counts[(genus_counts > 10) & (genus_counts < 100)]
genus_counts_between_10_and_20.head(10)

Pseudoxanthomonas       99
Actinomadura            96
Devosia                 96
Dubosiella              96
Chromobacterium         96
Gallibacterium          96
Dietzia                 95
Companilactobacillus    95
Flavonifractor          95
Delftia                 93
Name: genus, dtype: int64

In [6]:
# Want to check the dataframe for genus = Pseudomonas and assembly_level complete genome
pm_df = df[(df['genus'] == "Pseudomonas") & (df['assembly_level'] == 'Complete Genome')]
# df[(df['genus'] == "Pseudomonas") & (df['assembly_level'] == 'Complete Genome')]
# df[(df['genus'] == "Gallibacterium")]
pm_df

Unnamed: 0,#assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,isolate,...,scaffold_count,contig_count,annotation_provider,annotation_name,annotation_date,total_gene_count,protein_coding_gene_count,non_coding_gene_count,pubmed_id,genus
8391,GCF_002812825.1,PRJNA224116,SAMN08101547,na,na,287,287,Pseudomonas aeruginosa,strain=PB369,na,...,1,1,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,2024/02/19,6199,6050,80,na,Pseudomonas
8397,GCF_002946935.1,PRJNA224116,SAMN02887043,na,na,287,287,Pseudomonas aeruginosa,strain=F5677,na,...,1,1,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,2023/03/13,6264,6080,84,28767643,Pseudomonas
8407,GCF_003028335.1,PRJNA224116,SAMN08776459,na,na,287,287,Pseudomonas aeruginosa,strain=MRSN12280,na,...,1,1,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,2023/12/01,6653,6508,83,na,Pseudomonas
8570,GCF_003073615.1,PRJNA224116,SAMN07291537,na,na,287,287,Pseudomonas aeruginosa,strain=AR444,na,...,1,1,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,2023/03/11,6370,6222,80,na,Pseudomonas
8648,GCF_002287725.2,PRJNA224116,SAMN06547042,na,na,287,287,Pseudomonas aeruginosa,strain=PPF-1,na,...,1,1,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,2023/12/19,6440,6281,78,28934400;29546998,Pseudomonas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341592,GCF_034555375.1,PRJNA224116,SAMN38457071,na,na,3104265,3104265,Pseudomonas sp. B33.4,strain=B33.4,na,...,1,1,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,2023/12/25,5873,5724,97,na,Pseudomonas
341684,GCF_035581345.1,PRJNA224116,SAMN39061562,na,na,3110111,3110111,Pseudomonas sp. JQ170C,strain=JQ170C,na,...,1,1,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,2024/02/25,5443,5298,102,na,Pseudomonas
341745,GCF_035621475.1,PRJNA224116,SAMN39158162,na,na,3110772,3110772,Pseudomonas sp. IT1137,strain=IT1137,na,...,2,2,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,2024/01/15,4937,4760,84,na,Pseudomonas
341823,GCF_036549575.1,PRJNA224116,SAMN39456508,na,na,3114884,3114884,Pseudomonas sp. LH21,strain=LH21,na,...,1,1,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,2024/02/12,5431,5259,104,na,Pseudomonas


In [7]:
# Now we want to take the seq-taxid map kraken2 was tested on and verify that the accessions we want to use are not present.
kraken2_map_file = "/project/j/jparkin/Lab_Databases/kraken2/seqid2taxid.map"
column_names = ['accession', 'taxid']

# Read the file into a DataFrame
k2_df = pd.read_csv(kraken2_map_file, sep='\t', names=column_names)

# Extract accession number from accession column
k2_df['accession'] = k2_df['accession'].str.split('\t').str[-1]
# k2_df['taxid'] = str(df['taxid']).strip()

In [8]:
# Boolean we want to assert that the taxid is present in k2 database and that k2 was not tested on this accession number
294 in k2_df['taxid'].unique() and 'GCF_900215245.1' not in k2_df['accession'].unique()

True

In [9]:
# create a dataframe that only contains Pseudomonas genus organisms where the taxid is present in k2 map but not the accessions
filtered_df = pm_df[pm_df['taxid'].isin(k2_df['taxid']) & ~pm_df['#assembly_accession'].isin(k2_df['accession'])]

In [10]:
# we have 108 accessions (rows) that satisfy our constraints
filtered_df[['#assembly_accession', 'taxid', 'refseq_category', 'organism_name', 'genus', 'assembly_level']]

Unnamed: 0,#assembly_accession,taxid,refseq_category,organism_name,genus,assembly_level
8391,GCF_002812825.1,287,na,Pseudomonas aeruginosa,Pseudomonas,Complete Genome
8397,GCF_002946935.1,287,na,Pseudomonas aeruginosa,Pseudomonas,Complete Genome
8407,GCF_003028335.1,287,na,Pseudomonas aeruginosa,Pseudomonas,Complete Genome
8570,GCF_003073615.1,287,na,Pseudomonas aeruginosa,Pseudomonas,Complete Genome
8648,GCF_002287725.2,287,na,Pseudomonas aeruginosa,Pseudomonas,Complete Genome
...,...,...,...,...,...,...
337823,GCF_028198105.1,3019098,na,Pseudomonas sp. TUM22785,Pseudomonas,Complete Genome
337850,GCF_947090715.1,3019968,na,Pseudomonas sp. MM227,Pseudomonas,Complete Genome
337869,GCF_028010285.1,3020843,na,Pseudomonas sp. Q1-7,Pseudomonas,Complete Genome
337886,GCF_028198185.1,3020907,na,Pseudomonas sp. JBR1,Pseudomonas,Complete Genome


In [11]:
import random
random.seed(10)
lst = list(filtered_df['#assembly_accession'])
random.shuffle(lst)

# take 20 random accessions from our list and add them to a file
for a in lst[:20]:
    print(a)

GCF_003850365.1
GCF_002997005.1
GCF_001721825.1
GCF_013343515.1
GCF_001746815.1
GCF_035621255.1
GCF_000007565.2
GCF_002068135.1
GCF_036232625.1
GCF_016925575.1
GCF_035205985.1
GCF_028747945.1
GCF_023101285.1
GCF_027571285.1
GCF_000931465.1
GCF_019434095.1
GCF_022570415.1
GCF_003991465.1
GCF_008693965.1
GCF_021390155.1


In [12]:
a = pm_df['organism_name'].value_counts()
a = a[a > 5]
a

Pseudomonas aeruginosa                          770
Pseudomonas putida                               57
Pseudomonas chlororaphis                         40
Pseudomonas protegens                            21
Pseudomonas fluorescens                          18
Pseudomonas chlororaphis subsp. aurantiaca       15
Pseudomonas syringae                             13
Pseudomonas syringae pv. actinidiae              13
Pseudomonas chlororaphis subsp. piscium          12
Pseudomonas monteilii                            12
Pseudomonas chlororaphis subsp. aureofaciens     10
Pseudomonas asiatica                              9
Pseudomonas simiae                                9
Pseudomonas brassicacearum                        9
Pseudomonas synxantha                             9
Pseudomonas mosselii                              8
Pseudomonas rhodesiae                             7
Pseudomonas citronellolis                         7
Pseudomonas siliginis                             7
Pseudomonas 

In [13]:
accessions = []
for species in a.keys():
    org_df = pm_df[pm_df['organism_name'] == species]
    lst = list(org_df['#assembly_accession'])
    random.shuffle(lst)
    accessions.extend(lst[:5])

In [14]:
len(accessions)

105

In [15]:
pm_accessions = 'pseudomonas_accessions.txt'
with open(pm_accessions, 'w') as file:
    for a in accessions:
        file.write(a + '\n')