In [17]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import random

In [None]:
# Seed every RNG this notebook draws from. The sample generation in this cell
# uses the stdlib `random` module (random.choice / random.randint), so seeding
# NumPy alone leaves the generated dataset different on every run.
# NumPy's global state is seeded as well because the later `df.sample(frac=1)`
# shuffle is not given its own random_state.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Organism names assigned to microbial samples (Label 1).
organisms = [
    "Sulfolobus acidocaldarius",
    "Oenococcus oeni",
    "Moraxella catarrhalis",
    "Merdibacter massiliensis",
    "Escherichia coli",
    "Bacillus subtilis",
    "Lactobacillus acidophilus",
    "Staphylococcus aureus",
    "Pseudomonas aeruginosa",
    "Clostridium difficile",
]

# Candidate `Location` values for microbial samples.
microbial_locations = [
    "Soil", "Hot spring", "Ocean", "Lake", "River",
    "Human gut", "Human skin", "Human oral cavity", "Human respiratory tract",
    "Plant root", "Plant leaf", "Fermented food", "Dairy product",
]

# Candidate `Location` values for human samples (Label 0).
human_locations = [
    "Blood", "Saliva", "Skin tissue", "Lung tissue", "Brain tissue",
    "Liver tissue", "Kidney tissue", "Heart tissue", "Muscle tissue",
    "Bone marrow", "Cerebrospinal fluid",
]

def generate_dna_seq(length=100):
    """Return a random DNA string of `length` characters over A, T, G, C.

    One `random.choice` call is made per base, so the result is determined
    by the state of the stdlib `random` module at call time.
    """
    alphabet = ['A', 'T', 'G', 'C']
    bases = []
    for _ in range(length):
        bases.append(random.choice(alphabet))
    return ''.join(bases)

# Build a synthetic labelled dataset: microbial rows (Label 1) and human rows
# (Label 0), shuffle them, preview, save to Parquet and print class counts.
num_samples = 1000
MICROBIAL_PERCENT = 70  # share of rows labelled microbial; the rest are human

# Derive both class sizes from num_samples so changing it rescales the dataset
# (previously the loops hard-coded 700 and 300 and ignored num_samples).
num_microbial = num_samples * MICROBIAL_PERCENT // 100
num_human = num_samples - num_microbial

data = []

for _ in range(num_microbial):  # microbial samples
    organism = random.choice(organisms)
    location = random.choice(microbial_locations)
    seq = generate_dna_seq(random.randint(50, 150))  # sequence length in [50, 150]
    data.append({
        'Seq': seq,
        'Organism': organism,
        'Location': location,
        'Label': 1  # microbial
    })

for _ in range(num_human):  # human samples
    organism = "Homo sapiens"
    location = random.choice(human_locations)
    seq = generate_dna_seq(random.randint(50, 150))  # sequence length in [50, 150]
    data.append({
        'Seq': seq,
        'Organism': organism,
        'Location': location,
        'Label': 0  # human
    })

df = pd.DataFrame(data)

# Shuffle so the two classes are interleaved. An explicit random_state keeps the
# row order reproducible even if NumPy's global RNG was used (or never seeded)
# before this cell ran.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(df.head())

df.to_parquet('organism_classification_data.parquet')

# Label the two summaries so the printed output is self-describing.
print(f"Generated {len(df)} samples. Distribution:")
print(df['Label'].value_counts())
print("\nOrganisms distribution:")
print(df['Organism'].value_counts())

                                                 Seq  \
0  ATAGTACAGCCTGGAAAGCTTTTCGAACTGTCTGCTCTGGCCCATG...   
1  AAGGATACCAAAACTCCTCACGGACAGCCCAGGAGGCCTCAAGCTT...   
2  GGGGATTCCTAATGATCGGGATTTTGTGAAGTCAGCCCAGGATTGA...   
3  TTCTGCCGCTAAGTGACGCGACTGCCAGCTAGAGAAGGTGCTCCAA...   
4  GGTGGCAATCTGCCCGGTGCTAGCCTATTTACGCTCGTACAAGCCG...   

                    Organism       Location  Label  
0  Sulfolobus acidocaldarius          Ocean      1  
1               Homo sapiens  Muscle tissue      0  
2               Homo sapiens   Heart tissue      0  
3            Oenococcus oeni           Soil      1  
4      Staphylococcus aureus     Hot spring      1  
Generated 1000 samples. Distribution:
Label
1    700
0    300
Name: count, dtype: int64

Organisms distribution:
Organism
Homo sapiens                 300
Escherichia coli              81
Sulfolobus acidocaldarius     79
Clostridium difficile         79
Oenococcus oeni               75
Moraxella catarrhalis         72
Staphylococcus aureus    

In [None]:
# Load the tab-separated genome metadata table and preview its first rows.
import pandas as pd

data = pd.read_csv('metadata.tsv', delimiter='\t')

print(data.head(n=5))

      #genome  asm_name assembly_accession   bioproject     biosample  \
0  G000005825  ASM582v2    GCF_000005825.2  PRJNA224116  SAMN02603086   
1  G000006175  ASM617v2    GCF_000006175.1  PRJNA224116  SAMN00000040   
2  G000006605  ASM660v1    GCF_000006605.1  PRJNA224116  SAMEA3283089   
3  G000006725  ASM672v1    GCF_000006725.1  PRJNA224116  SAMN02603773   
4  G000006745  ASM674v1    GCF_000006745.1   PRJNA57623  SAMN02603969   

  wgs_master seq_rel_date                                          submitter  \
0        NaN   2010/12/15  Center for Genomic Sciences, Allegheny-Singer ...   
1        NaN   2010/06/03            US DOE Joint Genome Institute (JGI-PGF)   
2        NaN   2005/06/27                                     Bielefeld Univ   
3        NaN   2004/06/04                Sao Paulo state (Brazil) Consortium   
4        NaN   2001/01/09                                               TIGR   

                                            ftp_path     img_id  ...  \
0  ftp:/

In [6]:
# Show the first five rows of the metadata table.
data.head(n=5)

Unnamed: 0,#genome,asm_name,assembly_accession,bioproject,biosample,wgs_master,seq_rel_date,submitter,ftp_path,img_id,...,coding_density,completeness,contamination,strain_heterogeneity,markers,5s_rrna,16s_rrna,23s_rrna,trnas,draft_quality
0,G000005825,ASM582v2,GCF_000005825.2,PRJNA224116,SAMN02603086,,2010/12/15,"Center for Genomic Sciences, Allegheny-Singer ...",ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,646311908,...,85.144124,98.68,1.32,0.0,377,yes,yes,yes,20,high
1,G000006175,ASM617v2,GCF_000006175.1,PRJNA224116,SAMN00000040,,2010/06/03,US DOE Joint Genome Institute (JGI-PGF),ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,646564549,...,80.167033,99.05,0.0,0.0,165,no,yes,yes,19,medium
2,G000006605,ASM660v1,GCF_000006605.1,PRJNA224116,SAMEA3283089,,2005/06/27,Bielefeld Univ,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,637000085,...,89.378688,100.0,0.68,0.0,319,yes,yes,yes,20,high
3,G000006725,ASM672v1,GCF_000006725.1,PRJNA224116,SAMN02603773,,2004/06/04,Sao Paulo state (Brazil) Consortium,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,637000348,...,82.59299,99.59,0.18,0.0,325,yes,yes,yes,20,high
4,G000006745,ASM674v1,GCF_000006745.1,PRJNA57623,SAMN02603969,,2001/01/09,TIGR,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,637000333,...,86.533164,99.86,0.03,0.0,360,yes,yes,yes,20,high


In [7]:
# Column names of the metadata table as a plain Python list.
list(data.columns)

['#genome',
 'asm_name',
 'assembly_accession',
 'bioproject',
 'biosample',
 'wgs_master',
 'seq_rel_date',
 'submitter',
 'ftp_path',
 'img_id',
 'gtdb_id',
 'scope',
 'assembly_level',
 'genome_rep',
 'refseq_category',
 'release_type',
 'taxid',
 'species_taxid',
 'organism_name',
 'infraspecific_name',
 'isolate',
 'superkingdom',
 'phylum',
 'class',
 'order',
 'family',
 'genus',
 'species',
 'classified',
 'unique_name',
 'lv1_group',
 'lv2_group',
 'score_faa',
 'score_fna',
 'score_rrna',
 'score_trna',
 'total_length',
 'contigs',
 'gc',
 'n50',
 'l50',
 'proteins',
 'protein_length',
 'coding_density',
 'completeness',
 'contamination',
 'strain_heterogeneity',
 'markers',
 '5s_rrna',
 '16s_rrna',
 '23s_rrna',
 'trnas',
 'draft_quality']

In [9]:
# Print each metadata column name on its own line.
for column_name in data.columns:
    print(column_name)

#genome
asm_name
assembly_accession
bioproject
biosample
wgs_master
seq_rel_date
submitter
ftp_path
img_id
gtdb_id
scope
assembly_level
genome_rep
refseq_category
release_type
taxid
species_taxid
organism_name
infraspecific_name
isolate
superkingdom
phylum
class
order
family
genus
species
classified
unique_name
lv1_group
lv2_group
score_faa
score_fna
score_rrna
score_trna
total_length
contigs
gc
n50
l50
proteins
protein_length
coding_density
completeness
contamination
strain_heterogeneity
markers
5s_rrna
16s_rrna
23s_rrna
trnas
draft_quality


In [10]:
# First five values of the organism_name column.
data.loc[:, 'organism_name'].head(n=5)

0                       Bacillus pseudofirmus OF4
1                         Methanococcus voltae A3
2                   Corynebacterium jeikeium K411
3                         Xylella fastidiosa 9a5c
4    Vibrio cholerae O1 biovar El Tor str. N16961
Name: organism_name, dtype: object

In [11]:
# Number of rows (genomes) in the metadata table.
len(data.index)

10575

In [12]:
# Print only a bounded preview of organism names. Looping over the whole column
# writes one line per row of the table into the cell output, which floods the
# notebook and bloats the saved file. Raise PREVIEW_ROWS to see more.
PREVIEW_ROWS = 30
for organism_name in data['organism_name'].head(PREVIEW_ROWS):
    print(organism_name)

Bacillus pseudofirmus OF4
Methanococcus voltae A3
Corynebacterium jeikeium K411
Xylella fastidiosa 9a5c
Vibrio cholerae O1 biovar El Tor str. N16961
Streptococcus pyogenes M1 GAS
Neisseria gonorrhoeae FA 1090
Lactococcus lactis subsp. lactis Il1403
Shigella flexneri 2a str. 301
Sinorhizobium meliloti 1021
Chlorobium tepidum TLS
Sulfolobus solfataricus P2
Rickettsia conorii str. Malish 7
Caldanaerobacter subterraneus subsp. tengcongensis MB4
Zymomonas mobilis subsp. mobilis ZM4 = ATCC 31821
Xanthomonas campestris pv. campestris str. ATCC 33913
Methanopyrus kandleri AV19
Chlamydophila pneumoniae TW-183
Pyrobaculum aerophilum str. IM2
Streptococcus agalactiae 2603V/R
Pyrococcus furiosus DSM 3638
Fusobacterium nucleatum subsp. nucleatum ATCC 25586
Methanosarcina acetivorans C2A
Buchnera aphidicola str. Sg (Schizaphis graminum)
Xanthomonas oryzae pv. oryzae KACC 10331
Streptococcus mutans UA159
Tropheryma whipplei str. Twist
Bifidobacterium longum NCC2705
Pseudomonas putida KT2440
Chlamydop

In [15]:
# Number of genomes per organism name, most frequent first.
data.organism_name.value_counts()


organism_name
Sulfolobus acidocaldarius             46
Oenococcus oeni                       45
Legionella pneumophila                42
Moraxella catarrhalis                 33
Corynebacterium pseudotuberculosis    33
                                      ..
Pseudoxanthobacter soli DSM 19599      1
Merdibacter massiliensis               1
Negativicoccus massiliensis            1
Mobilibacterium timonense              1
Mucilaginibacter sp. OK098             1
Name: count, Length: 9887, dtype: int64