In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import random

In [2]:
# Read the full organism-classification dataset from Parquet with PyArrow,
# then convert the Arrow table into a pandas DataFrame for the rest of the
# notebook.
table = pq.read_table(source='organism_classification_data.parquet')
df_data = table.to_pandas()

# Trailing bare expression so the notebook renders the frame.
df_data

Unnamed: 0,Seq,Organism,Location,Label
0,ATAGTACAGCCTGGAAAGCTTTTCGAACTGTCTGCTCTGGCCCATG...,Sulfolobus acidocaldarius,Ocean,1
1,AAGGATACCAAAACTCCTCACGGACAGCCCAGGAGGCCTCAAGCTT...,Homo sapiens,Muscle tissue,0
2,GGGGATTCCTAATGATCGGGATTTTGTGAAGTCAGCCCAGGATTGA...,Homo sapiens,Heart tissue,0
3,TTCTGCCGCTAAGTGACGCGACTGCCAGCTAGAGAAGGTGCTCCAA...,Oenococcus oeni,Soil,1
4,GGTGGCAATCTGCCCGGTGCTAGCCTATTTACGCTCGTACAAGCCG...,Staphylococcus aureus,Hot spring,1
...,...,...,...,...
995,AAGAGTCTGATATATAGACTCACTCTCCCGCGATACGACGCACCTG...,Sulfolobus acidocaldarius,Human oral cavity,1
996,GCAGAATACACCCGTCTAGTGATCCAGCTGAAGGTCTGGATTGCCC...,Escherichia coli,Soil,1
997,GGAAGGGCTATTTACTTTGTATGGGAGAGAGCTCCTGTACTGGATA...,Homo sapiens,Muscle tissue,0
998,CACGGCTCGACATAAAGGAACGCGCACCACGAGAGTCGAGAAAAGT...,Merdibacter massiliensis,River,1


In [3]:
# Class balance of the full dataset: number of rows per Label value.
df_data.Label.value_counts()


Label
1    700
0    300
Name: count, dtype: int64

In [4]:
# Target composition of the down-sampled dataset.
total_samples = 500
HUMAN_FRACTION = 0.1  # share of human rows; the remainder is microbial

# round() rather than int(): int() truncates, so a product that lands just
# below a whole number because of float error (e.g. 100 * 0.29) would silently
# lose one row.
human_samples_num = round(total_samples * HUMAN_FRACTION)  # 10% human
microbial_samples_num = total_samples - human_samples_num  # 90% microbial
print(human_samples_num)
print(microbial_samples_num)

50
450


In [5]:
# Split the dataset by class: Label 0 rows go to human_df, Label 1 rows to
# microbial_df. The original row index is kept in both subsets.
human_df = df_data.loc[df_data['Label'].eq(0)]
microbial_df = df_data.loc[df_data['Label'].eq(1)]

In [6]:
# Display the Label == 0 subset for a visual sanity check.
human_df

Unnamed: 0,Seq,Organism,Location,Label
1,AAGGATACCAAAACTCCTCACGGACAGCCCAGGAGGCCTCAAGCTT...,Homo sapiens,Muscle tissue,0
2,GGGGATTCCTAATGATCGGGATTTTGTGAAGTCAGCCCAGGATTGA...,Homo sapiens,Heart tissue,0
8,TTGGTTGTGTGCGATTCAACAAGTATTGGTCACTATCTGGCTTAAA...,Homo sapiens,Blood,0
10,ACAAATTTATTTCATGAGCGCATATTGAACGCGTTGGCGCCGTACA...,Homo sapiens,Blood,0
13,GAAGGTCTGTGCGGATCGGAATGCCTGTTAGTCTAGAGGCTGGGCG...,Homo sapiens,Blood,0
...,...,...,...,...
973,GTAGTGCGCGTCCACTCGCGTAAGCGCGTAAAGTTGATGCTAGCAT...,Homo sapiens,Skin tissue,0
977,GCGCGAACTATTGTGTAATCGTACCACACGCTTGCGCAAGTAAGCA...,Homo sapiens,Liver tissue,0
982,AGTCACTTTGACTCCACGAGCGTTTGAGTGGCGGTATGTTTGAGCC...,Homo sapiens,Lung tissue,0
993,TTCTCCGGTGGCGTCCTTAATTGCTCACCGAAAGACTGAGACGCAA...,Homo sapiens,Cerebrospinal fluid,0


In [7]:
# Display the Label == 1 subset for a visual sanity check.
microbial_df

Unnamed: 0,Seq,Organism,Location,Label
0,ATAGTACAGCCTGGAAAGCTTTTCGAACTGTCTGCTCTGGCCCATG...,Sulfolobus acidocaldarius,Ocean,1
3,TTCTGCCGCTAAGTGACGCGACTGCCAGCTAGAGAAGGTGCTCCAA...,Oenococcus oeni,Soil,1
4,GGTGGCAATCTGCCCGGTGCTAGCCTATTTACGCTCGTACAAGCCG...,Staphylococcus aureus,Hot spring,1
5,TCATCCGCCTCAATCCTCCAACGCCACCCGCGGCACCTGGCAACGT...,Pseudomonas aeruginosa,Dairy product,1
6,AGTCGAACATTGATAACGACGCTATTGCCGTAGGGGTTTTTCTGTG...,Oenococcus oeni,Human gut,1
...,...,...,...,...
994,TCGACAGGAGCAGCCGTTCGACTTCGGGGAAGCCCAAGGAGCCAAA...,Oenococcus oeni,Human oral cavity,1
995,AAGAGTCTGATATATAGACTCACTCTCCCGCGATACGACGCACCTG...,Sulfolobus acidocaldarius,Human oral cavity,1
996,GCAGAATACACCCGTCTAGTGATCCAGCTGAAGGTCTGGATTGCCC...,Escherichia coli,Soil,1
998,CACGGCTCGACATAAAGGAACGCGCACCACGAGAGTCGAGAAAAGT...,Merdibacter massiliensis,River,1


In [8]:
# Draw the requested number of rows from each class. The fixed random_state
# makes each draw reproducible across runs.
sampled_human = human_df.sample(human_samples_num, random_state=42)
sampled_microbial = microbial_df.sample(microbial_samples_num, random_state=42)

In [9]:
# Stack the two class samples, shuffle all rows (frac=1 returns every row in
# random order, seeded for reproducibility), then replace the original row
# labels with a fresh 0..n-1 index.
sampled_df = (
    pd.concat([sampled_human, sampled_microbial])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [10]:
# Display the combined, shuffled sample.
sampled_df

Unnamed: 0,Seq,Organism,Location,Label
0,ACACTCCTGTGCCCACACGAGTAGCTCATTGAAGGCACTGGCGTCG...,Pseudomonas aeruginosa,River,1
1,GAGAAGTAGAGCCATAGCTGAGACCAGTCGGTTGTGTTACCTATGC...,Sulfolobus acidocaldarius,Plant leaf,1
2,GCGGATAACAATATATTCTTATTCCAGGTGATAGCTCTGCCATCTT...,Escherichia coli,Human gut,1
3,TTCGGCAAGGGAGAAGATCAGGGTCGAGGCTAATTCAACGGATTAA...,Escherichia coli,Plant root,1
4,GTCAACGTCCGGAAAACTTATTCGAGACCATTTCGACCTTGCCGAA...,Oenococcus oeni,Human skin,1
...,...,...,...,...
495,CGACAGGGGGGGTCCATCGCTGATGATTTGGACTTTGTTCTTTGGA...,Escherichia coli,Lake,1
496,GTGATAAGGTATAGCGGATTCTTATGATACCTATAGACTCCAACGG...,Sulfolobus acidocaldarius,Fermented food,1
497,TACTCGTCGGGCGCACTCGGCGGCGGATGAATGCATAGATGCGATT...,Lactobacillus acidophilus,Human skin,1
498,CAATAATACCCGGCACCATGTGGGGCAAGCCGTCACAAAGTGTTCT...,Moraxella catarrhalis,Human skin,1


In [11]:
# Summarise the sample: rows per class, then rows per organism among the
# Label == 1 rows. Each section is a heading, the counts, and a blank line.
print("Label value counts", sampled_df['Label'].value_counts(), "", sep="\n")

print(
    "Organism value counts",
    sampled_df.loc[sampled_df['Label'] == 1, 'Organism'].value_counts(),
    "",
    sep="\n",
)



Label value counts
Label
1    450
0     50
Name: count, dtype: int64

Organism value counts
Organism
Escherichia coli             53
Sulfolobus acidocaldarius    52
Oenococcus oeni              50
Moraxella catarrhalis        49
Clostridium difficile        49
Staphylococcus aureus        45
Merdibacter massiliensis     42
Lactobacillus acidophilus    37
Bacillus subtilis            37
Pseudomonas aeruginosa       36
Name: count, dtype: int64



In [12]:
# Persist the shuffled sample as Parquet in the working directory.
sampled_df.to_parquet(path='sampled_data_10percent_human.parquet')


In [None]:
# Check: reload the saved Parquet file and confirm its contents.

In [13]:
# Read the file written above back from disk and convert it to pandas, so the
# saved artifact itself can be inspected.
table_sample = pq.read_table(source='sampled_data_10percent_human.parquet')
df_sample = table_sample.to_pandas()

# Trailing bare expression so the notebook renders the reloaded frame.
df_sample

Unnamed: 0,Seq,Organism,Location,Label
0,ACACTCCTGTGCCCACACGAGTAGCTCATTGAAGGCACTGGCGTCG...,Pseudomonas aeruginosa,River,1
1,GAGAAGTAGAGCCATAGCTGAGACCAGTCGGTTGTGTTACCTATGC...,Sulfolobus acidocaldarius,Plant leaf,1
2,GCGGATAACAATATATTCTTATTCCAGGTGATAGCTCTGCCATCTT...,Escherichia coli,Human gut,1
3,TTCGGCAAGGGAGAAGATCAGGGTCGAGGCTAATTCAACGGATTAA...,Escherichia coli,Plant root,1
4,GTCAACGTCCGGAAAACTTATTCGAGACCATTTCGACCTTGCCGAA...,Oenococcus oeni,Human skin,1
...,...,...,...,...
495,CGACAGGGGGGGTCCATCGCTGATGATTTGGACTTTGTTCTTTGGA...,Escherichia coli,Lake,1
496,GTGATAAGGTATAGCGGATTCTTATGATACCTATAGACTCCAACGG...,Sulfolobus acidocaldarius,Fermented food,1
497,TACTCGTCGGGCGCACTCGGCGGCGGATGAATGCATAGATGCGATT...,Lactobacillus acidophilus,Human skin,1
498,CAATAATACCCGGCACCATGTGGGGCAAGCCGTCACAAAGTGTTCT...,Moraxella catarrhalis,Human skin,1


In [14]:
# Verify the organism mix of the Label == 1 rows in the frame re-read from
# disk (df_sample), not the in-memory sampled_df, so this check actually
# exercises the saved Parquet file.
print(df_sample[df_sample['Label'] == 1]['Organism'].value_counts())


Organism
Escherichia coli             53
Sulfolobus acidocaldarius    52
Oenococcus oeni              50
Moraxella catarrhalis        49
Clostridium difficile        49
Staphylococcus aureus        45
Merdibacter massiliensis     42
Lactobacillus acidophilus    37
Bacillus subtilis            37
Pseudomonas aeruginosa       36
Name: count, dtype: int64


In [15]:
# Verify the class balance of the frame re-read from disk (df_sample), not the
# in-memory sampled_df, so this check actually exercises the saved Parquet
# file.
print(df_sample['Label'].value_counts())


Label
1    450
0     50
Name: count, dtype: int64
