# 2a Clean metadata

In this notebook, we will clean our metadata. Primarily, we will be de-duplicating our data in preparation for __Mash__ filtration & clustering

## Setup

In [1]:
import os
import numpy as np
import pandas as pd

import plotly.express as px

In [2]:
downloaded_species_summary = pd.read_pickle('data/interim/genome_summary_1b.pickle')
downloaded_species_metadata = pd.read_pickle('data/interim/genome_metadata_1b.pickle')


display(
    downloaded_species_summary.shape,
    downloaded_species_summary.head(),
    downloaded_species_metadata.shape,
    downloaded_species_metadata.head()
)

(257, 20)

Unnamed: 0,genome_id,genome_name,taxon_id,genome_status,genome_length,gc_content,contig_l50,contig_n50,chromosomes,plasmids,contigs,patric_cds,refseq_cds,trna,rrnacoarse_consistency,fine_consistency,checkm_completeness,checkm_contamination,genome_qualitydate_created,date_modified
0,1010840.4,Streptococcus pyogenes MGAS1882,1010840,Complete,1781029,38.0,1,1,1,0.0,1,1727,0,57,,99.5,100.0,0.0,,2015-03-16T03:17:09.594Z
1,1048264.3,Streptococcus pyogenes HKU QMH11M0907901,1048264,Complete,1908100,38.45,1,1,1,,1,1909,1865,67,,99.9,100.0,0.9,,2016-01-17T15:29:01.552Z
2,1150773.3,Streptococcus pyogenes JRS4,1150773,Complete,1811968,38.63,1,1,1,,1,1811,1671,67,,99.8,100.0,0.0,,2016-01-17T16:03:54.402Z
3,1150773.4,Streptococcus pyogenes JRS4,1150773,Complete,1811124,38.64,1,1,1,,1,1886,1890,66,,99.9,100.0,0.0,,2016-03-01T06:31:23.641Z
4,1207470.4,Streptococcus pyogenes M1 476,1207470,Complete,1831079,38.5,1,1,1,0.0,1,1929,1849,57,,97.9,100.0,5.0,,2015-03-16T03:17:09.594Z


(257, 66)

Unnamed: 0,genome_id,genome_name,organism_name,taxon_id,genome_status,strain,serovar,biovar,pathovar,mlst,...,motility,sporulation,temperature_range,optimal_temperature,salinity,oxygen_requirement,habitat,disease,comments,additional_metadata
0,1010840.4,Streptococcus pyogenes MGAS1882,Streptococcus pyogenes MGAS1882,1010840,Complete,MGAS1882,,,,MLST.Streptococcus_pyogenes.172,...,,,,,,,Host,,-,
1,1048264.3,Streptococcus pyogenes HKU QMH11M0907901,,1048264,Complete,HKU QMH11M0907901,,,,MLST.Streptococcus_pyogenes.36,...,,,,,,,,,Clinical use of next generation sequencing for...,
2,1150773.3,Streptococcus pyogenes JRS4,,1150773,Complete,JRS4,serovar emm6,,,MLST.Streptococcus_pyogenes.37,...,,,,C,,,,,We report the complete genome assemblies of th...,collected_by:Rockefeller University Lancefield...
3,1150773.4,Streptococcus pyogenes JRS4,,1150773,Complete,JRS4,,,,MLST.Streptococcus_pyogenes.37,...,No,,,,,,,Pharyngitis,Complete genome sequence of the highly invasiv...,
4,1207470.4,Streptococcus pyogenes M1 476,Streptococcus pyogenes M1 476,1207470,Complete,476,,,,MLST.Streptococcus_pyogenes.28,...,,,,,,,,Toxic shock syndrome,We report the completely annotated genome sequ...,


## De-duplicate entries

### Ensure `biosample_accession` is unique & drop duplicates

In [3]:
downloaded_species_metadata = downloaded_species_metadata.drop_duplicates(subset=['biosample_accession'])

display(
    downloaded_species_metadata.shape,
    downloaded_species_metadata.head()
)

(253, 66)

Unnamed: 0,genome_id,genome_name,organism_name,taxon_id,genome_status,strain,serovar,biovar,pathovar,mlst,...,motility,sporulation,temperature_range,optimal_temperature,salinity,oxygen_requirement,habitat,disease,comments,additional_metadata
0,1010840.4,Streptococcus pyogenes MGAS1882,Streptococcus pyogenes MGAS1882,1010840,Complete,MGAS1882,,,,MLST.Streptococcus_pyogenes.172,...,,,,,,,Host,,-,
1,1048264.3,Streptococcus pyogenes HKU QMH11M0907901,,1048264,Complete,HKU QMH11M0907901,,,,MLST.Streptococcus_pyogenes.36,...,,,,,,,,,Clinical use of next generation sequencing for...,
2,1150773.3,Streptococcus pyogenes JRS4,,1150773,Complete,JRS4,serovar emm6,,,MLST.Streptococcus_pyogenes.37,...,,,,C,,,,,We report the complete genome assemblies of th...,collected_by:Rockefeller University Lancefield...
3,1150773.4,Streptococcus pyogenes JRS4,,1150773,Complete,JRS4,,,,MLST.Streptococcus_pyogenes.37,...,No,,,,,,,Pharyngitis,Complete genome sequence of the highly invasiv...,
5,1235829.3,Streptococcus pyogenes A20,Streptococcus pyogenes A20,1235829,Complete,A20,,,,MLST.Streptococcus_pyogenes.28,...,,,,,,,,Necrotizing faciitis,A clinical strain was isolated from a Necrotiz...,


### (Optional) Ensure `assembly_accession` is unique

Most species have strains which do not have any value for this. For those species it is worth skipping this step. For our example, most strains have information for this column so we will apply this de-duplication too

In [4]:
num_na = downloaded_species_metadata['assembly_accession'].isna().sum()

print(f"{num_na} out of {downloaded_species_metadata.shape[0]} are NaNs: {int(100*num_na/downloaded_species_metadata.shape[0])}%")

13 out of 253 are NaNs: 5%


In [5]:
downloaded_species_metadata = downloaded_species_metadata.drop_duplicates(subset=['assembly_accession'])

downloaded_species_metadata.shape

(241, 66)

In [6]:
downloaded_species_summary = downloaded_species_summary.loc[downloaded_species_metadata.index]

downloaded_species_summary.shape

(241, 20)

## Save files

In [7]:
# Save files
downloaded_species_summary.to_pickle('data/interim/genome_summary_2a.pickle')
downloaded_species_metadata.to_pickle('data/interim/genome_metadata_2a.pickle')
