# Import Packages

In [9]:
# built-in
import os
from os import path

# third-party (install required)
import pandas as pd
from pymodulon.io import load_json_model

# Load Data

## Define Data Paths

In [131]:
# Root data directories: precise2 holds the input data, precise1k receives
# the merged output.
precise2_path, precise1k_path = '../data/precise2', '../data/precise1k'

# The minicoli and ETS inputs live in sub-directories of the precise2 folder.
minicoli_path, ets_path = (
    path.join(precise2_path, subdir) for subdir in ('minicoli', 'ets')
)

## Load Data, Metadata, and QC Stats

In [123]:
# The precise2 and ETS metadata use '*_id' column names; map them to the
# shorter 'project' / 'condition' names.
metadata_column_renames = {'project_id': 'project', 'condition_id': 'condition'}

# --- log-TPM expression tables (first CSV column becomes the index) ---
precise2_log_tpm = pd.read_csv(
    path.join(precise2_path, 'log_tpm.csv'),
    index_col=0,
)
minicoli_log_tpm = pd.read_csv(
    path.join(minicoli_path, 'log_tpm.csv'),
    index_col=0,
)
ets_log_tpm = pd.read_csv(
    path.join(ets_path, 'log_tpm__p2_eep_hmp.csv'),
    index_col=0,
)

# --- sample metadata ---
precise2_metadata = pd.read_csv(
    path.join(precise2_path, 'metadata.tsv'),
    sep='\t',
    index_col=0,
).rename(columns=metadata_column_renames)
minicoli_metadata = pd.read_csv(
    path.join(minicoli_path, 'metadata.tsv'),
    sep='\t',
    index_col=0,
)
# Despite the "qc" in the file name, every sample in this file was passed
# for the purposes of the ETS project analysis.
ets_metadata = pd.read_csv(
    path.join(ets_path, 'metadata_qc.csv'),
    index_col=0,
).rename(columns=metadata_column_renames)

# --- MultiQC statistics ---
# The precise2 stats file also has the EEP samples attached; keep only the
# first 820 rows so those samples are not duplicated by the ETS stats.
# NOTE(review): presumably rows 0-819 are exactly the non-EEP precise2
# samples - this positional cut-off depends on the file's row order; confirm.
precise2_qc_stats = pd.read_csv(
    path.join(precise2_path, 'multiqc_stats.tsv'),
    sep='\t',
    index_col=0,
).iloc[:820]
minicoli_qc_stats = pd.read_csv(
    path.join(minicoli_path, 'multiqc_stats.tsv'),
    sep='\t',
    index_col=0,
)
ets_qc_stats = pd.read_csv(
    path.join(ets_path, 'multiqc_stats.tsv'),
    sep='\t',
    index_col=0,
)

## Isolate ETS Samples to Use

Not all ETS samples are publishable, so only an explicit list of sample IDs is kept.

In [124]:
# ETS samples to keep: ecoli_eep_001 through ecoli_eep_008 and
# ecoli_eep_081 through ecoli_eep_124 (52 sample IDs, zero-padded to 3 digits).
ets_samples_to_include = [
    f'ecoli_eep_{sample_number:03d}'
    for sample_number in (*range(1, 9), *range(81, 125))
]

# Restrict every ETS table to the selected samples: they are selected as
# columns of the log-TPM table and as rows of the metadata and QC tables.
ets_log_tpm_to_use = ets_log_tpm.loc[:, ets_samples_to_include]
ets_metadata_to_use, ets_qc_stats_to_use = (
    ets_metadata.loc[ets_samples_to_include],
    ets_qc_stats.loc[ets_samples_to_include],
)

## Load DDB Object

The DDB data is currently shared only as a pymodulon JSON model. No QC stats are available for it, so all DDB samples are assumed to pass QC.

In [37]:
# The DDB JSON model file sits directly in the precise2 data directory.
ddb_json_path = path.join(precise2_path, '20211006_DDB.json')
ddb_object = load_json_model(ddb_json_path)

In [125]:
# Only samples whose 'project' is one of these are taken from the DDB object.
ddb_projects = ['DDB2', 'DDB3']
in_ddb_project = ddb_object.sample_table['project'].isin(ddb_projects)
ddb_samples = ddb_object.sample_table.loc[in_ddb_project].index

# Expression values for the selected samples
ddb_log_tpm = ddb_object.X[ddb_samples]

# Matching metadata rows, with the old sample ID exposed as the
# 'sample_id' column.
ddb_metadata = ddb_object.sample_table.loc[ddb_samples].rename(
    columns={'old_sample_id': 'sample_id'}
)

# Merge Data

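Merge the precise2, minicoli, ETS, and DDB data into combined expression and metadata tables, and merge the QC stats for the datasets that have them (all but DDB).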

In [126]:
# Expression: start from precise2 and merge each further dataset in on the
# index. merge() defaults to an inner join, so only index labels present in
# every table are kept.
log_tpm_1k = precise2_log_tpm
for next_log_tpm in (minicoli_log_tpm, ets_log_tpm_to_use, ddb_log_tpm):
    log_tpm_1k = log_tpm_1k.merge(
        next_log_tpm, left_index=True, right_index=True
    )

# Metadata: stack the per-dataset tables row-wise.
metadata_tables = [
    precise2_metadata,
    minicoli_metadata,
    ets_metadata_to_use,
    ddb_metadata,
]
metadata_1k = pd.concat(metadata_tables)

# QC stats: stacked the same way; there is no DDB table in this list.
qc_stats_tables = [
    precise2_qc_stats,
    minicoli_qc_stats,
    ets_qc_stats_to_use,
]
multiqc_stats_1k = pd.concat(qc_stats_tables)

# Save Data

In [132]:
# Create the output directory if it is missing: to_csv() does not create
# parent directories and fails when the target folder does not exist.
os.makedirs(precise1k_path, exist_ok=True)

# Write the merged expression, metadata, and QC tables as CSV files.
log_tpm_1k.to_csv(path.join(precise1k_path, 'log_tpm.csv'))
metadata_1k.to_csv(path.join(precise1k_path, 'metadata.csv'))
multiqc_stats_1k.to_csv(path.join(precise1k_path, 'multiqc_stats.csv'))