# Import Packages

In [47]:
# built-in
from datetime import datetime, timedelta
import os.path
import subprocess

# third-party (pip install required)
import matplotlib
from matplotlib.cm import get_cmap
from matplotlib.lines import Line2D
import matplotlib.pyplot as plt
import pandas as pd
from pymodulon.io import load_json_model

# Global matplotlib styling, applied in a single update:
#   - fonttype 42 for both PDF and PS output
#   - black text, axis labels and tick marks
#   - Arial as the sans-serif face, with sans-serif as the default family
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'text.color': 'black',
    'axes.labelcolor': 'black',
    'xtick.color': 'black',
    'ytick.color': 'black',
    'font.sans-serif': "Arial",
    'font.family': "sans-serif",
})

# Load Data

In [48]:
# Locations used throughout the notebook (data paths are relative to the notebook's directory)
PRECISE_PATH = '../../data/precise/'    # folder holding precise.json.gz
P1K_PATH = '../../data/precise1k/'      # folder holding precise1k.json.gz and the expression CSVs
FIG_PATH = 'summarize_dataset_figs'     # figure output folder name

In [49]:
# Load the PRECISE and PRECISE-1K model objects from their gzipped JSON files
precise1, p1k = (
    load_json_model(os.path.join(data_dir, json_name))
    for data_dir, json_name in (
        (PRECISE_PATH, 'precise.json.gz'),
        (P1K_PATH, 'precise1k.json.gz'),
    )
)

In [50]:
# Read the PRECISE-1K log-TPM and counts tables; the first CSV column is used as the row index
p1k_log_tpm, p1k_counts = (
    pd.read_csv(os.path.join(P1K_PATH, csv_name), index_col=0)
    for csv_name in ('log_tpm.csv', 'counts.csv')
)

In [53]:
# Project label to export; compared against the sample table's 'project' column
# and used in output file/folder names
proj: str = 'tcs'

In [56]:
# Local staging folders for the GEO upload.
# p1k_megafolder: where the FASTQ files are read from before being moved
p1k_megafolder = '/Users/cam/Downloads/p1k_geo_submission_dec162022'
# sub_folder: per-project destination for the FASTQ files and exported CSVs
sub_folder = f'/Users/cam/Downloads/{proj}_geo_submission_feb092023'

In [57]:
# Display the resolved per-project submission folder path as a sanity check
sub_folder

'/Users/cam/Downloads/tcs_geo_submission_feb092023'

In [64]:
# GEO submission metadata: one row per sample, giving its GEO title ('geo_title')
# and the names of its two FASTQ files ('r1', 'r2').
# NOTE(review): the file's first column comes through as 'Unnamed: 0' — it looks like a
# saved index; consider reading with index_col=0 if nothing relies on that column.
proj_geo_meta = pd.read_csv(filepath_or_buffer='/Users/cam/Downloads/proj_geo.csv')
proj_geo_meta.head(n=5)

Unnamed: 0,geo_title,r1,r2
0,"del atoC, M9, LiCl/acetoacetate, rep 1",2CS_CHX_AtoCKO10mMLi-AcetR1_S7_R1_001.fastq.gz,2CS_CHX_AtoCKO10mMLi-AcetR1_S7_R2_001.fastq.gz
1,"del atoC, M9, LiCl/acetoacetate, rep 1",2CS_CHX_AtoCKO10mMLi-AcetR2_S8_R1_001.fastq.gz,2CS_CHX_AtoCKO10mMLi-AcetR2_S8_R2_001.fastq.gz
2,"del atoC, M9, LiCl, rep1",2CS_CHX_AtoCKO10mMLiCl2R1_S9_R1_001.fastq.gz,2CS_CHX_AtoCKO10mMLiCl2R1_S9_R2_001.fastq.gz
3,"del atoC, M9, LiCl, rep2",2CS_CHX_AtoCKO10mMLiCl2R2_S10_R1_001.fastq.gz,2CS_CHX_AtoCKO10mMLiCl2R2_S10_R2_001.fastq.gz
4,"del atoC, M9, rep1",2CS_CHX_AtoCKOM9R1_S5_R1_001.fastq.gz,2CS_CHX_AtoCKOM9R1_S5_R2_001.fastq.gz


In [32]:
# Sample IDs belonging to the selected project, in sample-table order
in_project = p1k.sample_table['project'] == proj
samp_ids_ordered = p1k.sample_table.loc[in_project].index

# Column labels for the exported matrices. They are applied positionally, so
# proj_geo_meta rows must be in the same order as samp_ids_ordered — TODO confirm.
geo_titles = proj_geo_meta['geo_title']

# Subset both expression matrices to the project's samples
proj_log_tpm = p1k_log_tpm[samp_ids_ordered]
proj_counts = p1k_counts[samp_ids_ordered]

# Relabel the sample columns with their GEO titles
proj_log_tpm.columns = geo_titles
proj_counts.columns = geo_titles

# Write both matrices into the submission folder
proj_log_tpm.to_csv(os.path.join(sub_folder, f'{proj}_log_tpm.csv'))
proj_counts.to_csv(os.path.join(sub_folder, f'{proj}_counts.csv'))

In [73]:
# move files to submission folder; also get checksums
def _md5_checksum(path):
    """Return the MD5 digest that the `md5` command reports for `path`.

    Raises subprocess.CalledProcessError if the `md5` command exits non-zero.
    """
    # the digest is the text following ' = ' in the command's output
    md5_stdout = subprocess.check_output(['md5', path]).decode('utf-8')
    return md5_stdout.split(' = ')[1].strip()


# maps FASTQ file name -> MD5 digest for every r1/r2 file listed in proj_geo_meta
checksum_dict = {}
for _, row in proj_geo_meta.iterrows():
    r1_file = os.path.join(p1k_megafolder, row['r1'])
    r2_file = os.path.join(p1k_megafolder, row['r2'])

    # Each file is hashed from its own path. Previously the r2 entry was computed
    # from r1_file, so every R2 checksum duplicated its R1 partner's digest.
    checksum_dict[row['r1']] = _md5_checksum(r1_file)
    checksum_dict[row['r2']] = _md5_checksum(r2_file)

    # Move only after both checksums succeed; this is NOT idempotent — once moved,
    # re-running the cell fails because the source files are gone from p1k_megafolder.
    subprocess.check_output(['mv', r1_file, sub_folder])
    subprocess.check_output(['mv', r2_file, sub_folder])

In [75]:
# Save the file-name -> MD5 digest mapping as a two-column CSV
pd.Series(data=checksum_dict).to_csv(path_or_buf='/Users/cam/Downloads/checksum.csv')

## TCS Temp 

In [62]:
# Rows of the sample table that belong to project 'tcs' and have a missing 'GEO' value
tcs_no_geo = p1k.sample_table.loc[
    lambda table: (table['project'] == 'tcs') & (table['GEO'].isna())
]

In [63]:
# Export the metadata rows of the 'tcs' samples that have no GEO value
tcs_no_geo.to_csv(path_or_buf='/Users/cam/Downloads/tcs_no_geo_md.csv')

In [65]:
# Restrict the export to the 'tcs' samples that have no GEO value yet
samp_ids_ordered = tcs_no_geo.index

# Column labels for the exported matrices. They are applied positionally, so
# proj_geo_meta rows must be in the same order as samp_ids_ordered — TODO confirm.
geo_titles = proj_geo_meta['geo_title']

# Subset both expression matrices to those samples
proj_log_tpm = p1k_log_tpm[samp_ids_ordered]
proj_counts = p1k_counts[samp_ids_ordered]

# Relabel the sample columns with their GEO titles
proj_log_tpm.columns = geo_titles
proj_counts.columns = geo_titles

# Write both matrices into the submission folder (same file names as the per-project export)
proj_log_tpm.to_csv(os.path.join(sub_folder, f'{proj}_log_tpm.csv'))
proj_counts.to_csv(os.path.join(sub_folder, f'{proj}_counts.csv'))

## Lab Meeting Temp

In [143]:
# Shape of the per-project sample tally; its single entry is the number of distinct projects
(
    p1k.sample_table
    .loc[:, 'project']
    .value_counts()
    .shape
)

(45,)