# Rawfile and MaxQuant output folder renaming

- generated using `workflows/metadata`
- all raw files collected ~50,000
- creates lftp upload commands

In [None]:
from pathlib import Path, PurePosixPath
import pandas as pd
import yaml


def rename(fname, new_sample_id, new_folder=None, ext=None):
    """Build the renamed POSIX path for a file.

    Parameters
    ----------
    fname : str or path-like
        Current path; supplies the default extension and the default folder.
    new_sample_id : str
        New file name without extension.
    new_folder : str or path-like, optional
        Target folder. Defaults to the folder of ``fname``.
    ext : str, optional
        Extension including the leading dot. Defaults to the suffix of ``fname``.

    Returns
    -------
    str
        ``<new_folder>/<new_sample_id><ext>`` with forward slashes.

    Raises
    ------
    ValueError
        If ``ext`` is non-empty and is not a dot followed by further characters.
    """
    fname = PurePosixPath(fname)
    if ext is None:
        ext = fname.suffix
    if ext and (not ext.startswith('.') or ext == '.'):
        raise ValueError(f"Invalid suffix {ext!r}")
    folder = fname.parent if new_folder is None else PurePosixPath(new_folder)
    # Append the extension rather than calling `with_suffix`: the latter would
    # replace everything after the last dot of a sample id that contains dots.
    return (folder / f"{new_sample_id}{ext}").as_posix()

## Arguments

In [None]:
# Notebook parameters: input files and output locations.
# NOTE(review): `fn_server_log` is not read in the cells visible here — confirm it is still needed.
fn_rawfile_metadata: str = 'data/rawfile_metadata.csv'  # Machine parsed metadata from rawfile workflow (CSV, two header rows)
fn_mq_summaries: str = 'data/samples_selected_summaries.csv'  # MaxQuant summary files (CSV, first column is the index)
fn_files_selected: str = 'data/samples_selected.yaml'  # selected files based on threshold of identified peptides (keys: threshold, files)
out_folder: str = 'data/rename'  # output folder
fn_server_log: str = 'data/rename/mq_out_server.log'  # server log of all uploaded files

In [None]:
# Resolve the output folder and create it, including any missing parent
# folders (a nested folder would otherwise fail with FileNotFoundError).
out_folder = Path(out_folder)
out_folder.mkdir(parents=True, exist_ok=True)

# Registry of written files: file name -> POSIX path
files_out = {}

### Machine metadata

- read from file using [ThermoRawFileParser](https://github.com/compomics/ThermoRawFileParser)

In [None]:
# The header spans two rows (section, property); the first column is the index
df_meta = pd.read_csv(
    fn_rawfile_metadata,
    header=[0, 1],
    index_col=0,
    low_memory=False,
)

# Parse the creation timestamp and order the files chronologically
date_col = ('FileProperties', 'Content Creation Date')
df_meta[date_col] = pd.to_datetime(df_meta[date_col])
df_meta = df_meta.sort_values(date_col)

msg = f"A total of {len(df_meta)} raw files could be read using the ThermoFisherRawFileParser."
print(msg)
df_meta

In [None]:
# Summary statistics over all columns, displayed with one row per column
meta_stats = df_meta.describe(include='all')
meta_stats.transpose()

# Erda Paths

In [None]:
# Two-level column labels that identify a raw file and its instrument
cols_identifies = [
    ('FileProperties', 'Pathname'),
    ('FileProperties', 'Version'),
    ('FileProperties', 'Content Creation Date'),
    ('InstrumentProperties', 'Thermo Scientific instrument model'),
    ('InstrumentProperties', 'instrument attribute'),
    ('InstrumentProperties', 'instrument serial number'),
    ('InstrumentProperties', 'Software Version'),
    ('InstrumentProperties', 'firmware version'),
]

# Take an explicit copy: later cells assign new columns to this frame, which
# on a plain column selection triggers pandas' SettingWithCopyWarning.
df_meta = df_meta[cols_identifies].copy()
# Flatten the two-level header, keeping only the property name
df_meta.columns = [t[-1] for t in cols_identifies]
df_meta

Replace `tmp/` with `./` (artefact)

In [None]:
# Literal (non-regex) replacement of every `tmp/` occurrence in the path.
# NOTE(review): the replacement is not anchored to the start of the path —
# confirm that `tmp/` can only occur as the leading folder.
df_meta['Pathname'] = df_meta['Pathname'].str.replace('tmp/', './', regex=False)

In [None]:
# Instrument label "<model>_<serial>": <serial> is the part of the serial
# number field after the last '#'; blanks are replaced by hyphens.
_model = df_meta["Thermo Scientific instrument model"].str.replace(' ', '-')
_serial = df_meta["instrument serial number"].str.split('#').str[-1]
df_meta["Instrument_name"] = (_model + '_' + _serial).str.replace(' ', '-')

# Distinct instrument labels, ordered by number of files
df_meta["Instrument_name"].value_counts().index

Create new sample identifier

In [None]:
# Plain name of the creation-date column
date_col = "Content Creation Date"

# Candidate id: creation time at minute resolution followed by the instrument label
_timestamp = pd.to_datetime(df_meta[date_col]).dt.strftime("%Y_%m_%d_%H_%M")
idx_all = (_timestamp + '_' + df_meta["Instrument_name"]).str.replace(' ', '-')

# Flag every member of a group of identical ids (first occurrence included)
mask = idx_all.duplicated(keep=False)
duplicated_sample_idx = idx_all[mask].sort_values()  # duplicated dumps
duplicated_sample_idx

In [None]:
df_meta['new_sample_id'] = idx_all

# Running number of each file within its group of identical ids (0 = first)
_count = df_meta.groupby("new_sample_id").cumcount()
# The first occurrence keeps the plain id, repeats get the suffix "_r<k>".
# The suffix is derived from the integer count: removing the character '0'
# from the stringified count would turn 10 into '1' and clash with repeat 1.
_n = ('_r' + _count.astype('string')).where(_count > 0, '')

df_meta.loc[mask, "new_sample_id"] = df_meta.loc[mask, "new_sample_id"] + _n.loc[mask]

# Show the files that shared an id together with their now distinct ids
df_meta.loc[mask, ["Pathname", "new_sample_id"]]

In [None]:
# Files not flagged by `mask`, i.e. whose generated id was not shared with another file
df_meta.loc[~mask, ["Pathname", "new_sample_id"]]

In [None]:
# Both the original path and the new id must identify exactly one file;
# on failure the message lists the offending values.
assert df_meta["Pathname"].is_unique, (
    "Duplicated raw file paths: "
    + str(df_meta.loc[df_meta["Pathname"].duplicated(keep=False), "Pathname"].tolist())
)
assert df_meta["new_sample_id"].is_unique, (
    "Duplicated new sample ids: "
    + str(df_meta.loc[df_meta["new_sample_id"].duplicated(keep=False), "new_sample_id"].tolist())
)

### Save new paths to disk

In [None]:
# Keep the original location under an explicit name next to the new id
df_meta["Path_old"] = df_meta["Pathname"]

# Display the old path -> new id mapping
df_meta[["Path_old", "new_sample_id"]]

In [None]:
# Display the metadata table with the columns added so far
df_meta

## Selected Files

In [None]:
# Read the selection as UTF-8 explicitly instead of relying on the
# platform's default text encoding.
with open(fn_files_selected, encoding='utf-8') as f:
    files_selected = yaml.safe_load(f)
# The `,d` format requires `threshold` to be an integer
print(f'Threshold: {files_selected["threshold"]:,d}')

In [None]:
# Metadata rows of the selected files (presumably a list of index labels;
# `.loc` raises a KeyError if one of them is missing from the index).
df_meta.loc[files_selected["files"]]

In [None]:
# Rebind `mask`: now only the repeats of an id are flagged, the first occurrence is kept
mask = idx_all.duplicated()
_first_occurrences = df_meta.loc[~mask].index
# Selected files restricted to first occurrences that are present in the metadata
selected = _first_occurrences.intersection(files_selected["files"])
df_meta.loc[selected]

In [None]:
def build_instrument_name(s):
    """Join the whitespace-separated tokens of several fields into one name.

    The fields are processed in order and every token is kept only the first
    time it occurs, also when it is repeated within a single field. Missing
    fields (None/NaN) are skipped; other non-string fields are converted with
    ``str``. The kept tokens are joined with ``_`` and the markers ``Slot_#``
    and ``slot_#`` are removed from the result.

    Parameters
    ----------
    s : iterable
        Field values, e.g. one row of a DataFrame.

    Returns
    -------
    str
        The combined name; empty if no token was found.
    """
    seen = set()
    parts = []
    for value in s:
        if not isinstance(value, str):
            if pd.isna(value):
                continue  # missing field contributes nothing
            value = str(value)
        for token in value.split():
            if token not in seen:
                seen.add(token)
                parts.append(token)
    return '_'.join(parts).replace('Slot_#', '').replace('slot_#', '')


# Preview the combined names on a random subset of at most 20 rows;
# the size is capped because `sample` raises when asked for more rows
# than the table holds.
(df_meta[
        [
            "Thermo Scientific instrument model",
            "instrument attribute",
            "instrument serial number",
        ]
]
    .sample(min(20, len(df_meta)))
    .apply(build_instrument_name, axis=1)
)

In [None]:
# Metadata of the selected files, including old path and new id, as CSV
fname = out_folder.joinpath('selected_old_new_id_mapping.csv')
files_out.update({fname.name: fname.as_posix()})  # register the output file
df_meta.loc[selected].to_csv(fname)
fname

### OS rename

In [None]:
# Old path and new id of the selected files
df_meta.loc[selected, ["Path_old", "new_sample_id"]]

In [None]:
# Write the old paths of the first three selected files, one per line.
# NOTE(review): `.iloc[:3]` limits the list to three files — confirm this is
# intended and not a leftover from testing.
_paths_to_checksum = df_meta.loc[selected, "Path_old"].iloc[:3]
_paths_to_checksum.to_csv(
    out_folder / 'rawfiles_to_checksum.txt',
    header=False,
    index=False,
)

Save summaries for selected files

In [None]:
# Re-index the MaxQuant summaries of the selected files by their new sample id
df_summaries = pd.read_csv(fn_mq_summaries, index_col=0)
_new_ids = df_meta.loc[selected, 'new_sample_id']
df_summaries = df_summaries.loc[selected]
df_summaries = df_summaries.rename(index=_new_ids)
df_summaries.to_csv(out_folder / 'mq_summaries.csv')
del df_summaries  # release the table once it is written

## Put files on PRIDE FTP server

rename using `new_sample_id`

### LFTP commands - raw files

The `-f` option allows passing commands from a file.
The file needs at least an `open` command as its first line to log in to an FTP server.
For PRIDE one additionally needs to `cd` to the correct folder:
```bash
> open ...
> cd ...
```
To allow parallel commands, use the runtime setting
```bash
>>> cat ~/.lftprc
set cmd:parallel 2
```

Create folders on pride for raw files

In [None]:
# Remote target folder of each file: one folder per instrument label
df_meta["folder_raw"] = "./raw_files/" + df_meta["Instrument_name"]
df_meta["folder_raw"].unique()

fname = out_folder / 'raw_file_directories.txt'

# One `mkdir -p` command per distinct folder used by the selected files
_folders = df_meta.loc[selected, "folder_raw"].drop_duplicates()
commands = 'mkdir -p ' + _folders
commands.to_csv(fname, header=False, index=False)

Create upload commands for the raw files (could be combined with the folder creation above)

In [None]:
# One `put <old path> -o ./raw_files/<instrument>/<new id>.raw` per selected file.
# NOTE(review): paths are inserted unquoted — a path containing whitespace
# would presumably be split into several arguments by lftp; confirm that
# no such paths occur.
_selected_meta = df_meta.loc[selected]
commands = (
    'put '
    + _selected_meta['Path_old'].astype('string')
    + ' -o '
    + "./raw_files/"
    + _selected_meta["Instrument_name"]
    + '/'
    + _selected_meta['new_sample_id'] + '.raw'
)
# Preview at most 10 random commands (`sample` raises when asked for more
# rows than available). The lines are printed directly: `to_csv(sep=' ')`
# would wrap each command in double quotes because it contains the separator.
print('\n'.join(map(str, commands.sample(min(10, len(commands))))))

write all to file

In [None]:
fname = out_folder / 'lftp_commands_rawfiles.txt'
# Write one command per line as plain text. `to_csv` applies CSV quoting and
# would wrap any command containing a comma or a double quote in quotes,
# so the file would no longer hold the command as built.
with open(fname, 'w', encoding='utf-8') as f:
    f.writelines(f'{command}\n' for command in commands)

### LFTP commands - MaxQuant output

Create upload commands of MaxQuant output folders to pride using mirror

- `mq_out` folder
- move from `Sample ID` folder into `new_sample_id` on erda

In [None]:
# One `mirror` command per selected file: upload the folder `mq_out/<index label>`
# to `./MQ_tables/<instrument>/<new id>`, skipping PDF files.
_selected_meta = df_meta.loc[selected]
commands = (
    "mirror -R --only-missing --log log_lftp_mirror.log --exclude-glob *.pdf "  # command
    + "mq_out/" + _selected_meta.index  # source
    + " ./MQ_tables/" + _selected_meta["Instrument_name"] + "/" + _selected_meta["new_sample_id"]  # dest
)

# Preview at most 10 random commands; the size is capped because `sample`
# raises when asked for more rows than available.
print('\n'.join(map(str, commands.sample(min(10, len(commands))))))

write all to file

In [None]:
fname = out_folder / 'lftp_commands_mq_output.txt'
# Write one command per line as plain text. `to_csv` applies CSV quoting and
# would wrap any command containing a comma or a double quote in quotes,
# so the file would no longer hold the command as built.
with open(fname, 'w', encoding='utf-8') as f:
    f.writelines(f'{command}\n' for command in commands)