# RawFiles Database

- overview of raw files.

Created data and figures

```bash
'data/all_raw_files_dump_duplicated.txt'
'data/all_raw_files_dump_unique.csv' # csv file
'Figures/raw_file_overview.pdf'
```

and uses 

```bash
'data/all_raw_files_dump.txt'
```



In [None]:
from pathlib import Path, PurePosixPath
from collections import namedtuple
from functools import partial

import pandas as pd

import logging
from src.logging import setup_logger
from src import config
from vaep import utils

# Container object on which all file paths created in this notebook are stored.
cfg = config.Config()

# Configure the 'vaep' logger via the project helper; `fname_base` names this notebook.
logger = setup_logger(logging.getLogger('vaep'),
                      fname_base='04_all_raw_files.ipynb')

# One record per listed raw file: file name without suffix, full path, size in bytes.
RawFile = namedtuple('RawFile', 'name path bytes')
cfg.FN_ALL_RAW_FILES = config.FOLDER_DATA / config.FN_ALL_RAW_FILES

# Parse the whitespace-separated listing: the last field is taken as the path and
# the fifth field (index 4) as the size in bytes.
# NOTE(review): this looks like an `ls -l`-style dump - confirm. Paths containing
# whitespace would be truncated to their last token.
data = []
with open(cfg.FN_ALL_RAW_FILES) as f:
    for line in f:
        fields = line.split()
        if not fields:
            # skip blank lines (e.g. a trailing newline) instead of raising IndexError
            continue
        path = Path(fields[-1])
        data.append(RawFile(path.stem, path, int(fields[4])))

# Table indexed by file name ('name'), ordered by path.
data = pd.DataFrame.from_records(
    data, columns=RawFile._fields, index=RawFile._fields[0])
data.sort_values(by='path', inplace=True)
data.head()

In [None]:
# File size in units of 1024**3 bytes, stored next to the raw byte count.
_BYTES_PER_GB = 1024 ** 3
data['size_gb'] = data['bytes'] / _BYTES_PER_GB
data

## Finding duplicates

- add a numeric index column to identify samples

In [None]:
# Running number 0..len(data)-1 per row: file names (the index) can repeat,
# so this column identifies a single row unambiguously.
data['num_index'] = pd.RangeIndex(stop=len(data))

In [None]:
# Count how often every file name occurs in the index and keep the names seen more than once.
# Computed before branching so that `non_unique` is defined for later cells
# even when all file names are unique (it is then simply empty).
non_unique = data.index.value_counts()
non_unique = non_unique[non_unique > 1]
if data.index.is_unique:
    print('Only unique files in index.')
else:
    # should this be browseable?
    display('Non-unique files', non_unique)
    print(f'Number of files with more than 2 duplicates: {(non_unique > 2).sum()}')

For groups of files with the same name, first remove the ones in the `MNT` folder:

In [None]:
# Split all rows whose file name occurs more than once into
# - `data_to_remove`: copies whose path contains '\MNT'
# - `non_unique_remaining`: all other copies (inspected further below).
# Both start as empty frames with the columns of `data`, so that the following
# cells also run when every file name is unique.
data_to_remove = data.iloc[:0].copy()
non_unique_remaining = data.iloc[:0].copy()
if not data.index.is_unique:
    _data_to_remove = data.loc[non_unique.index]
    _in_mnt, _not_in_mnt = [], []
    for _, g in _data_to_remove.groupby(level=0):
        mask = ['\\MNT' in str(x) for x in g.path]
        _in_mnt.append(g[mask])
        _not_in_mnt.append(g[[not in_mnt for in_mnt in mask]])
    # pd.concat replaces DataFrame.append, which is not available in pandas >= 2.0
    data_to_remove = pd.concat(_in_mnt)
    non_unique_remaining = pd.concat(_not_in_mnt)
    # every duplicated row has to end up in exactly one of the two tables
    assert len(_data_to_remove) == len(non_unique_remaining) + len(data_to_remove)
data_to_remove

The main chunk of duplicated files is in `MNT` subfolders.

In [None]:
# Display the duplicated file names whose copies are not located in an `MNT` folder.
non_unique_remaining

Files with the same name and the same size are considered the same.

In [None]:
# Mark a row for removal if a later row has the same file name AND the same size
# (the last copy is kept). Comparing sizes alone would also flag files with
# different names that merely happen to have the same number of bytes.
_name_and_size = pd.DataFrame({
    'name': non_unique_remaining.index,
    'bytes': non_unique_remaining['bytes'].to_numpy(),
})
mask_to_remove = pd.Series(_name_and_size.duplicated(keep='last').to_numpy(),
                           index=non_unique_remaining.index)
# pd.concat replaces DataFrame.append, which is not available in pandas >= 2.0
data_to_remove = pd.concat([data_to_remove, non_unique_remaining[mask_to_remove]])
# Sanity check against re-running this cell; the expected count is specific to the current dump.
assert len(data_to_remove) == 1037 , 'File appended twice?'
data_to_remove

In [None]:
# Report the summed `size_gb` and the number of files that are marked for removal.
print(f"Save {data_to_remove['size_gb'].sum():1.0f} GB disk space by deleting {len(data_to_remove)} files.")

In [None]:
# Drop the rows marked for removal. Rows are matched on `num_index`, as file names
# are not unique; afterwards the table is indexed by file name again.
_num_index_to_drop = data_to_remove.set_index('num_index').index
data_unique = (
    data
    .reset_index()
    .set_index('num_index')
    .drop(_num_index_to_drop)
    .set_index('name')
)
data_unique

Show files which are duplicated, but have different sizes:

In [None]:
# Files which share a name but differ in size: show every row whose name occurs
# more than once, or nothing if all names are unique.
_name_repeated = data_unique.index.duplicated(keep=False)
None if data_unique.index.is_unique else data_unique.loc[_name_repeated]

Save unique files

In [None]:
# Derive the output path from the input dump path via the project helpers and
# write the de-duplicated table as csv.
_unique_fname_part = config.build_df_fname(data_unique, 'unique')
cfg.FN_ALL_RAW_FILES_UNIQUE = utils.append_to_filepath(
    cfg.FN_ALL_RAW_FILES, _unique_fname_part, new_suffix='csv')
data_unique.to_csv(cfg.FN_ALL_RAW_FILES_UNIQUE)

Export the file paths to a file in order to remove them, e.g. using `rm $(<filenames.txt)`, following [this description](https://stackoverflow.com/a/18618543/9684872).

```bash
# remove empty lines
cat all_raw_files_dump_duplicated.txt | grep .raw > all_raw_files_dump_duplicated_cleaned.txt
ls `cat all_raw_files_dump_duplicated_cleaned`
rm -i `cat all_raw_files_dump_duplicated_cleaned`
rm -i $(<all_raw_files_dump_duplicated_cleaned.txt)
```

In [None]:
# Path of the text file listing all files that can be deleted.
cfg.FN_ALL_RAW_FILES_DUPLICATED = utils.append_to_filepath(cfg.FN_ALL_RAW_FILES, 'duplicated')

# One POSIX-style path per line, terminated by a plain '\n'.
# `newline='\n'` switches off newline translation, so the file has the same line
# endings on every platform. Writing '\r\n' in text mode yields '\r\r\n' on Windows
# and leaves a trailing '\r' on each path elsewhere, which breaks `rm $(<file)`.
with open(cfg.FN_ALL_RAW_FILES_DUPLICATED, 'w', newline='\n') as f:
    for _path in data_to_remove['path']:
        _path = PurePosixPath(_path)
        f.write(f'{_path}\n')

In [None]:
import matplotlib.pyplot as plt
# Distribution of file sizes: wide histogram on the left, narrow box plot on the right.
_grid_layout = {"width_ratios": [5, 1], "wspace": 0.3}
fig, axes = plt.subplots(ncols=2, gridspec_kw=_grid_layout, figsize=(16, 8))
_ax_hist, _ax_box = axes
_sizes_gb = data_unique['size_gb']
_sizes_gb.plot.hist(bins=30, ax=_ax_hist)
_sizes_gb.plot(kind='box', ax=_ax_box)

# Store the figure path on `cfg` and save the figure there.
cfg.raw_file_overview = config.FIGUREFOLDER / 'raw_file_overview.pdf'
fig.savefig(cfg.raw_file_overview)

## Find fractionated samples for raw files

- fractionated samples need to be processed together

In [None]:
import ipywidgets as widgets

# Collects every query string that was searched for successfully.
queries = set()

def find_indices_containing_query(query, X):
    """Return the rows of X whose index label contains `query`, sorted by index.

    `query` is passed to `str.contains` on the index. After a successful
    lookup the query is recorded in the module-level set `queries`.
    """
    selected = X.loc[X.index.str.contains(query)].sort_index()
    queries.add(query)
    return selected

def get_unique_stem(query, index:pd.Index):
    """Return the sorted, distinct name stems found left of `query`.

    Each label is split at `query` and the part before the first match is kept.
    From that part everything after the last underscore (and the underscore
    itself) is dropped. Fractionated samples seem to be named by fraction type,
    with the last field indicating the fraction.
    """
    left_of_query = index.str.split(query).str[0]
    stems = left_of_query.str.rsplit('_', n=1).str[0]
    return sorted(set(stems))

def show_fractions(stub:str, df):
    """Display all rows of df whose index label contains `stub` and their count.

    The stub is printed first (as `repr`), followed by the matching rows and
    a line with the number of matches.
    """
    is_match = df.index.str.contains(stub)
    subset = df[is_match]
    print(repr(stub))
    display(subset)
    display(f'N: {len(subset)}')

In [None]:
# File names of the de-duplicated table (its index).
file_names = data_unique.index

# Bind `data_unique` as `X`, so that only the query has to be passed from here on.
find_indices_containing_query = partial(find_indices_containing_query, X=data_unique)

In [None]:
# Pattern matching both `frac` and `Frac` in the file names.
q = '[Ff]rac'
df_selected = find_indices_containing_query(q)
df_selected.index

In [None]:
# Sorted, distinct name stems left of the `frac` marker for all selected files.
frac_unique = get_unique_stem(q, df_selected.index)

In [None]:
# Stems for which the current approach of splitting the file name at `frac` does not work.
# In these names `frac` denotes the total number of fractions (3, 6, 8, 12, 24, 46).
frac_special_cases = [
    # continue with samples below 2019 (select in DropDown below)
    '20180508_QE3_nLC5_DBJ_DIAprot_HELA_500ng_GPF',
    '20180528_QE5_Evo2_DBJ_DIAprot_HeLa_500ng',
    '20190108_QE7_Evo1_DBJ_SA_LFQpho_HELA_PACs_200ug',  # spelled `LFQpho` (s missing)
    '20190108_QE7_Evo1_DBJ_SA_LFQphos_HELA_PAC_200ug',
    '20190108_QE7_Evo1_DBJ_SA_LFQphos_HELA_PAC_300ug',
    '20190108_QE7_Evo1_DBJ_SA_LFQphos_HELA_PAC_400ug',
    '20190212_QE5_Evo1_DBJ_LFQprot',
    '20190314_QE3_DBJ_Evo2_LFQphos_Hela_200ug_StageTip',
    '20190314_QE3_DBJ_Evo2_LFQphos_Hela_380ug_StageTip',
    '20190314_QE3_DBJ_Evo2_LFQphos_Hela_380ug_StagetTip',  # same stem, spelled `StagetTip`
    '20190402_QE3_Evo1_DBJ_DIAprot_HELA',
    '20190402_QE3_Evo1_DBJ_LFQprot_HELA',
    '20190430_QE3_Evo2_DBJ_HELA_14cmCol_60degrees_5min',
    '20190430_QE3_Evo2_DBJ_LFQprot_HELA-14cmCol_44min',
    '20190507_QE5_Evo1_DBJ_LFQprot_Subcell_HeLa_Ctrl',
    '20190507_QE5_Evo1_DBJ_LFQprot_Subcell_library_HeLa_Ctrl_Ani_Mix',
    '20190622_EXP1_Evo1_AMV_SubCell-library-HeLa_21min-30000',
    '20190628_EXP1_Evo1_AMV_SubCell-library-HeLa_21min-30000',
]

# The special cases are excluded here and handled separately. The remaining keys
# can be used directly to create the list of inputs.
frac_unique = sorted(set(frac_unique).difference(frac_special_cases))

In [None]:
# Interactive browser: picking a stem in the drop-down shows all selected files containing it.
# `df_selected` is bound as `df`, so the widget only has to supply `stub`.
show_fractions = partial(show_fractions, df=df_selected)
w_data = widgets.Dropdown(options=frac_unique, index=0)
out_sel = widgets.interactive_output(show_fractions, {'stub': w_data})
_children = [w_data, out_sel]
widgets.VBox(_children)
#stub, export

- `frac12` indicates 12 splits. If there are more, some of them were re-measured, e.g. `0190920_QE3_nLC3_MJ_pSILAC_HeLa_48h_Frac01_Rep3_20190924081042`


In [None]:
# Display all query strings searched for so far.
queries

## For quantified samples
- show scatter plot between sample size and number of quantified peptides

## Meta data for all samples

In [None]:
# Reload `src.analyzers` before importing from it, so that edits to the module
# are picked up without restarting the kernel.
import importlib

import src.analyzers

importlib.reload(src.analyzers)
from src.analyzers import AnalyzePeptides

analysis = AnalyzePeptides(cfg.FN_ALL_RAW_FILES_UNIQUE) # ToDo: Add numbers to file names
analysis.df

In [None]:
# NOTE(review): `add_metadata` is defined in `src.analyzers`; presumably it creates
# `analysis.df_meta`, which is used for the profiling report below - confirm.
analysis.add_metadata(add_prop_not_na=False)

The metadata has two cases fewer due to duplicates with different file sizes (see above).

In [None]:
# Show every row of `analysis.df` whose index label occurs more than once.
analysis.df.loc[analysis.df.index.duplicated(False)] # keep the larger one

### Profiling report
using pandas-profiling library

In [None]:
from pandas_profiling import ProfileReport
# Build the profiling report for the metadata table and display it as the cell output.
profile = ProfileReport(analysis.df_meta, title="Pandas Profiling Report")
profile

## cfg

In [None]:
# Overview of all attributes (e.g. the file paths set in this notebook) stored on `cfg`.
vars(cfg) # return a dict which is rendered differently in ipython