# RawFiles Database

- overview of raw files.

In [None]:
from pathlib import Path, PurePosixPath
from collections import namedtuple

import pandas as pd

from src import config

# One entry of the raw-file listing: file stem, full path and size in bytes.
RawFile = namedtuple('RawFile', 'name path bytes')
fp = config.FOLDER_DATA / config.FN_ALL_RAW_FILES
data = []
with open(fp) as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Blank lines (e.g. a trailing empty line in the listing) carry
            # no file entry and would otherwise raise an IndexError below.
            continue
        # The last whitespace-separated field is used as the path and the
        # fifth one as the size in bytes.
        # NOTE(review): this looks like an `ls -l`-style listing - confirm.
        path = Path(fields[-1])
        data.append(RawFile(path.stem, path, int(fields[4])))

# Index the table by the file stem ('name'); stems may occur more than once.
data = pd.DataFrame.from_records(
    data, columns=RawFile._fields, index=RawFile._fields[0])
data.head()

In [None]:
# File size expressed in units of 1024 ** 3 bytes.
bytes_per_gb = 1024 ** 3
data['size_gb'] = data['bytes'].div(bytes_per_gb)
data

## Finding duplicates

- add a numeric index column to identify samples

In [None]:
# Running number 0..n-1 per row: unlike the file name, this identifies each
# listed file unambiguously.
n_files = data.shape[0]
data['num_index'] = pd.RangeIndex(start=0, stop=n_files)

In [None]:
# Count how often each file name occurs. This is computed unconditionally so
# that `non_unique` is always defined (empty if every name is unique) and the
# cells below can be run in either case.
non_unique = data.index.value_counts()
non_unique = non_unique[non_unique > 1]

if data.index.is_unique:
    print('Only unique files in index.')
else:
    # should this be browseable?
    display('Non-unique files', non_unique)

In [None]:
# Show all rows whose file name occurs more than once (nothing if all unique).
None if data.index.is_unique else data.loc[non_unique.index]

Files with the same name and the same size are considered the same.

In [None]:
# All rows whose file name occurs more than once (empty if names are unique).
# Selected via the index itself so this cell also runs when no duplicates exist.
candidates = data[data.index.duplicated(keep=False)]

# A file is a duplicate only if both its name AND its size were seen before.
# Comparing sizes alone would also flag differently named files that merely
# happen to have the same number of bytes.
mask_to_remove = pd.Series(
    candidates.set_index('bytes', append=True).index.duplicated(keep='first'),
    index=candidates.index,
    name='bytes',
)

# Positional boolean selection: the first occurrence of every (name, size)
# pair is kept, all later occurrences are marked for removal.
data_to_remove = candidates[mask_to_remove.to_numpy()]
data_to_remove

In [None]:
# Total size of the files marked for removal.
reclaimable_gb = data_to_remove['size_gb'].sum()
print(f"Save {reclaimable_gb:1.0f} GB disk space")

In [None]:
# Drop the duplicates by their running number, because the file name alone
# cannot tell the copies apart; afterwards restore the name as index.
ids_to_drop = data_to_remove.set_index('num_index').index
data_by_id = data.reset_index().set_index('num_index')
data_unique = data_by_id.drop(ids_to_drop).set_index('name')
data_unique

Export the file paths to a file in order to remove them, e.g. using `rm $(<filenames.txt)` following [this description](https://stackoverflow.com/a/18618543/9684872).

In [None]:
# Output file: same name as the raw-file listing, with '_duplicated' appended
# to the stem, placed in the processed-data folder.
fp = config.FOLDER_PROCESSED / f'{config.FN_ALL_RAW_FILES}'
fp = fp.parent / f'{fp.stem}_duplicated{fp.suffix}'

# Write one POSIX-style path per line with plain LF line endings.
# `newline='\n'` disables newline translation, so no carriage return ends up
# in the file on any platform; a trailing '\r' would become part of the file
# name when the list is consumed by a shell command such as `rm $(<file)`.
with open(fp, 'w', newline='\n') as f:
    for _path in data_to_remove['path']:
        _path = PurePosixPath(_path)
        f.write(f'{_path}\n')

In [None]:
import matplotlib.pyplot as plt
# Two panels side by side: a wide histogram and a narrow box plot of the
# sizes of the de-duplicated files.
fig, axes = plt.subplots(
    ncols=2,
    figsize=(16, 8),
    gridspec_kw={"width_ratios": [5, 1], "wspace": 0.3},
)
ax_hist, ax_box = axes
sizes_gb = data_unique['size_gb']
sizes_gb.plot.hist(bins=30, ax=ax_hist)
sizes_gb.plot.box(ax=ax_box)
fig.savefig(config.FIGUREFOLDER / 'raw_file_overview.pdf')

## For quantified samples
- show scatter plot between sample size and number of quantified peptides