## check raw corruption and scan loss
*  @Date: May 13th, 2025
*  @Author: maopengzhi@foxmail.com

## Usage Instructions:

* Replace `<dpath>` with your folder path containing raw files
* Requires pLink3 to have completed spectra extraction
* Three error types:
  1. Missing scans
  2. Partially corrupted
  3. Unable to open
* Output file: `raw_info_and_error.csv`
  * `raw_error=1` indicates a corrupted raw file
  * The `rate0` column shows the proportion of missing scans

In [None]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm

pd.set_option('display.max_colwidth', 100) 
pd.set_option('display.width', 100)

In [None]:
# Folder holding the raw files, after pLink3 has finished its extraction
dpath = r'E:\data'

# Every *.pfc table found directly in that folder, as plain string paths
fpaths = list(map(str, Path(dpath).glob('*.pfc')))
print(len(fpaths))

In [None]:
# Read each tab-separated .pfc table and tag its rows with the source file name
df_ls = [
    pd.read_csv(fpath, sep='\t').assign(file=Path(fpath).name)
    for fpath in tqdm(fpaths)
]

# Stack all per-file tables into one frame with a fresh running index
df = pd.concat(df_ls, ignore_index=True)
print(df.shape)
print(df.columns)

In [None]:
# Rows whose SpectrumType is 'MS2'
is_ms2 = df['SpectrumType'] == 'MS2'
df2 = df[is_ms2]

## proportion of MS1 and MS2 spectra per file

In [None]:
# For each file, the share of its rows that belongs to each SpectrumType value
type_by_file = df.groupby(['file'])['SpectrumType']
df_mstype = type_by_file.apply(lambda types: types.value_counts(normalize=True)).reset_index()

# Long format: one row per (file, spectrum type)
df_mstype.columns = ['file', 'SpectrumType', 'ratio']
df_mstype

In [None]:
# Show the MS1 share of every file as a whole-number percentage,
# with row/column truncation switched off for this display only
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    ms1_rows = df_mstype['SpectrumType'] == 'MS1'
    _df = df_mstype[ms1_rows].reset_index(drop=True)
    _df['ratio'] = _df['ratio'].map(lambda x: f'{x:.0%}')
    display(_df)

In [None]:
# output

# Wide format: one row per file, one '<type>_ratio' column per spectrum type
ratio_wide = df_mstype.pivot(index='file', columns='SpectrumType', values='ratio')
ratio_labels = [f"{c}_ratio" for c in ratio_wide.columns]
ratio_wide.columns = ratio_labels

# Turn the 'file' index back into an ordinary column
df_out = ratio_wide.reset_index()
df_out

## scan loss, peaks=0

In [None]:
# Spectra whose NumberofPeaks is 0
no_peaks = df['NumberofPeaks'].eq(0)
df0 = df.loc[no_peaks]

# Count of such spectra per file, ordered by file name
ser0 = df0['file'].value_counts().sort_index()

print(df0.shape)
print(df0['file'].nunique())
print(ser0)

In [None]:
# Zero-peak spectra restricted to MS2
df20 = df0[df0['SpectrumType'] == 'MS2']
print(df20.shape)
# Number of files affected within the MS2 subset (df20), not within all
# zero-peak spectra (df0)
print(df20['file'].nunique())

# Count of zero-peak MS2 spectra per file, ordered by file name
ser0_ms2 = df20['file'].value_counts().sort_index()
print(ser0_ms2)

In [None]:
# Per-file fraction of spectra that have zero peaks
peaks_by_file = df.groupby(['file'])['NumberofPeaks']
df_rate0 = peaks_by_file.apply(lambda peaks: peaks.eq(0).sum() / len(peaks)).reset_index(name='rate0')

# Number of files with at least one zero-peak spectrum
has_empty = df_rate0['rate0'] > 0.0
print(sum(has_empty))

# Those files only, ordered by file name
ser0_rate = df_rate0[has_empty].sort_values(['file'])
ser0_rate

In [None]:
# output

def add_col(x, ser):
    """Look up label ``x`` in ``ser``, falling back to 0 when it is absent.

    Args:
        x: Label to search for in ``ser.index``.
        ser: Object exposing ``.index`` and ``[]`` lookup (a pandas Series here).

    Returns:
        ``ser[x]`` when ``x`` is in ``ser.index``, otherwise ``0``.
    """
    return ser[x] if x in ser.index else 0

# Build the file -> rate0 lookup once, rather than re-running set_index
# for every row inside the lambda
rate0_by_file = ser0_rate.set_index('file')['rate0']

# A file missing from a lookup gets 0 from add_col
df_out['peak0'] = df_out['file'].apply(lambda x: add_col(x, ser0))
df_out['peak0_ms2'] = df_out['file'].apply(lambda x: add_col(x, ser0_ms2))
df_out['rate0'] = df_out['file'].apply(lambda x: add_col(x, rate0_by_file))
df_out

## scan max & number

In [None]:
# Per file: number of distinct ScanNo values and the largest ScanNo
_df = df.groupby(['file'])['ScanNo'].agg(['nunique', 'max'])

# Columns 'n' (distinct count) and 'max', with 'file' as an ordinary column
df_cnt = _df.rename(columns={'nunique': 'n'}).reset_index()

# True when the distinct-scan count equals the largest scan number
df_cnt['equel_max_n'] = df_cnt['n'].eq(df_cnt['max'])
print(df_cnt['equel_max_n'].value_counts())

In [None]:
# Files whose distinct-scan count differs from their largest scan number
df_cnt.loc[~df_cnt['equel_max_n']]

In [None]:
# Files with no distinct scan numbers at all
df_cnt.loc[df_cnt['n'].eq(0)]

In [None]:
# output

# Index the scan statistics by file name so add_col can look them up
_df = df_cnt.set_index('file')

# Copy each statistic into df_out; add_col supplies 0 for files it cannot find
for out_col, stat_col in (('scan_num', 'n'), ('scan_max', 'max'), ('scan_equel', 'equel_max_n')):
    stat = _df[stat_col]
    df_out[out_col] = df_out['file'].apply(lambda x: add_col(x, stat))

# Store the boolean flag as 0/1
df_out['scan_equel'] = df_out['scan_equel'].astype(int)

df_out

## unable to open

In [None]:
# Stems of every .raw file found directly in the data folder
ls_fname_raw = [raw_path.stem for raw_path in Path(dpath).glob('*.raw')]
print(len(ls_fname_raw))

# Stems of the files that contributed rows to df
ls_fstem = [Path(name).stem for name in df['file'].unique()]

# .raw files that have no matching entry in df
ls_fname_error = set(ls_fname_raw).difference(ls_fstem)
print(len(ls_fname_error))

print(ls_fname_error)

In [None]:
# output

# Every file already in df_out contributed rows to df, so mark it as openable
df_out['can_open'] = 1

# One placeholder row per raw file without data: all columns are 0
# (including can_open) except 'file', which gets the expected .pfc name.
# ls_fname_error is built as a set, so sort it to get a reproducible row order.
columns = list(df_out.columns)
file_idx = columns.index('file')
ls_ls = []
for fname in sorted(ls_fname_error):
    ls = [0] * len(columns)
    ls[file_idx] = fname + '.pfc'
    ls_ls.append(ls)

# Only concatenate when there is something to add: concatenating an empty
# frame is deprecated in recent pandas and may alter the result dtypes.
if ls_ls:
    df_out = pd.concat([df_out, pd.DataFrame(ls_ls, columns=columns)], ignore_index=True)

df_out

## incomplete raw

In [None]:
# Stems of files in which at least one spectrum has zero peaks
ls_peak0 = [Path(x).stem for x in df_rate0[df_rate0['rate0'] > 0.0]['file'].unique()]
# Stems of files whose distinct-scan count differs from their largest scan number
ls_incomplete = [Path(x).stem for x in df_cnt[~df_cnt['equel_max_n']]['file'].unique()]

# For each error group print its size, then the running number of distinct
# flagged files accumulated so far
ls_raw = []
for title, names in (
    ('# spectra peaks = 0', ls_peak0),
    ('# pfc incomplete', ls_incomplete),
    ('# unable to open', ls_fname_error),
):
    print('---'*3, title)
    print(len(names))
    ls_raw = list({*ls_raw, *names})
    print(len(ls_raw))

In [None]:
# Replace each file name with its stem (the name without its final extension)
df_out['file'] = df_out['file'].map(lambda name: Path(name).stem)

def is_error(x):
    """Return 1 when a row shows any error condition, otherwise 0.

    Args:
        x: Mapping-like row read through the keys ``'rate0'``,
            ``'scan_equel'`` and ``'can_open'`` (checked in that order).

    Returns:
        ``1`` if ``rate0`` is above 0, or ``scan_equel`` equals 0, or
        ``can_open`` equals 0; ``0`` if none of these hold.
    """
    # `or` short-circuits, so later keys are only read when earlier checks fail
    flagged = x['rate0'] > 0.0 or x['scan_equel'] == 0 or x['can_open'] == 0
    return int(flagged)

# Flag each row with is_error, then derive the zero-peak count outside MS2
# as the total zero-peak count minus the MS2 zero-peak count
df_out = df_out.assign(
    raw_error=lambda frame: frame.apply(is_error, axis=1),
    peak0_ms1=lambda frame: frame['peak0'] - frame['peak0_ms2'],
)

df_out

In [None]:
# How many files carry each raw_error value
error_counts = df_out['raw_error'].value_counts()
print(error_counts)

In [None]:
# df_out = pd.DataFrame(sorted(ls_raw), columns=['file'])

# Write the summary table into the data folder, without the index column
fpath_out = Path(dpath, 'raw_info_and_error.csv')
df_out.to_csv(fpath_out, index=False)