In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import scipy.stats
import numpy as np
import statsmodels.stats.multitest
import datetime
import time
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()
import itertools
import random

In [None]:
# Read the metabolite table from CSV and use the PLASMA_ID column as the row index.
mtb_pos_df_full = pd.read_csv(r'FR_rLC_pos_dedupnormtrim.csv').set_index('PLASMA_ID')
mtb_pos_df_full.shape

In [None]:
# For every column of the loaded table, compute the fraction of rows (samples)
# in which the value is missing, and keep a {column name: fraction} lookup.
numb_samples, numb_mtbs = mtb_pos_df_full.shape
samplefrac_missing = mtb_pos_df_full.isna().sum().div(numb_samples)
frac_missing_dict = {
    mtb: frac
    for mtb, frac in zip(samplefrac_missing.index, samplefrac_missing.values)
}

In [None]:
def _plot_missing_fraction_hist(samplefrac_missing, frac_missing_threshold, zoom, numb_mtbs):
    """Plot the full and the y-zoomed histogram of per-column missing fractions."""
    fig, axes = plt.subplots(ncols=2, nrows=1, figsize=(15,5))
    axes[0].set_title('Full Distribution', pad=25, size=26)
    axes[1].set_title('Zoomed', pad=25, size=26)
    sns.histplot(samplefrac_missing, ax=axes[0], binwidth=0.05)
    sns.histplot(samplefrac_missing, ax=axes[1], binwidth=0.01)
    # the zoomed panel caps the y-axis at a fraction `zoom` of the column count
    axes[1].set_ylim(0, zoom*numb_mtbs)
    axes[0].set_ylabel('Number of Metabolites', labelpad=25, size=22)
    axes[1].set_ylabel('')
    axes[0].set_xlabel("Fraction of Samples in which\nMetabolites are Missing", labelpad=15, size=22)
    axes[1].set_xlabel("Fraction of Samples in which\nMetabolites are Missing", labelpad=15, size=22)
    # dashed vertical line marks the removal threshold on both panels
    axes[0].axvline(x=frac_missing_threshold, linestyle='--')
    axes[1].axvline(x=frac_missing_threshold, linestyle='--')
    plt.show()


def filter_MtbsMissingInManySamples(mtb_df, frac_missing_threshold=0.9, zoom=0.01):
    """
    Remove features (mtbs / pks) that are missing in too many samples.

    Rows are samples, columns are features (mtbs). It is ok for the frame to
    carry extra columns such as a sample id, as long as those columns contain
    no missing values (they then have a missing fraction of 0 and are kept).

    Parameters
    ----------
    mtb_df : pandas.DataFrame
        Samples x features table; missing values are detected with ``isna()``.
    frac_missing_threshold : float, default 0.9
        A column is removed when the fraction of samples in which it is
        missing is greater than or equal to this value; columns strictly
        below the threshold are kept.
    zoom : float, default 0.01
        Fraction of the total number of columns used as the y-axis upper limit
        of the zoomed histogram.

    Returns
    -------
    pandas.DataFrame
        ``mtb_df`` restricted to the retained columns, in their original order.

    Side effects
    ------------
    Shows a two-panel histogram of the missing fractions and prints how many
    columns were kept.
    """
    numb_samples = mtb_df.shape[0]
    numb_mtbs = mtb_df.shape[1]

    # per-column fraction of samples with a missing value
    samplefrac_missing = mtb_df.isna().sum() / numb_samples

    _plot_missing_fraction_hist(samplefrac_missing, frac_missing_threshold, zoom, numb_mtbs)

    # Strict '<': a column sitting exactly on the threshold is removed, as the
    # contract states ("missing in frac_missing_threshold or greater fraction").
    mtbs_keepornot = samplefrac_missing < frac_missing_threshold

    # Select by position with a boolean mask rather than by label, so the
    # selection stays correct even if column labels are duplicated.
    filt_mtb_df = mtb_df.loc[:, mtbs_keepornot.to_numpy()]

    print('kept {} out of {} mtbs ({}%)'.format(filt_mtb_df.shape[1], numb_mtbs, 
                                                   round(100*filt_mtb_df.shape[1]/numb_mtbs, 2)))

    return filt_mtb_df

In [None]:
# Drop metabolites that are missing in too many samples (missing-fraction
# threshold = 0.5). The filter is the function defined in this notebook; no
# `ufunc` module is imported here, so it is called directly.
mtb_pos_df_inSampFilt = filter_MtbsMissingInManySamples(mtb_df=mtb_pos_df_full, frac_missing_threshold=0.5)

In [None]:
def impute_MissingValues(df, seed=None, min_divider=6):
    """
    Fill missing values with random draws just below each column's minimum.

    Rows are samples, columns are features (mtbs). It is ok for the frame to
    carry extra columns such as a sample id, as long as those columns contain
    no missing values (columns without missing values are left untouched).

    For every column with missing values, each missing entry is replaced by a
    value drawn uniformly between ``m / min_divider`` and ``m``, where ``m`` is
    the smallest observed (non-missing) value of that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples x features table. It is not modified; a copy is returned.
    seed : int or None, default None
        Set a seed if the filled-in values should be reproducible.
    min_divider : number, default 6
        Divider applied to the column minimum to get the other bound of the
        sampling interval.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the missing values filled in.

    Raises
    ------
    ValueError
        If a column consists only of missing values, so no minimum exists.
    """
    df_copy = df.copy()

    # One local generator for the whole frame. Re-seeding inside the loop would
    # hand every column the identical sequence of draws (imputed values would be
    # perfectly correlated across columns), and seeding the global NumPy RNG
    # would silently change random state for the rest of the notebook.
    rng = np.random.RandomState(seed)

    for c in tqdm(df_copy.columns):
        missing_mask = df_copy[c].isna()
        numb_missing_vals = int(missing_mask.sum())
        if numb_missing_vals > 0:
            observed = df_copy[c][~missing_mask]
            if observed.empty:
                raise ValueError(
                    'cannot impute column {!r}: all values are missing'.format(c)
                )
            m = observed.min()
            random_val_vec = rng.uniform(low=m/min_divider, high=m, size=numb_missing_vals)
            # positional boolean mask: correct even when index labels repeat
            df_copy.loc[missing_mask.to_numpy(), c] = random_val_vec
    return df_copy

In [None]:
# Impute the remaining missing values with a fixed seed so the filled-in values
# are reproducible. The imputer is the function defined in this notebook; no
# `ufunc` module is imported here, so it is called directly.
mtb_pos_df_inSampFiltImp = impute_MissingValues(mtb_pos_df_inSampFilt, seed=5)

In [None]:
# Write the filtered and imputed table to CSV (row index included).
imputed_csv_path = 'FRrLCposDedupnormtrim_inSampFiltAndImputed.csv'
mtb_pos_df_inSampFiltImp.to_csv(path_or_buf=imputed_csv_path)