# Normalization of intensities
Because the intensity columns of df contain non-numeric data after the metadata is appended, we have to subset both columns and rows. This brings some inconveniences, including the need to convert the data to floats after subsetting.

Perhaps the different normalizations should be isolated from each other

In [27]:
import numpy as np
import pandas as pd

In [28]:
%run 82_label_modifying.ipynb

  interactivity=interactivity, compiler=compiler, result=result)


## 75th percentile normalization
Find the 75th percentile of each sample (column) and subtract it from that sample's values  
Intensities have already been log-transformed, thus we subtract instead of dividing 

In [29]:
def percentille_normalization(df, q=0.75):
    """
    Shift intensities of every sample column by the q-th quantile of that column, third quartile by default.
    The quantile is subtracted, not divided out. Relies on samples constant which should be defined earlier
    :param df: df - dataframe with intensities of samples
    :param q: float - quantile which is used as the reference level
    :return: df - new dataframe with normalized intensities, passed df is not modified
    """
    normalized = df.copy()
    # One reference value per sample column (axis=0 collapses the rows)
    reference = normalized[samples].quantile(q, axis=0)
    normalized[samples] = normalized[samples] - reference
    return normalized

Example of assigning normalized data to the appropriate part of df. Tedious.

`df.loc[df.index[:-meta], samples] = percentille_normalization(df.loc[:, samples].iloc[:-meta].astype(dtype='float'), 0.75)`

So I wrote a function to reduce the typing

In [30]:
def normalize(df, function, *args, **kwargs):
    """
    Apply normalization function to subset of df, which is determined by samples and meta constants which should be
    defined earlier. The subset is all rows except the last meta ones, restricted to samples columns, and it is
    converted to float before it is passed to function
    Modify input df
    :param df: df - dataframe with all data
    :param function: function - function which takes df and return df
    :param args: sequence - positional parameters which are passed to function after df
    :param kwargs: dict - dict with name: value of parameters to function
    :return: None, df is modified in place
    """
    # Count data rows explicitly: slicing with [:-meta] selects nothing when meta == 0,
    # which would silently skip normalization for df without metadata rows
    n_data_rows = max(len(df.index) - meta, 0)
    data_rows = df.index[:n_data_rows]
    df.loc[data_rows, samples] = function(df.loc[data_rows, samples].astype(dtype='float'), *args, **kwargs)


def find_diff(df):
    """
    Count rows of df whose index label contains no digit. These rows are treated as metainformation
    Assumes that only peak names contains numbers and that metadata rows are located at the bottom of df
    Used for finding constant meta
    :param df: df - dataframe with metadata
    :return: int - number of rows which are taken by metadata
    """
    total_rows = df.shape[0]
    # Rows with at least one digit in the label are considered peaks
    peak_rows = df.filter(regex=r'\d+', axis=0)
    return total_rows - peak_rows.index.shape[0]

## Internal standard normalization
Partial information, positive mode data should be added.

Standards in negative mode:
* PG
* PE
* ceramide

In [31]:
# m/z values of the standard adducts (H and Ac-H), keyed by standard name
standard_mzs = dict(
    pg=[709.55189, 769.57302],
    pe=[740.54648, 800.56761],
    ceramide=[529.53310, 589.55423],
)

The previously written `normalize()` is not appropriate here because the following functions need access to the 'mz' column of the dataframe. That is why the function below exists.

I was tired, so the architecture here is not the best

In [32]:
def normalize_with_access_to_all_cols(df, function, *args, **kwargs):
    """
    Apply normalization function to all columns of the data rows of df. Data rows are all rows except the last
    meta ones, where meta is a constant which should be defined earlier. Conversion to float is attempted
    before the call and columns which fail to convert are left as they are (errors='ignore')
    Modify input df
    :param df: df - dataframe with all data
    :param function: function - function which takes df and return df
    :param args: sequence - positional parameters which are passed to function after df
    :param kwargs: dict - dict with name: value of parameters to function
    :return: None, df is modified in place
    """
    # Count data rows explicitly: slicing with [:-meta] selects nothing when meta == 0,
    # which would silently skip normalization for df without metadata rows
    n_data_rows = max(df.shape[0] - meta, 0)
    df.iloc[:n_data_rows] = function(df.iloc[:n_data_rows].astype(dtype='float', errors='ignore'), *args, **kwargs)

In [33]:
def standard_normalization(df, standard_mzs, precision=5):
    """
    Normalize df by intensities of internal standards: find standard peaks, select one set of standard
    intensities and normalize intensities of samples by it. An error is raised if no standards were found.
    Possible refinement: retry with reduced precision up to some value and return original df after that.
    :param df: df - dataframe with all data
    :param standard_mzs: dict - standard names and lists of their mzs
    :param precision: int - number of digits to round mzs before comparison
    :return: df - df with normalized intensities by standard intensities
    """
    # Find standards -> select suitable one -> normalize by its intensities
    found_standards = find_standards(df, standard_mzs, precision)
    return std_normalization(df, select_standard(found_standards))


def std_normalization(df, standard):
    """
    Subtract standard intensities from intensities of samples in df. Relies on samples constant which should be
    defined earlier
    :param df: df - dataframe with all data
    :param standard: series - series with standard intensities
    :return: df - new dataframe with samples normalized by standard intensities, passed df is not modified
    """
    normalized = df.copy()

    # Flatten standard intensities of samples to 1D np array
    offsets = standard[samples].values.reshape(-1)
    # Columns which can not be converted to float are left as they are
    intensities = normalized[samples].astype('float', errors='ignore')
    # Each sample column gets its own standard value subtracted
    normalized[samples] = intensities.subtract(offsets, axis=1)
    return normalized


def select_standard(standards):
    """
    Pick one set of standard intensities from all found standards. Currently it is the column-wise maximum
    over all standard rows
    :param standards: df - dataframe with standards intensities
    :return: series - series with selected intensities
    """
    # Columns which can not be converted to float are left as they are
    as_float = standards.astype('float', errors='ignore')
    return as_float.max()


def find_standards(df, standard_mzs, precision=5):
    """
    Find standard's peaks in df given dictionary with their mzs. Rows are collected in the order of
    standard_mzs, a row which matches several mzs is included once per match
    :param df: df - dataframe with data, should contain 'mz' column
    :param standard_mzs: dict - standard names and lists of their mzs
    :param precision: int - number of digits to round mzs before comparison, 5 by default
    :return: df - dataframe with rows from original corresponding to standards
    :raises AssertionError: if no row of df matches any of standard mzs
    """
    # Round peak mzs once instead of on every comparison
    rounded_mzs = df['mz'].round(precision)

    # Compare mzs of all standards with peak's and collect rows with mz equal to standard mz
    matches = []
    for mzs in standard_mzs.values():
        for mz in mzs:
            is_standard = rounded_mzs == np.round(mz, precision)
            if is_standard.any():
                matches.append(df[is_standard])

    # Check whether some standards are present. Raised explicitly because assert statements
    # are stripped under python -O; exception type is kept for existing callers
    if not matches:
        raise AssertionError('No standards was found in df!\nTry less strict precision')
    # DataFrame.append was removed in pandas 2.0, pd.concat is the supported replacement
    return pd.concat(matches)

## Mass normalization

Functions `normalize()`, `normalize_with_access_to_all_cols()` and `normalize_by_mass()` are very similar - the main difference is the range of df which is passed to and accessible by the normalization function. Perhaps they should be refactored into one function with a 'range' argument

In [34]:
def normalize_by_mass(df, mass_row_name='mass'):
    """
    Normalize by mass the intensities of samples with known mass and write them back to df. Affected part of df
    is determined by samples, samples_with_mass and meta constants which should be defined earlier. Samples
    should be an Index object.
    Perhaps these functions should take all that constants as arguments instead
    Modify input df
    :param df: df - dataframe with all data
    :param mass_row_name: str - name of row with mass data
    :return: None, df is modified in place
    """
    # Intensities of samples with known mass, shifted by log of their masses
    normalized = mass_norm(prepare_intensities(df), prepare_mass(df, mass_row_name))
    # All rows except the last meta ones, only columns of samples with known mass
    peak_rows = df.index[:-meta]
    mass_columns = samples[samples_with_mass]
    df.loc[peak_rows, mass_columns] = normalized


def mass_norm(samples_intensities, masses):
    """
    Normalize intensities by mass: subtract natural logarithm of mass of a sample from every intensity of
    this sample
    :param samples_intensities: df - dataframe with intensities of samples with known mass in float format
    :param masses: series - series with mass data, which is converted to float
    :return: df - dataframe with intensities normalized by mass
    """
    # Mass is not log transformed yet while intensities are, so bring it to the same scale first
    log_masses = masses.apply(np.log)
    # Series is aligned with columns of dataframe
    return samples_intensities.sub(log_masses, axis='columns')


def prepare_mass(df, mass_row_name='mass'):
    """
    Extract masses of samples with known mass from df. Relies on samples_with_mass constant which should be
    defined earlier
    :param df: df - dataframe with merged metadata
    :param mass_row_name: str - name of row with mass data
    :return: series - series with mass data, which is converted to float
    """
    # Keep only entries of samples_with_mass which are True, their labels are names of samples with mass
    known = samples_with_mass[samples_with_mass]
    return df.loc[mass_row_name, known.index].astype('float')


def prepare_intensities(df):
    """
    Extract intensities of samples with known mass from df. Relies on meta, samples and samples_with_mass
    constants which should be defined earlier
    :param df: df - dataframe with merged metadata
    :return: df - subset of passed into df with intensities, which are converted to floats
    """
    # All rows except the last meta ones, columns of samples only
    peak_rows = df.index[:-meta]
    sample_block = df.loc[peak_rows, samples]
    # Leave only samples with known mass and convert to float
    return sample_block.loc[:, samples_with_mass].astype('float')

## Example of usage with writing normalized dataset

In [35]:
# # Load data
# name = 'with_meta_log_transformed_substituted_NA_cleaned_control_cleaned_isotopes_cleaned_contaminants_xs_annotated_rats_neg.csv'
# df = pd.read_csv(name, index_col=0)

# # Preliminaries
# # Find number of rows with metadata in df
# meta = find_diff(df)
# # Convert samples to index object
# samples = pd.Index(samples)
# # Whether samples contain mass
# samples_with_mass = ~df.loc['mass', samples].isna()


# normalize(df, percentille_normalization)

In [36]:
# Load data; the first column of the csv file is used as the row index
name = 'with_meta_log_transformed_substituted_NA_cleaned_control_cleaned_isotopes_cleaned_contaminants_xs_annotated_rats_neg.csv'
df = pd.read_csv(name, index_col=0)

# Preliminaries: meta, samples and samples_with_mass are global constants
# which are read by the normalization functions defined above
# Find number of rows with metadata in df
meta = find_diff(df)
# Convert samples to index object
# NOTE(review): samples is not assigned anywhere in this notebook before this line - presumably it is
# defined by the %run of 82_label_modifying.ipynb; confirm
samples = pd.Index(samples)
# Whether samples contain mass: boolean series, True where 'mass' row is not NA
samples_with_mass = ~df.loc['mass', samples].isna()

# Normalizations are applied one after another, each of them modifies df in place
# Normalizing by 3rd quartile
normalize(df, percentille_normalization)
# Normalizing by standard intensities
normalize_with_access_to_all_cols(df, standard_normalization, standard_mzs)
# Normalize by mass
normalize_by_mass(df)


# Obsolete
# df = percentille_normalization(df)
# df = standard_normalization(df, stands)

# Write to a file; output name is the input name with 'normalized_' prefix
df.to_csv(f'normalized_{name}')

  interactivity=interactivity, compiler=compiler, result=result)
