# Join information from metadata to main table

In [22]:
import numpy as np
import pandas as pd
from itertools import combinations, chain, product
import string
import re

In [23]:
%run 2_columns_part.ipynb

## Tissue attribute
Add a row with tissue names to the main df. Blanks and washes are also annotated in this category. I am not sure how precise the QC annotations are, and I doubt whether blanks and washes should be annotated at all.

### Enumeration of possible tissue encodings
This list is not exhaustive.

In [24]:
# Brain region names and abbreviations as they may appear in sample names.
# Entries carry no surrounding whitespace, so they also match names where the
# region is followed by a separator (e.g. 'amygdala_1').
brain_regions = ['precuneus', 'amygdala', 'hippocampus', 'caudate', 'putamen',
                 'thalamus', 'hypothalamus', 'cb', 'pfc', 'pmc', 'claustrum', 'striatum',
                 'hippocamp','cb-gm', 'ba40a', 'ba7p', 'ba7a', 'ba29/30', 'ba37-amt', 'ba47', 'ba23a',
                 'pop', 'ba18/19p', 'ba21p', 'ba6p', 'ba6prc', 'ba17p', 'ba4', 'ba41/42',
                 'ba17a', 'ba3/1/2', 'ba37-pmt', 'ba37m', 'ba8', 'ba6m', 'insa', 'ba23p',
                 'ba20p', 'ba22a', 'ba18/19a', 'ba22p', 'ba7m', 'ba6a', 'ba40p', 'ba44',
                 'ba20a', 'ba9', 'ba9m', 'ba8m', 'ba21a', 'ba10', 'ba10fp', 'ba32g', 'ba39',
                 'ba25', 'ba38', 'ba46', 'ba45', 'ba10m', 'ba31', 'ba24', 'ba32', 'ba11',
                 'pirctx', 'amg', 'insp', 'entctx', 'ca3/dg', 'nacc', 'caud', 'put', 'dentn',
                 'supcll', 'pulth', 'rn', 'sn', 'gp', 'ca1', 'sub', 'vath', 'mdth', 'hyp',
                 'vlth', 'ec', 'ic', 'cca', 'ccp', 'cb-wm']
other_tissues = ['brain', 'plasma', 'muscle', 'liver', 'epithelium',
                 'endoneurium', 'perineurium', 'epineurium', 'vascular',
                 'connective', 'lymphoid', 'meristem']
tech = ['wash', 'blank', 'qc']
some_punct = ['.', '_', '|', '$', '#', '@']

# create_labels keeps the FIRST label found as a substring of the sample name.
# A short code listed before a longer one that contains it ('ba4' before
# 'ba41/42', 'cb' before 'cb-gm', 'thalamus' before 'hypothalamus') would make
# the longer one unreachable, so regions are ordered longest first.
# The sort is stable: equally long codes keep their original relative order.
brain_regions = sorted(brain_regions, key=len, reverse=True)

# Create combinations of brain and region separated by something
brain_encodings = [''.join(x) for x in product(['brain'], some_punct, brain_regions)]

# List of tissue names; group order defines matching priority
tissues = brain_encodings + other_tissues + brain_regions + tech

In [25]:
def create_labels_tissue(samples, labs=tissues, name='tissue', punct=string.punctuation):
    """
    Label every sample with its tissue and return the labels as a named series
    :param samples: iterable - collection of samples names in original dataframe
    :param labs: iterable - candidate labels to look for in the sample names, the tissues list by default
    :param name: str - name given to the resulting series, tissue by default
    :param punct: iterable - collection of possible separators in original names
    :return: series - tissue label of each sample, indexed by sample name
    """
    # The matching itself is delegated to create_labels; only the name is set here
    tissue_labels = create_labels(samples, labs, punct)
    tissue_labels.name = name
    return tissue_labels


def create_labels(samples, labs, punct=string.punctuation):
    """
    Create series with new names of samples
    A sample gets the first label from labs that occurs as a substring of its
    lower-cased name; samples matching no label get NaN.
    :param samples: iterable - collection of samples names in original dataframe
    :param labs: iterable - collection of new appropriate names of samples which express some category, e.g. tissue
    :param punct: iterable - collection of possible separators in original names
    :return: series - series with new names of samples, indexed by sample name (repeated sample names collapse into one entry)
    """
    # samples is traversed twice (matching, then building the series) and labs
    # once per sample, so materialise both: one-shot iterators would otherwise
    # be exhausted after the first pass and silently yield empty/NaN results
    samples = list(samples)
    labs = list(labs)

    # Create list to store labels
    res = []

    # Find appropriate name of type for sample
    # Make it '_' separated and add it to list with labels
    # Add NA if no matches with labs were found
    for sample in samples:
        # Lower-case once per sample instead of once per candidate label
        lowered = sample.lower()
        matched = next((lab for lab in labs if lab in lowered), None)
        res.append(np.nan if matched is None else punct_check(matched, punct))

    # Create a series
    return pd.Series(dict(zip(samples, res)))


def punct_check(label, punct):
    """
    Replace every separator from punct that occurs in label by '_'
    :param label: str - label name
    :param punct: iterable - collection of possible separators
    :return: str - cleaned label; returned unchanged when it contains no separator
    """
    # Apply each separator in turn instead of returning after the first hit,
    # so a label mixing separators (e.g. 'brain.cb-gm') is fully normalised
    for mark in punct:
        # An empty separator is "in" every string and would pad the label
        # with underscores, so it is ignored
        if mark and mark in label:
            label = label.replace(mark, '_')
    return label

In [26]:
# Redundant: 'blank' and 'wash' are already part of the tissues list (via tech)
# labs[blanks] = 'blank'
# labs[washes] = 'wash'

## Age, mass attributes

In [27]:
# Metainformation load
# The path is relative to the notebook's working directory.
# extract_meta later searches this table's column names for the id, age and mass/weight columns.
metadata = pd.read_csv('Rats_vitamin-D_ms-order__.csv')

In [28]:
def compose_metadata(df, metadata):
    """
    Build the metadata rows aligned with the columns of the main df
    :param df: df - original dataframe
    :param metadata: df - support dataframe with metadata
    :return: df - only the trailing rows of the merged frame: the id row plus one row per extracted metadata column
    """
    # Metadata columns, indexed by the identifier
    meta_columns = extract_meta(metadata)
    # Samples become rows so that the id can serve as a merge key
    samples_as_rows = add_id(df).T

    # Left merge keeps every sample; samples without a matching id get NaN
    merged = samples_as_rows.merge(meta_columns, left_on='id', right_index=True, how='left')
    # Back to the original orientation (samples as columns)
    restored = merged.T

    # The appended rows sit at the bottom: 'id' followed by the metadata columns
    n_rows = meta_columns.shape[1] + 1
    return restored.iloc[-n_rows:, :]


def add_metadata(df, metadata):
    """
    Append the id row and the extracted metadata rows to df
    :param df: df - original dataframe
    :param metadata: df - support dataframe with metadata
    :return: df - original df with several added rows from metadata
    """
    # Metadata columns, indexed by the identifier
    meta_columns = extract_meta(metadata)
    # Samples become rows so that the id can serve as a merge key
    samples_as_rows = add_id(df).T

    # Left merge keeps every sample; samples without a matching id get NaN
    merged = samples_as_rows.merge(meta_columns, left_on='id', right_index=True, how='left')
    # Transpose back so samples are columns again
    return merged.T
    

def extract_meta(metadata):
    """
    Pick the age, mass and id columns out of metadata
    Columns are located by case-insensitive search of their names.
    :param metadata: df - dataframe with metainformation to extract
    :return: df - columns 'age' and 'mass', indexed by the id column
    """
    # Locate each column by a pattern on its name
    age = extract_column(metadata, re.compile(r'age', re.I))
    mass = extract_column(metadata, re.compile(r'mass|weight', re.I))
    ids = extract_column(metadata, re.compile(r'id|ms', re.I))

    # Put the three series side by side and index them by the identifier
    combined = pd.concat([ids, age, mass], axis=1)
    indexed = combined.set_index(ids.name)
    # Normalise the column names regardless of how they are spelled in the file
    return indexed.rename(columns={age.name: 'age', mass.name: 'mass'})


def extract_column(df, pat):
    """
    Extract the first column of df whose name matches the re pattern
    :param df: df - dataframe
    :param pat: re - compiled re (DataFrame.filter also accepts a pattern string)
    :return: series - the first column, in df order, whose name matches pat
    :raises IndexError: if no column name matches pat
    """
    matched = df.filter(regex=pat)
    # Without this check the failure surfaces as an opaque
    # "positional indexer is out-of-bounds" error; name the pattern instead
    if matched.shape[1] == 0:
        raise IndexError(f'no column name matches pattern {getattr(pat, "pattern", pat)!r}')
    return matched.iloc[:, 0]


def add_id(df):
    """
    Add row with id to df
    :param df: df - original df
    :return: df - df with row 'id'
    """
    # Add identifiers
    # NOTE(review): the ids come from the notebook-level `samples` variable,
    # not from df.columns - presumably both hold the same sample names; confirm
    ids = extract_id(samples)
    # DataFrame.append was removed in pandas 2.0. Concatenating the series as a
    # one-row frame aligns it on the column labels in the same way; columns
    # without an id get NaN in the new row
    return pd.concat([df, ids.to_frame().T])
    
    
def extract_id(samples):
    """
    Pull the identifier that links the dataframes out of each sample name
    :param samples: iterable - collection with names of samples
    :return: series - named 'id', holding the first 'ms<digits>' occurrence (case-insensitive) of each sample that has one
    """
    pat = re.compile(r'ms\d+', re.I)

    # Pair each name with its first match; names without a match are left out
    searched = ((sample, pat.search(sample)) for sample in samples)
    found = {sample: hit.group() for sample, hit in searched if hit}

    # Create the named series in one step
    return pd.Series(found, name='id')

## Example of usage with writing appended dataset

In [30]:
# Load data
# `name` is reused below to build the output file name.
# The first CSV column becomes the row index of df.
name = 'log_transformed_substituted_NA_cleaned_control_cleaned_isotopes_cleaned_contaminants_xs_annotated_rats_neg.csv'
df = pd.read_csv(name, index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [31]:
# Add tissue labels
labs = create_labels_tissue(samples, punct=some_punct)
# DataFrame.append was removed in pandas 2.0; concatenating the series as a
# one-row frame adds the same 'tissue' row, aligned on the column labels
df = pd.concat([df, labs.to_frame().T])
# Add id, age, and mass information as rows
df = add_metadata(df, metadata)

# Write to file
df.to_csv(f'with_meta_{name}')

This variant appears to work correctly, so the section below is unnecessary.

## Another variant - separate meta from main df

In [334]:
# Keep the metadata rows in a separate frame instead of appending them to df.
# NOTE(review): if the cell above has run, df already holds 'tissue', 'id', 'age'
# and 'mass' rows, and compose_metadata appends an 'id' row again via add_id -
# confirm this is meant to run on the freshly loaded df.
meta = compose_metadata(df, metadata)

In [339]:
# Transposed view: one row per column of df, with id, age and mass as columns
meta.T

Unnamed: 0,id,age,mass
mz,,,
mzmin,,,
mzmax,,,
rt,,,
rtmin,,,
rtmax,,,
npeaks,,,
samples,,,
X2018.06.07_ACNblank.1,,,
X2018.06.07_ACNblank.2,,,


In [53]:
# Preview the first rows of the main table
df.head()

Unnamed: 0,mz,mzmin,mzmax,rt,rtmin,rtmax,npeaks,samples,X2018.06.07_ACNblank.1,X2018.06.07_ACNblank.2,...,X2018.06.07._rats_ExtrBlank.8,X2018.06.07._rats_ExtrBlank.9,after.wash.1,after.wash.2,blank_ACNblank7,blank_ACNblank8,blank_ACNblank9,isotopes,adduct,pcgroup
19,124.005,124.003,124.005,32.5442,26.5564,47.5831,799,118,9.00294,8.47303,...,12.0046,11.2281,11.0349,10.2386,10.9377,10.0712,11.2711,[1][M]-,,2836
361,221.082,221.081,221.084,107.445,101.433,161.726,282,102,10.3084,10.2358,...,16.5211,15.754,13.2588,13.1028,13.3706,13.4195,13.2988,[10][M]-,,2109
1855,359.291,359.289,359.292,273.3,252.744,279.53,718,107,0.0,0.0,...,9.98706,0.0,9.86098,8.26709,0.0,9.10618,10.0829,[100][M]-,,3014
1920,363.245,363.245,363.248,243.563,214.678,267.899,564,130,10.5478,11.0731,...,14.0955,13.9494,10.9763,10.7409,11.8379,11.9421,12.4861,[101][M]-,,696
1928,363.33,363.329,363.332,319.251,314.704,340.707,1006,113,7.79333,8.92558,...,11.5072,10.3302,9.75393,8.94741,8.87136,8.05503,9.09642,[102][M]-,,3841


In [297]:
# DataFrame.join aligns on the row index. meta's rows are the metadata fields
# while labs is indexed by sample name, so nothing lines up and the joined
# 'tissue' column is entirely NaN - joining is not the way to add the labels.
meta.join(labs)['tissue'].isna().all()

True

In [21]:
labs = create_labels_tissue(samples, punct=some_punct)
# DataFrame.append was removed in pandas 2.0; concatenating the series as a
# one-row frame adds the 'tissue' row aligned on the column (sample) labels
meta = pd.concat([meta, labs.to_frame().T])