## Cell Type Integration

In [1]:
import os.path as osp
import pandas as pd
import numpy as np
from tcre import meta
from tcre import lib
from tcre.lib import SPECIES_HUMAN_ID, CELL_TYPES
from tcre.meta import ID_TYP_CT
from tcre.env import *

### Load Cell Ontology

In [2]:
# Source tag written to the `src` column of the Cell Ontology records built below
SRC = 'cl'
# NOTE(review): META_DATA_DIR is not imported by name; presumably it comes from the
# `from tcre.env import *` star import -- confirm
path = osp.join(META_DATA_DIR, 'raw', 'cl.raw.csv')
df = pd.read_csv(path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 8 columns):
depth         700 non-null int64
desc          700 non-null object
id            700 non-null object
label         700 non-null object
root          700 non-null object
syn           700 non-null object
syn_typ       700 non-null object
syn_typ_id    700 non-null int64
dtypes: int64(2), object(6)
memory usage: 43.8+ KB


In [3]:
# Preview the raw table: each row is one synonym (`syn`) of a term identified by `id`/`label`
df.head()

Unnamed: 0,depth,desc,id,label,root,syn,syn_typ,syn_typ_id
0,4,A type II NK T cell that has been recently act...,CL:0000933,type II NK T cell secreting interleukin-4,CL:0000084,type II NKT cell secreting interleukin-4,exact,4
1,4,A type II NK T cell that has been recently act...,CL:0000932,type II NK T cell secreting interferon-gamma,CL:0000084,type II NKT cell secreting interferon-gamma,exact,4
2,4,An alpha-beta T cell expressing NK call marker...,CL:0000922,type II NK T cell,CL:0000084,type II NKT cell,exact,4
3,4,A type II NK T cell that has been recently act...,CL:0000933,type II NK T cell secreting interleukin-4,CL:0000084,type II NK T-lymphocyte secreting interleukin-4,exact,4
4,4,A type II NK T cell that has been recently act...,CL:0000932,type II NK T cell secreting interferon-gamma,CL:0000084,type II NK T-lymphocyte secreting interferon-g...,exact,4


In [4]:
# Make sure that each group of synonyms has exactly one record with symbol equal to label
n_label_syns = df.groupby('label').apply(lambda g: len(g[g['syn'] == g['label']]))
# Report the offending labels so a failure can be traced back to the raw file
assert (n_label_syns == 1).all(), \
    'Labels without exactly one synonym equal to the label: {}'.format(
        n_label_syns[n_label_syns != 1].index.tolist())

In [5]:
# Keep an untouched copy of the raw ontology table: `df` is reassigned in the next cells,
# while the `search` call in the manual-entries section still reads the raw columns
df_cl_raw = df.copy()

In [6]:
# Keep only the columns needed downstream and rename them to the extid / lbl / sym naming
# that the manual entries also use, so both sources can be concatenated later
df = df[['id', 'label', 'syn', 'root', 'depth']].rename(columns={'id': 'extid', 'syn': 'sym', 'label': 'lbl'}).copy()
# Tag rows with source and species; `parent` is left empty for ontology records
df = df.assign(src=SRC, spid=SPECIES_HUMAN_ID, parent=None)
df.head()

Unnamed: 0,extid,lbl,sym,root,depth,src,spid,parent
0,CL:0000933,type II NK T cell secreting interleukin-4,type II NKT cell secreting interleukin-4,CL:0000084,4,cl,1,
1,CL:0000932,type II NK T cell secreting interferon-gamma,type II NKT cell secreting interferon-gamma,CL:0000084,4,cl,1,
2,CL:0000922,type II NK T cell,type II NKT cell,CL:0000084,4,cl,1,
3,CL:0000933,type II NK T cell secreting interleukin-4,type II NK T-lymphocyte secreting interleukin-4,CL:0000084,4,cl,1,
4,CL:0000932,type II NK T cell secreting interferon-gamma,type II NK T-lymphocyte secreting interferon-g...,CL:0000084,4,cl,1,


In [7]:
def get_substitutions(df):
    """Build additional symbol rows with generic cell-type words removed.

    Each pattern is applied independently: rows of ``df`` whose ``sym`` matches
    the pattern are copied and every match of the pattern is deleted from
    ``sym``. A row matching several patterns therefore yields several variants.
    Text around the removed match (including whitespace) is left as-is.

    Args:
        df: Frame with a string ``sym`` column.

    Returns:
        Concatenation of the modified copies, in pattern order; original index
        labels are preserved (and may repeat).
    """
    patterns = ['T.cell[s]?', '.cell[s]?', 'T.lymphocyte[s]?', '.lymphocyte[s]?']
    return pd.concat([
        # regex=True is explicit because the default of Series.str.replace is
        # version-dependent; without it the patterns may be treated as literals
        df[df['sym'].str.match('.*' + ptn + '.*')]
        .assign(sym=lambda matched: matched['sym'].str.replace(ptn, '', regex=True))
        for ptn in patterns
    ])
# Stack the original rows (sub=False) on the generated variants (sub=True); the flag only
# serves the inspection cells below and is dropped again before ids are assigned
df = pd.concat([df.assign(sub=False), get_substitutions(df).assign(sub=True)])
len(df)

1729

In [8]:
# How many rows are original symbols (False) vs. generated substitutions (True)
df['sub'].value_counts()

True     1029
False     700
Name: sub, dtype: int64

In [9]:
# Check substitutions for a single case (Th17): rows with sub=True are the generated variants
df[df['sym'].str.contains('Th17')]

Unnamed: 0,extid,lbl,sym,root,depth,src,spid,parent,sub
42,CL:0001051,"CD4-positive, CXCR3-negative, CCR6-negative, a...",non-Th1/Th17 CD4+ T cell,CL:0000084,4,cl,1,,False
365,CL:0000899,T-helper 17 cell,Th17 cell,CL:0000084,5,cl,1,,False
366,CL:0000899,T-helper 17 cell,Th17 T-lymphocyte,CL:0000084,5,cl,1,,False
367,CL:0000899,T-helper 17 cell,Th17 T-cell,CL:0000084,5,cl,1,,False
368,CL:0000899,T-helper 17 cell,Th17 T lymphocyte,CL:0000084,5,cl,1,,False
369,CL:0000899,T-helper 17 cell,Th17 T cell,CL:0000084,5,cl,1,,False
370,CL:0000899,T-helper 17 cell,Th17 CD4+ T cell,CL:0000084,5,cl,1,,False
42,CL:0001051,"CD4-positive, CXCR3-negative, CCR6-negative, a...",non-Th1/Th17 CD4+,CL:0000084,4,cl,1,,True
367,CL:0000899,T-helper 17 cell,Th17,CL:0000084,5,cl,1,,True
369,CL:0000899,T-helper 17 cell,Th17,CL:0000084,5,cl,1,,True


In [10]:
# Drop the helper flag first so that an original row and an identical generated row
# collapse into one record, then assign ids to the de-duplicated records
df_cl = df.drop('sub', axis=1).copy().drop_duplicates()
df_cl['id'] = meta.get_ids(df_cl, ID_TYP_CT)
df_cl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1163 entries, 0 to 483
Data columns (total 9 columns):
extid     1163 non-null object
lbl       1163 non-null object
sym       1163 non-null object
root      1163 non-null object
depth     1163 non-null int64
src       1163 non-null object
spid      1163 non-null int64
parent    0 non-null object
id        1163 non-null object
dtypes: int64(2), object(7)
memory usage: 90.9+ KB


### Manual Entries

In [11]:
# Source tag for hand-curated records; read as a global by `enrich` below, so this
# reassignment must run before `enrich` is called
SRC = 'manual'

# Not found in CL:
# - Tissue resident memory
# - Stem memory
# - Treg17
# - Follicular regulatory
# - Peripheral Treg
# - iNKT17 (iNKT1, iNKT2 are there)
# - Tc0, Tc3, Tc9, Tc22 (Tc1, Tc2, and Tc17 are there)
# - Th0, Th3 (Th1, Th2, Th9, Th17, and Th22 are there)
# - Tfh* (Tfh0, Tfh1, Tfh2, Tfh17 are all absent)

def search(term, df):
    """Find ontology terms having any synonym that contains ``term``.

    Args:
        term: Substring to look for; compared case-insensitively against ``syn``.
        df: Raw ontology frame with ``id``, ``label``, ``syn`` and ``syn_typ``
            columns (one row per synonym).

    Returns:
        The ``syn_typ == 'label'`` rows of every label group in which at least
        one row carries the id of a matching synonym. Rows keep their original
        order, index and dtypes; the result is empty when nothing matches.
    """
    term = term.lower()
    # Null synonyms never match (na=False); regex=False keeps this a plain substring test
    hits = df['syn'].str.lower().str.contains(term, regex=False, na=False)
    ids = df.loc[hits, 'id'].unique()
    # Select whole label groups: a label qualifies if any of its rows has a matching id
    labels = df.loc[df['id'].isin(ids), 'label'].dropna().unique()
    selected = df[df['label'].isin(labels)]
    return selected[selected['syn_typ'] == 'label']
        
# Widen column display so the long `desc`/`label` texts are not truncated
# (this is a global pandas option and stays in effect for the following cells)
pd.set_option('display.max_colwidth', 200)
# Example lookup against the untouched raw ontology table
search('th17', df_cl_raw)

Unnamed: 0,depth,desc,id,label,root,syn,syn_typ,syn_typ_id
424,5.0,"CD4-positive, alpha-beta T cell with the phenotype RORgamma-t-positive, CXCR3-negative, CCR6-positive, and capable of producing IL-17.",CL:0000899,T-helper 17 cell,CL:0000084,T-helper 17 cell,label,5.0
619,4.0,"A CD4-positive, alpha-beta T cell that has the phenotype CXCR3-negative, CCR6-negative.",CL:0001051,"CD4-positive, CXCR3-negative, CCR6-negative, alpha-beta T cell",CL:0000084,"CD4-positive, CXCR3-negative, CCR6-negative, alpha-beta T cell",label,5.0


In [12]:
# If this gives an error, it is likely that the delimiter was typed incorrectly in one of the sym strings in the raw file
# (malformed lines raise by default; the former error_bad_lines/warn_bad_lines flags only
# restated that default and are no longer accepted by newer pandas releases)
dfm = pd.read_csv(osp.join(META_DATA_DIR, 'raw', CELL_TYPES + '.manual.csv'), sep=',', comment='#')
# Ignore hand-engineered depth feature in favor of distance within ontology
dfm = dfm.drop('lvl', axis=1)
# Normalize text columns: trim whitespace and turn blank/missing values into None
for c in dfm.select_dtypes(include=[object]):
    dfm[c] = dfm[c].apply(lambda v: None if pd.isnull(v) or not v.strip() else v.strip())
dfm.head()

Unnamed: 0,lbl,cl0,cl1,cl2,syms,lkp
0,IEL,CL:0002496,,CL:0000084,IELs|Intraepithelial-lymphocyte|Intraepithelial-lymphocytes,
1,TN,CL:0000898,,CL:0000789,T-naïve|naïve-T,
2,TMEM,CL:0000813,,CL:0000789,T-mem|memory-T|mem-T,
3,TEMRA,CL:0001062,,CL:0000789,T-emra|emra-T,
4,TCM,CL:0000904,,CL:0000789,T-cm|cm-T|central-memory-T,


In [13]:
import unidecode

def parse(r):
    """Expand one manual cell type row into a frame of symbol records.

    Args:
        r: Row of the manual cell type table with fields ``lbl``, ``syms``
            (``|``-delimited aliases, may be null), ``cl0`` (external id),
            ``cl1`` (parent id), ``cl2`` (root id) and ``lkp`` (optional
            ``|``-delimited ``key=value`` pairs; each key is substituted into
            ``lbl`` and the aliases via ``str.format`` and a non-blank value
            overrides ``cl0`` for that variant).

    Returns:
        De-duplicated DataFrame with columns ``sym``, ``lbl``, ``extid``,
        ``parent`` and ``root``. For every variant it holds the label itself
        plus, for each alias and its ``unidecode`` transliteration, the alias
        as written, with ``-`` replaced by a space, and with ``-`` removed.

    Raises:
        AssertionError: If both ``cl0`` and ``cl1`` are null, or ``cl2`` is null.
    """
    lbl = r['lbl'].strip()
    # Blank alias cells arrive as None/NaN after loading; such a row still
    # yields its label record instead of failing on `.split`
    syms = [] if pd.isnull(r['syms']) else r['syms'].split('|')
    extid = r['cl0']
    parid = r['cl1']
    rootid = r['cl2']
    assert not (pd.isnull(extid) and pd.isnull(parid)), \
        'Either cl0 or cl1 must be set for "{}"'.format(lbl)
    assert not pd.isnull(rootid), 'cl2 (root) must be set for "{}"'.format(lbl)
    # Without a lookup there is exactly one variant that needs no formatting
    lkp = {None: None} if pd.isnull(r['lkp']) else dict([v.split('=') for v in r['lkp'].split('|')])
    assert len(lkp) > 0

    records = []

    def append(sym, lbl, extid):
        records.append((sym, lbl, extid, parid, rootid))

    for lk, lv in lkp.items():
        lv = None if lv is None or not lv.strip() else lv.strip()
        lblf = lbl.format(lk) if lk else lbl
        # A variant-specific id from the lookup takes precedence over the row-level id
        extidf = lv or extid
        append(lblf, lblf, extidf)
        for alias in syms:
            for sym in [alias, unidecode.unidecode(alias)]:
                symf = sym.format(lk) if lk else sym
                append(symf, lblf, extidf)
                append(symf.replace('-', ' '), lblf, extidf)
                append(symf.replace('-', ''), lblf, extidf)
    return pd.DataFrame(records, columns=['sym', 'lbl', 'extid', 'parent', 'root']).drop_duplicates()

def flatten(df):
    """Parse every row of ``df`` with ``parse`` and stack the resulting frames."""
    frames = []
    for _, row in df.iterrows():
        frames.append(parse(row))
    return pd.concat(frames)
    
def enrich(df):
    """Add suffixed symbol variants, constant columns and ids to the manual records.

    Args:
        df: Frame of manual records with at least a string ``sym`` column.

    Returns:
        ``df`` plus one copy of every row whose ``sym`` ends with ``T`` per
        suffix (``-cell``, ``-cells``, ``-lymphocyte``, ``-lymphocytes``), with
        ``spid``, ``src`` (from the module-level ``SRC``) and ``id`` columns
        added and exact duplicate rows removed.
    """
    # Add -cell[s] -lymphocyte[s] to the end of each symbol ending in "T" to cover cases where
    # the spans are not broken into separate tokens (e.g. "follicular regulatory T-cells")
    ends_with_t = df[df['sym'].str.endswith('T')]
    df = pd.concat([df] + [
        ends_with_t.assign(sym=ends_with_t['sym'] + suffix)
        # '-lymphocyte' was previously misspelled, so that variant was never produced
        for suffix in ['-cell', '-cells', '-lymphocyte', '-lymphocytes']
    ])
    df = df.assign(spid=SPECIES_HUMAN_ID, src=SRC)
    df['id'] = meta.get_ids(df, ID_TYP_CT)
    df = df.drop_duplicates()
    return df

# Expand the manual table into one record per symbol variant; the index is not reset
# along the way, so index labels repeat across records
dfc = enrich(flatten(dfm))
dfc.head()

Unnamed: 0,sym,lbl,extid,parent,root,spid,src,id
0,IEL,IEL,CL:0002496,,CL:0000084,1,manual,CT22416FE03B9D6C99
1,IELs,IEL,CL:0002496,,CL:0000084,1,manual,CTB660BADB779FAA4E
7,Intraepithelial-lymphocyte,IEL,CL:0002496,,CL:0000084,1,manual,CT54F82A76F87DFC87
8,Intraepithelial lymphocyte,IEL,CL:0002496,,CL:0000084,1,manual,CT77F7FFA741C8DF8A
9,Intraepitheliallymphocyte,IEL,CL:0002496,,CL:0000084,1,manual,CT2F2F6025243D7121


In [14]:
# Spot-check a few symbols, including formatted variants (e.g. Th0/Th1) and aliases (Tfh1like)
dfc[dfc['sym'].isin(['IEL', 'Th1', 'Th0', 'Th', 'MAIT', 'Tfh1like'])]

Unnamed: 0,sym,lbl,extid,parent,root,spid,src,id
0,IEL,IEL,CL:0002496,,CL:0000084,1,manual,CT22416FE03B9D6C99
0,MAIT,MAIT,CL:0000940,,CL:0000789,1,manual,CT2AA8AAF80CE1BE08
0,Th,Th,CL:0000912,,CL:0000789,1,manual,CTB7DFCED683E6F0A9
0,Th0,Th0,,CL:0000912,CL:0000789,1,manual,CTBDDE0043F47EDCC1
43,Th1,Th1,CL:0000545,CL:0000912,CL:0000789,1,manual,CTC3A8C3CBC245616A
32,Tfh1like,Tfh1,,CL:0002038,CL:0000789,1,manual,CT1F84133DEE0D3909


### Add Depth

In [15]:
# Create mapping of CL ids to depth
m_depth = df_cl.groupby('extid')['depth'].unique()
# Every CL id must map to a single depth, otherwise the lookup below would be ambiguous
assert (m_depth.apply(len) == 1).all()
# Unwrap the one-element arrays into scalar depths
m_depth = m_depth.apply(lambda v: v[0])
m_depth.head()

extid
CL:0000084    0
CL:0000492    4
CL:0000545    5
CL:0000546    5
CL:0000623    0
Name: depth, dtype: int64

In [16]:
# Assign depth based on exact CL ID but if a cell type does not exist in CL, assign
# depth as the depth of the closest parent + 2 (i.e. assume it is a somewhat immediate child
# -- +2 was chosen because this is the distance between T helper and ThN types)
# capped at max depth + 1 of existing terms
depth_exact = dfc['extid'].map(m_depth)
depth_parent = (dfc['parent'].map(m_depth) + 2).clip(0, m_depth.max() + 1)
# Combine row by row (positionally) instead of aligning on index labels: dfc's index
# contains repeated labels, which label-based alignment cannot disambiguate in general
dfc['depth'] = np.where(depth_exact.notnull(), depth_exact, depth_parent)
assert dfc['depth'].notnull().all(), \
    'No depth could be resolved for labels: {}'.format(
        sorted(dfc.loc[dfc['depth'].isnull(), 'lbl'].unique()))
dfc['depth'] = dfc['depth'].astype(int)
dfc['depth'].value_counts()

3    859
4    361
5    341
1     95
2     89
6     63
Name: depth, dtype: int64

In [17]:
# Spot-check assigned depths for terms with an exact CL id and for terms that only have a parent
dfc[dfc['sym'].isin(['IEL', 'Th1', 'Th0', 'Th', 'MAIT', 'Tfh', 'Tfh1like', 'Tc', 'Tc0', 'Treg', 'nTreg', 'Treg1', 'NKT', 'iNKT1', 'iNKT17'])]

Unnamed: 0,sym,lbl,extid,parent,root,spid,src,id,depth
0,IEL,IEL,CL:0002496,,CL:0000084,1,manual,CT22416FE03B9D6C99,2
0,NKT,NKT,CL:0000814,,CL:0000789,1,manual,CT459998F048BE65F2,3
0,iNKT1,iNKT1,CL:0000929,CL:0000814,CL:0000789,1,manual,CTA5FE71C9EE6E21F8,4
0,iNKT17,iNKT17,,CL:0000814,CL:0000789,1,manual,CT52C8DAEA84ED4DBB,5
0,MAIT,MAIT,CL:0000940,,CL:0000789,1,manual,CT2AA8AAF80CE1BE08,3
0,Treg,Treg,CL:0000792,,CL:0000789,1,manual,CTB574584AD019ABB8,3
0,nTreg,nTreg,CL:0000903,CL:0000792,CL:0000789,1,manual,CT5D4E277D12A1C208,4
0,Treg1,Treg1,CL:0000901,CL:0000792,CL:0000789,1,manual,CT301EB268E263D718,3
0,Tc,Tc,CL:0000910,,CL:0000789,1,manual,CT231C4EC31B01615E,3
0,Tc0,Tc0,,CL:0000910,CL:0000789,1,manual,CT65B6393015C16630,5


In [18]:
# Which labels ended up at depth 6 (the deepest level in the counts above)
dfc[dfc['depth'] == 6]['lbl'].unique()

array(['Tfh0', 'Tfh1', 'Tfh2', 'Tfh3', 'Tfh9', 'Tfh17', 'Tfh22'],
      dtype=object)

In [19]:
# Confirm that depth is populated for every record even where extid or parent is missing
dfc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1808 entries, 0 to 142
Data columns (total 9 columns):
sym       1808 non-null object
lbl       1808 non-null object
extid     813 non-null object
parent    1348 non-null object
root      1808 non-null object
spid      1808 non-null int64
src       1808 non-null object
id        1808 non-null object
depth     1808 non-null int64
dtypes: int64(2), object(7)
memory usage: 141.2+ KB


In [20]:
# Snapshot the finished manual records under a stable name for the merge step
df_manual = dfc.copy()

### Merge

In [21]:
# Per-source priority passed to meta.add_source_priority
# NOTE(review): how the values are ranked is defined in tcre.meta (not visible here) -- confirm
SRC_PRIORITY = {
    'cl': 50,
    'manual': 20
}
# sort=True orders the union of the two column sets alphabetically
df = pd.concat([df_cl, df_manual], sort=True)
df = meta.add_source_priority(df, SRC_PRIORITY)
df = meta.add_preferred_ids(df)
assert df['depth'].notnull().all()
# DataFrame.info() prints its report itself and returns None, so it must not be wrapped in print()
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2971 entries, 428 to 0
Data columns (total 12 columns):
depth       2971 non-null int64
extid       1976 non-null object
id          2971 non-null object
lbl         2971 non-null object
parent      1348 non-null object
root        2971 non-null object
spid        2971 non-null int64
src         2971 non-null object
sym         2971 non-null object
priority    2971 non-null int64
prefid      2971 non-null object
enabled     2971 non-null bool
dtypes: bool(1), int64(3), object(8)
memory usage: 281.4+ KB
None


Unnamed: 0,depth,extid,id,lbl,parent,root,spid,src,sym,priority,prefid,enabled
428,0,CL:0000084,CT8850E0B277103D29,T cell,,CL:0000084,1,cl,,50,CT32F14A7EC04D490E,True
699,4,CL:0001047,CT15C28CADCF080753,"CD4-positive, CD25-positive, CCR4-positive, alpha-beta regulatory T cell",,CL:0000084,1,cl,CCR+ Treg,50,CT5229C817BCB2D95C,True
698,3,CL:0002426,CT8064FA1B2BE25960,"CD11b-positive, CD27-positive natural killer cell",,CL:0000623,1,cl,"CD11b-positive, CD27-positive natural killer",50,CTF5A496AACAB527FE,True
698,3,CL:0002426,CTF5A496AACAB527FE,"CD11b-positive, CD27-positive natural killer cell",,CL:0000623,1,cl,"CD11b-positive, CD27-positive natural killer cell",50,CTF5A496AACAB527FE,True
697,2,CL:0000939,CT6B9DB05856A13916,"CD16-positive, CD56-dim natural killer cell",,CL:0000623,1,cl,CD16+CD56+ NK,50,CT7346EBF998A936BB,True


### Filter

In [22]:
# Inspect the depth-0 records before they are removed in the next cell
df[df['depth'] == 0]

Unnamed: 0,depth,extid,id,lbl,parent,root,spid,src,sym,priority,prefid,enabled
428,0,CL:0000084,CT8850E0B277103D29,T cell,,CL:0000084,1,cl,,50,CT32F14A7EC04D490E,True
461,0,CL:0000623,CT0A9BD2B2809C8A85,natural killer cell,,CL:0000623,1,cl,NK,50,CT6FA6BB70C26F1785,True
461,0,CL:0000623,CTD8ADC8900DC4F03A,natural killer cell,,CL:0000623,1,cl,NK cell,50,CT6FA6BB70C26F1785,True
428,0,CL:0000084,CTB29B51C5F8091493,T cell,,CL:0000084,1,cl,T,50,CT32F14A7EC04D490E,True
441,0,CL:0000084,CT32F14A7EC04D490E,T cell,,CL:0000084,1,cl,T cell,50,CT32F14A7EC04D490E,True
435,0,CL:0000084,CTBE11EE67EE985EA9,T cell,,CL:0000084,1,cl,T lymphocyte,50,CT32F14A7EC04D490E,True
428,0,CL:0000084,CTA3F4025CC0433FA5,T cell,,CL:0000084,1,cl,T-cell,50,CT32F14A7EC04D490E,True
419,0,CL:0000084,CTB4B2FBD42F25E349,T cell,,CL:0000084,1,cl,T-lymphocyte,50,CT32F14A7EC04D490E,True
125,0,CL:0000623,CTB14FD070F22F2138,natural killer cell,,CL:0000623,1,cl,large granular,50,CT6FA6BB70C26F1785,True
125,0,CL:0000623,CTE7A3644DA5232BE5,natural killer cell,,CL:0000623,1,cl,large granular lymphocyte,50,CT6FA6BB70C26F1785,True


In [23]:
# Remove top level records for root cell types (i.e. where the depth is 0) since these will match to too
# many sentences in non-informative contexts
mask = df['depth'] == 0
print('Removing {} records for root records'.format(mask.sum()))
# Note: `df` is reassigned here, so re-running this cell alone reports 0 removals
df = df[~mask]
df.info()

Removing 14 records for root records
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2957 entries, 699 to 0
Data columns (total 12 columns):
depth       2957 non-null int64
extid       1962 non-null object
id          2957 non-null object
lbl         2957 non-null object
parent      1348 non-null object
root        2957 non-null object
spid        2957 non-null int64
src         2957 non-null object
sym         2957 non-null object
priority    2957 non-null int64
prefid      2957 non-null object
enabled     2957 non-null bool
dtypes: bool(1), int64(3), object(8)
memory usage: 280.1+ KB


In [24]:
# Final integrity checks on the merged table before summarizing and exporting
# Ensure that no labels/symbols are empty or null
for c in ['lbl', 'sym']:
    assert len(df[df[c].str.strip().str.len() == 0]) == 0
    assert df[c].notnull().all()
# Ensure that all ids are either null or non-empty
for c in ['extid', 'parent', 'root']:
    assert df[c].apply(lambda v: pd.isnull(v) or (isinstance(v, str) and len(v) > 0)).all()

### Summarize

In [25]:
# Number of exported records contributed by each source
df['src'].value_counts()

manual    1808
cl        1149
Name: src, dtype: int64

In [26]:
# Cross-tabulate source against the `enabled` flag that was added during the merge step;
# fillna(0) covers source/flag combinations that do not occur
df.groupby(['src', 'enabled']).size().unstack().fillna(0)

enabled,False,True
src,Unnamed: 1_level_1,Unnamed: 2_level_1
cl,4,1145
manual,46,1762


### Export

In [None]:
# Every exported record must have its own id; list the duplicated ids if that is violated
id_counts = df['id'].value_counts()
assert id_counts.max() == 1, \
    'Duplicate ids found: {}'.format(id_counts[id_counts > 1].index.tolist())

In [28]:
# Write the integrated cell type table (without the non-unique index) and echo the target path
path = osp.join(META_DATA_DIR, CELL_TYPES + '.csv')
df.to_csv(path, index=False)
path

'/lab/repos/t-cell-relation-extraction/data/meta/cell_types.csv'