## Cell Type Integration

In [1]:
import os.path as osp
import pandas as pd
import numpy as np
from tcre import meta
from tcre import lib
from tcre.lib import SPECIES_HUMAN_ID, CELL_TYPES
from tcre.meta import ID_TYP_CT
from tcre.env import *

### Load Cell Ontology

In [2]:
SRC = 'cl'
path = osp.join(META_DATA_DIR, 'raw', 'cl.raw.csv')
df = pd.read_csv(path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 7 columns):
desc          700 non-null object
id            700 non-null object
label         700 non-null object
root          700 non-null object
syn           700 non-null object
syn_typ       700 non-null object
syn_typ_id    700 non-null int64
dtypes: int64(1), object(6)
memory usage: 38.4+ KB


In [3]:
# Make sure that each group of synonyms has a record with symbol equal to label
assert (df.groupby('label').apply(lambda g: len(g[g['syn'] == g['label']])) == 1).all()

In [4]:
df_cl_raw = df.copy()

In [5]:
df = df[['id', 'label', 'syn', 'root']].rename(columns={'id': 'extid', 'syn': 'sym', 'label': 'lbl'}).copy()
df = df.assign(src=SRC, spid=SPECIES_HUMAN_ID, parent=None, lvl=None)
df['id'] = meta.get_ids(df, ID_TYP_CT)
df.head()

Unnamed: 0,extid,lbl,sym,root,src,spid,parent,lvl,id
0,CL:0000933,type II NK T cell secreting interleukin-4,type II NKT cell secreting interleukin-4,CL:0000084,cl,1,,,CT2A162A51CA50D3E2
1,CL:0000932,type II NK T cell secreting interferon-gamma,type II NKT cell secreting interferon-gamma,CL:0000084,cl,1,,,CT9D60C0282C217F30
2,CL:0000922,type II NK T cell,type II NKT cell,CL:0000084,cl,1,,,CT1033047CAD81DC57
3,CL:0000933,type II NK T cell secreting interleukin-4,type II NK T-lymphocyte secreting interleukin-4,CL:0000084,cl,1,,,CT1AECB450297C08F0
4,CL:0000932,type II NK T cell secreting interferon-gamma,type II NK T-lymphocyte secreting interferon-g...,CL:0000084,cl,1,,,CT452EC16F00882078


In [6]:
def get_substitutions(df):
    return pd.concat([
        df[df['sym'].str.match('.*' + ptn + '.*')].assign(sym=lambda df: df['sym'].str.replace(ptn, ''))
        for ptn in ['T.cell[s]?', '.cell[s]?', 'T.lymphocyte[s]?', '.lymphocyte[s]?']
    ])
df = pd.concat([df.assign(sub=False), get_substitutions(df).assign(sub=True)])
len(df)

1729

In [7]:
df['sub'].value_counts()

True     1029
False     700
Name: sub, dtype: int64

In [8]:
# Check substitutions for a single case
df[df['sym'].str.contains('Th17')]

Unnamed: 0,extid,lbl,sym,root,src,spid,parent,lvl,id,sub
42,CL:0001051,"CD4-positive, CXCR3-negative, CCR6-negative, a...",non-Th1/Th17 CD4+ T cell,CL:0000084,cl,1,,,CTFA121F1DA4774659,False
365,CL:0000899,T-helper 17 cell,Th17 cell,CL:0000084,cl,1,,,CTEB678404BA18C220,False
366,CL:0000899,T-helper 17 cell,Th17 T-lymphocyte,CL:0000084,cl,1,,,CTE54915A0B3E4C31A,False
367,CL:0000899,T-helper 17 cell,Th17 T-cell,CL:0000084,cl,1,,,CT0C1ECB7486EF204C,False
368,CL:0000899,T-helper 17 cell,Th17 T lymphocyte,CL:0000084,cl,1,,,CT18731E81B15685DF,False
369,CL:0000899,T-helper 17 cell,Th17 T cell,CL:0000084,cl,1,,,CT7FF0459CEF1226F1,False
370,CL:0000899,T-helper 17 cell,Th17 CD4+ T cell,CL:0000084,cl,1,,,CT3DCDDD9C2EDA45AD,False
42,CL:0001051,"CD4-positive, CXCR3-negative, CCR6-negative, a...",non-Th1/Th17 CD4+,CL:0000084,cl,1,,,CTFA121F1DA4774659,True
367,CL:0000899,T-helper 17 cell,Th17,CL:0000084,cl,1,,,CT0C1ECB7486EF204C,True
369,CL:0000899,T-helper 17 cell,Th17,CL:0000084,cl,1,,,CT7FF0459CEF1226F1,True


In [9]:
df_cl = df.drop('sub', axis=1).copy()
df_cl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1729 entries, 0 to 666
Data columns (total 9 columns):
extid     1729 non-null object
lbl       1729 non-null object
sym       1729 non-null object
root      1729 non-null object
src       1729 non-null object
spid      1729 non-null int64
parent    0 non-null object
lvl       0 non-null object
id        1729 non-null object
dtypes: int64(1), object(8)
memory usage: 135.1+ KB


### Manual Entries

In [10]:
SRC = 'manual'

# Not found in CL:
# - Tissue resident memory
# - Stem memory
# - Treg17
# - Follicular regulatory
# - Peripheral Treg
# - iNKT17 (iNKT1, iNKT2 are there)
# - Tc0, Tc3, Tc9, Tc22 (Tc1, Tc2, and Tc17 are there)
# - Th0, Th3 (Th1, Th2, Th9, Th17, and Th22 are there)
# - Tfh* (Tfh0, Tfh1, Tfh2, Tfh17 are all absent)

def search(term, df):
    term = term.lower()
    ids = df[df['syn'].str.lower().apply(lambda v: term in v)]['id'].unique()
    df = df.groupby('label').apply(lambda g: g if g['id'].isin(ids).any() else None)
    return df[df['syn_typ'] == 'label']
        
pd.set_option('display.max_colwidth', 200)
search('th17', df_cl_raw)

Unnamed: 0,desc,id,label,root,syn,syn_typ,syn_typ_id
424,"CD4-positive, alpha-beta T cell with the phenotype RORgamma-t-positive, CXCR3-negative, CCR6-positive, and capable of producing IL-17.",CL:0000899,T-helper 17 cell,CL:0000084,T-helper 17 cell,label,5.0
619,"A CD4-positive, alpha-beta T cell that has the phenotype CXCR3-negative, CCR6-negative.",CL:0001051,"CD4-positive, CXCR3-negative, CCR6-negative, alpha-beta T cell",CL:0000084,"CD4-positive, CXCR3-negative, CCR6-negative, alpha-beta T cell",label,5.0


In [11]:
dfm = pd.read_csv(osp.join(META_DATA_DIR, 'raw', lib.CELL_TYPES + '.manual.csv'), sep=',', comment='#', error_bad_lines=False, warn_bad_lines=False)
for c in dfm.select_dtypes(include=[object]):
    dfm[c] = dfm[c].apply(lambda v: None if pd.isnull(v) or not v.strip() else v.strip())
dfm.head()

Unnamed: 0,lbl,lvl,cl0,cl1,cl2,syms,lkp
0,IEL,1,CL:0002496,,CL:0000084,IELs|Intraepithelial-lymphocyte|Intraepithelial-lymphocytes,
1,TN,2,CL:0000898,,CL:0000789,T-naïve|naïve-T,
2,TMEM,2,CL:0000813,,CL:0000789,T-mem|memory-T|mem-T,
3,TEMRA,2,CL:0001062,,CL:0000789,T-emra|emra-T,
4,TCM,2,CL:0000904,,CL:0000789,T-cm|cm-T|central-memory-T,


In [12]:
import unidecode

def parse(r):
    df = []
    lbl, lvl = r['lbl'].strip(), int(r['lvl'])
    syms = r['syms'].split('|')
    extid = r['cl0']
    parid = r['cl1']
    rootid = r['cl2']
    assert not (pd.isnull(extid) and pd.isnull(parid))
    assert not pd.isnull(rootid)
    lkp = {None:None} if pd.isnull(r['lkp']) else dict([v.split('=') for v in r['lkp'].split('|')])
    assert len(lkp) > 0

    def append(sym, lbl, extid):
        df.append((sym, lbl, lvl, extid, parid, rootid))
        
    for lk, lv in lkp.items():
        lv = None if lv is None or not lv.strip() else lv.strip()
        lblf = lbl.format(lk) if lk else lbl
        extidf = lv or extid
        append(lblf, lblf, extidf)
        for alias in syms:
            for sym in [alias, unidecode.unidecode(alias)]: 
                symf = sym.format(lk) if lk else sym
                append(symf, lblf, extidf)
                append(symf.replace('-', ' '), lblf, extidf)
                append(symf.replace('-', ''), lblf, extidf)
    return pd.DataFrame(df, columns=['sym', 'lbl', 'lvl', 'extid', 'parent', 'root']).drop_duplicates()

def flatten(df):
    return pd.concat([parse(r) for _, r in df.iterrows()])
    
def enrich(df):
    # Add -cell[s] -lymphocyte[s] to the end of each symbol to cover cases where 
    # the spans are not broken into separate tokens (e.g. "follicular regulatory T-cells")
    df = pd.concat([df] + [
        df[df['sym'].str.endswith('T')].assign(sym=lambda df: df['sym'] + suffix)
        for suffix in ['-cell', '-cells', '-lymphoctye', '-lymphocytes']
    ])
    df = df.assign(spid=SPECIES_HUMAN_ID, src=SRC)
    df['id'] = meta.get_ids(df, ID_TYP_CT)
    df = df.drop_duplicates()
    return df

dfc = enrich(flatten(dfm))
dfc.head()

Unnamed: 0,sym,lbl,lvl,extid,parent,root,spid,src,id
0,IEL,IEL,1,CL:0002496,,CL:0000084,1,manual,CT22416FE03B9D6C99
1,IELs,IEL,1,CL:0002496,,CL:0000084,1,manual,CTB660BADB779FAA4E
7,Intraepithelial-lymphocyte,IEL,1,CL:0002496,,CL:0000084,1,manual,CT54F82A76F87DFC87
8,Intraepithelial lymphocyte,IEL,1,CL:0002496,,CL:0000084,1,manual,CT77F7FFA741C8DF8A
9,Intraepitheliallymphocyte,IEL,1,CL:0002496,,CL:0000084,1,manual,CT2F2F6025243D7121


In [13]:
dfc[dfc['sym'].isin(['IEL', 'Th1', 'Th0', 'Th', 'MAIT', 'Tfh1like'])]

Unnamed: 0,sym,lbl,lvl,extid,parent,root,spid,src,id
0,IEL,IEL,1,CL:0002496,,CL:0000084,1,manual,CT22416FE03B9D6C99
0,MAIT,MAIT,2,CL:0000940,,CL:0000789,1,manual,CT2AA8AAF80CE1BE08
0,Th,Th,2,CL:0000912,,CL:0000789,1,manual,CTB7DFCED683E6F0A9
0,Th0,Th0,3,,CL:0000912,CL:0000789,1,manual,CTBDDE0043F47EDCC1
43,Th1,Th1,3,CL:0000545,CL:0000912,CL:0000789,1,manual,CTC3A8C3CBC245616A
32,Tfh1like,Tfh1,3,,CL:0002038,CL:0000789,1,manual,CT1F84133DEE0D3909


In [14]:
df_manual = dfc.copy()

### Merge

In [15]:
SRC_PRIORITY = {
    'cl': 50, 
    'manual': 20
}
df = pd.concat([df_cl, df_manual], sort=True)
df = meta.add_source_priority(df, SRC_PRIORITY)
df = meta.add_preferred_ids(df)
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3463 entries, 428 to 0
Data columns (total 12 columns):
extid       2476 non-null object
id          3463 non-null object
lbl         3463 non-null object
lvl         1734 non-null object
parent      1274 non-null object
root        3463 non-null object
spid        3463 non-null int64
src         3463 non-null object
sym         3463 non-null object
priority    3463 non-null int64
prefid      3463 non-null object
enabled     3463 non-null bool
dtypes: bool(1), int64(2), object(9)
memory usage: 328.0+ KB
None


Unnamed: 0,extid,id,lbl,lvl,parent,root,spid,src,sym,priority,prefid,enabled
428,CL:0000084,CTA3F4025CC0433FA5,T cell,,,CL:0000084,1,cl,,50,CT32F14A7EC04D490E,True
441,CL:0000084,CT32F14A7EC04D490E,T cell,,,CL:0000084,1,cl,,50,CT32F14A7EC04D490E,False
419,CL:0000084,CTB4B2FBD42F25E349,T cell,,,CL:0000084,1,cl,,50,CT32F14A7EC04D490E,False
435,CL:0000084,CTBE11EE67EE985EA9,T cell,,,CL:0000084,1,cl,,50,CT32F14A7EC04D490E,False
699,CL:0001047,CT15C28CADCF080753,"CD4-positive, CD25-positive, CCR4-positive, alpha-beta regulatory T cell",,,CL:0000084,1,cl,CCR+ Treg,50,CT5229C817BCB2D95C,True


### Filter

In [16]:
df[df['extid'] == 'CL:0000084']

Unnamed: 0,extid,id,lbl,lvl,parent,root,spid,src,sym,priority,prefid,enabled
428,CL:0000084,CTA3F4025CC0433FA5,T cell,,,CL:0000084,1,cl,,50,CT32F14A7EC04D490E,True
441,CL:0000084,CT32F14A7EC04D490E,T cell,,,CL:0000084,1,cl,,50,CT32F14A7EC04D490E,False
419,CL:0000084,CTB4B2FBD42F25E349,T cell,,,CL:0000084,1,cl,,50,CT32F14A7EC04D490E,False
435,CL:0000084,CTBE11EE67EE985EA9,T cell,,,CL:0000084,1,cl,,50,CT32F14A7EC04D490E,False
428,CL:0000084,CTA3F4025CC0433FA5,T cell,,,CL:0000084,1,cl,T,50,CT32F14A7EC04D490E,True
441,CL:0000084,CT32F14A7EC04D490E,T cell,,,CL:0000084,1,cl,T,50,CT32F14A7EC04D490E,False
419,CL:0000084,CTB4B2FBD42F25E349,T cell,,,CL:0000084,1,cl,T,50,CT32F14A7EC04D490E,False
435,CL:0000084,CTBE11EE67EE985EA9,T cell,,,CL:0000084,1,cl,T,50,CT32F14A7EC04D490E,False
441,CL:0000084,CT32F14A7EC04D490E,T cell,,,CL:0000084,1,cl,T cell,50,CT32F14A7EC04D490E,True
435,CL:0000084,CTBE11EE67EE985EA9,T cell,,,CL:0000084,1,cl,T lymphocyte,50,CT32F14A7EC04D490E,True


In [17]:
# Remove top level "T cell" records
mask = df['extid'] == 'CL:0000084'
print('Removing {} records for root record "T Cell"'.format(mask.sum()))
df = df[~mask]
df.info()

Removing 12 records for root record "T Cell"
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3451 entries, 699 to 0
Data columns (total 12 columns):
extid       2464 non-null object
id          3451 non-null object
lbl         3451 non-null object
lvl         1734 non-null object
parent      1274 non-null object
root        3451 non-null object
spid        3451 non-null int64
src         3451 non-null object
sym         3451 non-null object
priority    3451 non-null int64
prefid      3451 non-null object
enabled     3451 non-null bool
dtypes: bool(1), int64(2), object(9)
memory usage: 326.9+ KB


In [18]:
# Ensure that no labels/symbols are empty or null
for c in ['lbl', 'sym']:
    assert len(df[df[c].str.strip().str.len() == 0]) == 0
    assert df[c].notnull().all()
# Ensure that all ids are either null or non-empty
for c in ['extid', 'parent', 'root']:
    assert df[c].apply(lambda v: pd.isnull(v) or (isinstance(v, str) and len(v) > 0)).all()

### Summarize

In [19]:
df['src'].value_counts()

manual    1734
cl        1717
Name: src, dtype: int64

In [20]:
df.groupby(['src', 'enabled']).size().unstack().fillna(0)

enabled,False,True
src,Unnamed: 1_level_1,Unnamed: 2_level_1
cl,564,1153
manual,41,1693


In [26]:
df[df['sym'].str.lower().str.contains('nkt')]

Unnamed: 0,extid,id,lbl,lvl,parent,root,spid,src,sym,priority,prefid,enabled
657,CL:0000924,CT948ADF339CF9FFA6,"CD4-negative, CD8-negative type I NK T cell",,,CL:0000084,1,cl,"CD4-negative, CD8-negative type I NKT",50,CT07F1F2C6A0F264DF,True
657,CL:0000924,CT948ADF339CF9FFA6,"CD4-negative, CD8-negative type I NK T cell",,,CL:0000084,1,cl,"CD4-negative, CD8-negative type I NKT cell",50,CT07F1F2C6A0F264DF,True
656,CL:0000929,CT0C96066BBF0089A3,"CD4-negative, CD8-negative type I NK T cell secreting interferon-gamma",,,CL:0000084,1,cl,"CD4-negative, CD8-negative type I NKT cell secreting interferon-gamma",50,CT0A759254BC64362B,True
655,CL:0000930,CTC0D0F70B61D9EF80,"CD4-negative, CD8-negative type I NK T cell secreting interleukin-4",,,CL:0000084,1,cl,"CD4-negative, CD8-negative type I NKT cell secreting interleukin-4",50,CT2C066E491AC47785,True
656,CL:0000929,CT0C96066BBF0089A3,"CD4-negative, CD8-negative type I NK T cell secreting interferon-gamma",,,CL:0000084,1,cl,"CD4-negative, CD8-negative type I NKT secreting interferon-gamma",50,CT0A759254BC64362B,True
655,CL:0000930,CTC0D0F70B61D9EF80,"CD4-negative, CD8-negative type I NK T cell secreting interleukin-4",,,CL:0000084,1,cl,"CD4-negative, CD8-negative type I NKT secreting interleukin-4",50,CT2C066E491AC47785,True
631,CL:0000923,CT563AF057F140E448,CD4-positive type I NK T cell,,,CL:0000084,1,cl,CD4-positive type I NKT,50,CTF06FF20730C1639C,True
631,CL:0000923,CT563AF057F140E448,CD4-positive type I NK T cell,,,CL:0000084,1,cl,CD4-positive type I NKT cell,50,CTF06FF20730C1639C,True
630,CL:0000926,CT9EB26456E45D8922,CD4-positive type I NK T cell secreting interferon-gamma,,,CL:0000084,1,cl,CD4-positive type I NKT cell secreting interferon-gamma,50,CT6571984FFC9FAA3A,True
629,CL:0000927,CT543C63B59817A4AB,CD4-positive type I NK T cell secreting interleukin-4,,,CL:0000084,1,cl,CD4-positive type I NKT cell secreting interleukin-4,50,CT502941FC937D8B88,True


### Export

In [27]:
path = osp.join(META_DATA_DIR, CELL_TYPES + '.csv')
df.to_csv(path, index=False)
path

'/lab/repos/t-cell-relation-extraction/data/meta/cell_types.csv'