In [None]:
# default_exp mgmnt.prep.traceability

# Traceability Preprocessing

> This module comprises all preprocessing techniques for traceability tasks
>
>> Handling traceability links

In [None]:
# export
# Imports
import pandas as pd
import random
import sentencepiece as sp

from fastprogress.fastprogress import master_bar
from functools import partial
from pathlib import Path
from tqdm.notebook import tqdm

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
# export
def get_gt_links(path, language):
    """Collect the ground-truth traceability links stored under `path`.

    Every `*.txt` file directly inside `path` is parsed. The part of the file
    name before the first '.' has its first and last character removed and is
    split on '-'; fields 0, 2 and 4 become the system, the source type and
    the target type (presumably names look like `[sys-x-src-y-trgt].txt` —
    TODO confirm against the testbed). Each line of a file is a
    space-separated list whose first token is the source artifact and whose
    remaining tokens are the targets linked to it.

    Parameters
    ----------
    path : pathlib.Path
        Directory holding the ground-truth `*.txt` files.
    language : str
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Columns `sys`, `src_type`, `trgt_type`, `src_file`, `trgt_file`, one
        row per distinct link. The index restarts at 0 for every parsed file
        (it is not reset), exactly as before.
    """
    columns = ['sys', 'src_type', 'trgt_type', 'src_file', 'trgt_file']
    frames = []
    for fn in path.glob('*.txt'):
        content = str(fn.name).split('.')[0][1:-1]
        content = content.split('-')
        sys, src_type, trgt_type = content[0], content[2], content[4]

        with open(fn) as f:
            # NOTE(review): the last line of every file is discarded; this is
            # only correct if the files end with a blank/trailer line —
            # confirm against the data before changing it.
            links = f.readlines()[:-1]

        # Strip tokens *before* de-duplicating so that 'a.c' and 'a.c\n'
        # (a target that happens to be last on its line) count as the same
        # link. A dict keeps first-seen order, which makes the output
        # deterministic.
        pairs = {}
        for link in links:
            tokens = [token.strip() for token in link.split(' ')]
            src, trgts = tokens[0], tokens[1:]
            for trgt in trgts:
                # Trailing or doubled spaces produce empty tokens, not links
                if trgt:
                    pairs[(src, trgt)] = None

        if pairs:
            rows = [(sys, src_type, trgt_type, src, trgt) for src, trgt in pairs]
            frames.append(pd.DataFrame(rows, columns = columns))

    if not frames:
        return pd.DataFrame([], columns = columns)

    # Keep one copy of a link that shows up more than once; `keep = False`
    # would delete every copy and silently lose a real link.
    links_df = pd.concat(frames).drop_duplicates()
    return links_df

In [None]:
# Language of the testbed and the folder that holds all traceability testbeds
lang = 'english'
path = Path('../benchmarking/traceability/testbeds')
list(path.glob("*"))

[PosixPath('../benchmarking/traceability/testbeds/translations'),
 PosixPath('../benchmarking/traceability/testbeds/nltk'),
 PosixPath('../benchmarking/traceability/testbeds/bpe'),
 PosixPath('../benchmarking/traceability/testbeds/groundtruth')]

In [None]:
# Parse all ground-truth links, then keep only the rows of the 'libest' system
links_df = get_gt_links(path/'groundtruth'/lang, lang)
links_df = links_df[links_df['sys'] == 'libest']
links_df.head()

Unnamed: 0,sys,src_type,trgt_type,src_file,trgt_file
0,libest,req,src,RQ31.txt,est_server.c
1,libest,req,src,RQ48.txt,est_server_http.c
2,libest,req,src,RQ16.txt,est_server_http.c
3,libest,req,src,RQ34.txt,est_proxy.c
4,libest,req,src,RQ6.txt,est_client.c


In [None]:
# export
def get_non_gt(path, language, gt, n = 1):
    """Sample artifact pairs that are NOT ground-truth links.

    The source side and the target side of `gt` are shuffled independently
    (via `DataFrame.sample(frac = n)`) and paired up position by position.
    A pair is kept when both sides belong to the same system and the
    resulting row is not one of the rows of `gt`.

    Parameters
    ----------
    path, language
        Not used by this function; kept so existing callers keep working.
    gt : pandas.DataFrame
        Ground-truth links with the columns `sys`, `src_type`, `trgt_type`,
        `src_file` and `trgt_file`.
    n : float
        Fraction of `gt` rows sampled for each side (passed as `frac`).

    Returns
    -------
    pandas.DataFrame
        Distinct non-links with the same five columns and a 0..k-1 index.

    Notes
    -----
    The result is random. No `random_state` is passed to `sample`, so seed
    NumPy's global RNG beforehand if the output has to be reproducible.
    """
    columns = ['sys', 'src_type', 'trgt_type', 'src_file', 'trgt_file']

    # Set of known links: O(1) membership, and unaffected by repeated rows in
    # `gt` (the old concat/drop_duplicates length comparison was not).
    gt_links = set(gt[columns].itertuples(index = False, name = None))

    # Sample from the dataframe to introduce randomness and reduce number of links
    srcs = gt[['sys', 'src_type', 'src_file']].sample(frac = n)
    trgts = gt[['sys', 'trgt_type', 'trgt_file']].sample(frac = n)

    # A dict keeps every candidate once, in the order it was first drawn
    non_links = {}
    for src, trgt in zip(srcs.itertuples(name = None), trgts.itertuples(name = None)):
        src_id, src_sys, src_type, src_file = src
        trgt_id, trgt_sys, trgt_type, trgt_file = trgt
        # Same index label means both halves come from one gt row; pairs that
        # cross systems are never valid candidates.
        if src_id == trgt_id or src_sys != trgt_sys: continue

        candidate = (src_sys, src_type, trgt_type, src_file, trgt_file)
        # Check this row is not a ground truth link
        if candidate not in gt_links:
            non_links[candidate] = None

    return pd.DataFrame(list(non_links), columns = columns)

In [None]:
len(links_df), len(links_df.drop_duplicates())

(539, 539)

In [None]:
non_links_df = get_non_gt(path, lang, links_df)
non_links_df.head()

Unnamed: 0,sys,src_type,trgt_type,src_file,trgt_file
0,libest,req,tc,RQ32.txt,us3496.c
0,libest,req,tc,RQ23.txt,us3496.c
0,libest,req,tc,RQ48.txt,us1060.c
0,libest,req,src,RQ28.txt,est.c
0,libest,req,tc,RQ46.txt,us1060.c


In [None]:
len(non_links_df)

191

In [None]:
len(non_links_df)

134

In [None]:
len(pd.concat([links_df, non_links_df])), len(pd.concat([links_df, non_links_df]).drop_duplicates(keep = False))

(673, 673)

In [None]:
assert len(pd.concat([links_df, non_links_df])) == len(pd.concat([links_df, non_links_df]).drop_duplicates(keep=False))

In [None]:
path = Path('test_data'); list(path.glob('*'))

[PosixPath('test_data/LibEST_semeru_format'),
 PosixPath('test_data/java_tokenizer-vocab.json'),
 PosixPath('test_data/[libest-req2src-wmd].csv'),
 PosixPath('test_data/test.model'),
 PosixPath('test_data/[libest-sim-wmd].csv'),
 PosixPath('test_data/.ipynb_checkpoints'),
 PosixPath('test_data/test.vocab'),
 PosixPath('test_data/text.txt'),
 PosixPath('test_data/tst.csv'),
 PosixPath('test_data/config_corpus'),
 PosixPath('test_data/trn.csv'),
 PosixPath('test_data/[libest-req2tc-wmd].csv'),
 PosixPath('test_data/java_tokenizer-merges.txt'),
 PosixPath('test_data/val.csv')]

In [None]:
# WMD scores for requirement -> source-code pairs. The file has no header row
# and is space-delimited: source path, target path, score.
req2src_wmd_df = pd.read_csv(
    path/'[libest-req2src-wmd].csv',
    delimiter = ' ', header = None,
    names = ['src_file', 'trgt_file', 'wmd']
)
# Reduce each path to its bare file name and remove the '-pre' marker
for file_col in ['src_file', 'trgt_file']:
    req2src_wmd_df[file_col] = req2src_wmd_df[file_col].apply(
        lambda x: str(Path(x).name).replace('-pre', '')
    )
req2src_wmd_df['trgt_dtype'] = ['src'] * len(req2src_wmd_df)
req2src_wmd_df.head()

Unnamed: 0,src_file,trgt_file,wmd,trgt_dtype
0,RQ58.txt,est.h,0.340562,src
1,RQ23.txt,est_server_http.c,0.410977,src
2,RQ28.txt,est_server.h,0.386474,src
3,RQ53.txt,est_locl.h,0.447825,src
4,RQ22.txt,est_client_proxy.h,0.532702,src


In [None]:
# WMD scores for requirement -> test-case pairs. The file has no header row
# and is space-delimited: source path, target path, score.
req2tc_wmd_df = pd.read_csv(
    path/'[libest-req2tc-wmd].csv',
    delimiter = ' ', header = None,
    names = ['src_file', 'trgt_file', 'wmd']
)
# Reduce each path to its bare file name and remove the '-pre' marker
for file_col in ['src_file', 'trgt_file']:
    req2tc_wmd_df[file_col] = req2tc_wmd_df[file_col].apply(
        lambda x: str(Path(x).name).replace('-pre', '')
    )
req2tc_wmd_df['trgt_dtype'] = ['tc'] * len(req2tc_wmd_df)
req2tc_wmd_df.head()

Unnamed: 0,src_file,trgt_file,wmd,trgt_dtype
0,RQ11.txt,us3496.c,0.533299,tc
1,RQ2.txt,us1005.c,0.334297,tc
2,RQ45.txt,us1060.c,0.304404,tc
3,RQ22.txt,us895.c,0.618677,tc
4,RQ50.txt,us1060.c,0.234525,tc


In [None]:
# Stack both score tables into one frame with a fresh 0..n-1 index
wmd_df = pd.concat([req2src_wmd_df, req2tc_wmd_df], ignore_index = True)
wmd_df.head()

Unnamed: 0,src_file,trgt_file,wmd,trgt_dtype
0,RQ58.txt,est.h,0.340562,src
1,RQ23.txt,est_server_http.c,0.410977,src
2,RQ28.txt,est_server.h,0.386474,src
3,RQ53.txt,est_locl.h,0.447825,src
4,RQ22.txt,est_client_proxy.h,0.532702,src


In [None]:
# export
def add_wmd(links_df, wmd_df):
    """Attach the WMD score of `wmd_df` to the matching rows of `links_df`.

    Rows are matched on the (`src_file`, `trgt_file`) pair. Links without a
    score are dropped, and a score is copied to every link row that shares
    its pair (the previous row-by-row version raised a `ValueError` when a
    pair matched more than one link row).

    Parameters
    ----------
    links_df : pandas.DataFrame
        Links with at least the columns `src_file` and `trgt_file`. It is not
        modified. An existing `wmd` column is replaced.
    wmd_df : pandas.DataFrame
        Scores with the columns `src_file`, `trgt_file` and `wmd`; any other
        column is ignored.

    Returns
    -------
    pandas.DataFrame
        The matched rows of `links_df` plus a `wmd` column, sorted by
        `src_file` and `trgt_file`. Rows keep their index label from
        `links_df`.
    """
    keys = ['src_file', 'trgt_file']
    idx_col = '__links_index__'

    # `drop` returns a new frame, so the caller's data stays untouched
    links = links_df.drop(columns = ['wmd'], errors = 'ignore')
    # `merge` discards the left index; carry it along as a column instead
    links[idx_col] = links.index

    scores = wmd_df[keys + ['wmd']]
    new_links_df = links.merge(scores, on = keys, how = 'inner')
    new_links_df.index = pd.Index(new_links_df.pop(idx_col), name = links_df.index.name)

    new_links_df = new_links_df.sort_values(keys)
    return new_links_df

In [None]:
new_links_df = add_wmd(links_df.loc[links_df.sys == 'libest'].copy(), wmd_df)

In [None]:
len(new_links_df), len(wmd_df)

(57, 200)

In [None]:
new_links_df.head()

Unnamed: 0,sys,src_type,trgt_type,src_file,trgt_file,wmd
169,libest,req,src,RQ13.txt,est_locl.h,0.509546
14,libest,req,src,RQ15.txt,est_server.c,0.405939
37,libest,req,src,RQ17.txt,est_locl.h,0.484459
46,libest,req,src,RQ18.txt,est_server_http.c,0.471456
97,libest,req,src,RQ19.txt,est_locl.h,0.50673


# SCRATCH CODE

In [None]:

def check_row(df, row):
    """Return a boolean mask over `df` marking the rows whose `src_file` and
    `trgt_file` both equal those of `row`."""
    same_src = df['src_file'] == row.src_file
    same_trgt = df['trgt_file'] == row.trgt_file
    return same_src & same_trgt

In [None]:
def add_wmd(links_df, wmd_df):
    """Attach the WMD score of `wmd_df` to the matching rows of `links_df`.

    Rows are matched on the (`src_file`, `trgt_file`) pair; links without a
    score are dropped. A score is copied to every link row sharing its pair,
    whereas assigning the one-element list `[row.wmd]` raised a `ValueError`
    as soon as a pair matched more than one link row.

    `links_df` is not modified (an existing `wmd` column is replaced in the
    result). `wmd_df` must provide `src_file`, `trgt_file` and `wmd`.

    Returns the matched rows plus a `wmd` column, sorted by `src_file` and
    `trgt_file`, each row keeping its index label from `links_df`.
    """
    keys = ['src_file', 'trgt_file']
    idx_col = '__links_index__'

    # `drop` returns a new frame, so the caller's data stays untouched
    links = links_df.drop(columns = ['wmd'], errors = 'ignore')
    # `merge` discards the left index; carry it along as a column instead
    links[idx_col] = links.index

    new_links_df = links.merge(wmd_df[keys + ['wmd']], on = keys, how = 'inner')
    new_links_df.index = pd.Index(new_links_df.pop(idx_col), name = links_df.index.name)

    new_links_df = new_links_df.sort_values(keys)
    return new_links_df
    
    
#     keys = list(wmd_df.columns[:2].values)
#     i1 = links_df.set_index(keys).index
#     i2 = wmd_df.set_index(keys).index
#     links_df = links_df[~i1.isin(i2)].copy()
#     links_df = links_df.sort_values(['src_file', 'trgt_file'])
    
#     keys = list(links_df.columns[-2:].values)
#     i1 = wmd_df.set_index(keys).index
#     i2 = links_df.set_index(keys).index
#     wmd_df = wmd_df[~i1.isin(i2)].copy()
#     wmd_df = wmd_df.sort_values(['src_file', 'trgt_file'])
    
#     return links_df, wmd_df
#     wmd_df = wmd_df.loc[wmd_df.src_file.isin(links_df.src_file)].copy()
#     wmd_df = wmd_df.loc[wmd_df.trgt_file.isin(links_df.trgt_file)].copy()
    
#     links_df = links_df.loc[links_df.src_file.isin(wmd_df.src_file)].copy()
#     links_df = links_df.loc[links_df.trgt_file.isin(wmd_df.trgt_file)].copy()
#     links_df = links_df.sort_values(['src_file', 'trgt_file'])
    
#     return links_df, wmd_df
    
    
#     links_df['wmd'] = [None] * len(links_df)
#     links_wmd_df = links_df.loc[links_df.src_file.isin(wmd_df.src_file)].copy()
#     links_wmd_df.mask(links_wmd_df.trgt_file.isin)
# #     links_wmd_df.loc[links_df.src_file.isin(wmd_df)].wmd = wmd_df.wmd.values
#     print(links_wmd_df.head())
#     pass

In [None]:
def add_wmd(links_df, wmd_df):
    """Return the rows of `links_df` that have a score in `wmd_df`, with that
    score attached as a `wmd` column.

    For every row of `wmd_df` the links are narrowed down first by
    `src_file` and then by `trgt_file`. Matches are collected in the order
    in which the scores appear and keep their index label from `links_df`.
    `links_df` itself is not modified.

    This scratch draft used to print each partial match and then fail with a
    `NameError`, because the returned `wmd_links_df` was never assigned; it
    also built the target mask from `wmd_df` instead of the link rows.
    """
    matches = []
    for _, row in wmd_df.iterrows():
        wmd_row = links_df.loc[links_df.src_file == row.src_file]
        # Filter on the link rows themselves, not on `wmd_df`
        wmd_row = wmd_row.loc[wmd_row.trgt_file == row.trgt_file]
        if len(wmd_row) > 0:
            # `assign` builds a new frame and broadcasts the scalar score
            matches.append(wmd_row.assign(wmd = row.wmd))

    if not matches:
        # No link has a score: empty frame with the expected columns
        return links_df.iloc[:0].assign(wmd = pd.Series(dtype = float))

    wmd_links_df = pd.concat(matches)
    return wmd_links_df

In [None]:

def get_non_gt(path, language, gt, n = 500):
    """Scratch draft: print every (system, file) pair found in `gt` in random
    order.

    The source files and the target files of `gt` are put into one list, each
    paired with the system of its row, shuffled with `random.shuffle` and
    printed one pair per line. `path`, `language` and `n` are not used and
    nothing is returned.
    """
    src_names = gt['src_file'].to_list()
    trgt_names = gt['trgt_file'].to_list()

    # One system entry per file: first all sources, then all targets
    syses = gt['sys'].to_list() * 2
    files = [Path(name) for name in src_names + trgt_names]

    pot_links = list(zip(syses, files))
    random.shuffle(pot_links)

    for entry in pot_links:
        print(*entry)

In [None]:
get_non_gt(path, lang, links_df)

libest us900.c

libest us748.c
itrust src.edu.ncsu.csc.itrust.dao.mysql.TransactionDAO.java

itrust src.edu.ncsu.csc.itrust.dao.mysql.AuthDAO.java

itrust UC36S1.txt
libest us900.c

itrust UC26S3.txt
itrust UC3S4.txt
itrust UC34S1.txt
itrust UC34E4.txt
itrust UC35S1.txt
itrust UC3S1.txt
libest us3612.c
libest us3496.c
itrust UC5S2.txt
libest est_server_http.c

libest RQ31.txt
libest us3512.c
itrust UC10S1.txt
libest us897.c
itrust src.edu.ncsu.csc.itrust.action.GetUserNameAction.java
libest RQ57.txt
itrust src.edu.ncsu.csc.itrust.action.EmergencyReportAction.java
itrust UC2S2.txt
libest RQ22.txt
libest RQ46.txt
itrust UC36S3.txt
itrust src.edu.ncsu.csc.itrust.action.LabProcUAPAction.java
libest us1060.c

itrust src.edu.ncsu.csc.itrust.validate.PatientValidator.java
itrust UC36S2.txt
libest RQ26.txt
libest est_locl.h

itrust src.edu.ncsu.csc.itrust.validate.PersonnelValidator.java

libest est.c
itrust UC15S1.txt
itrust UC35S1.txt
itrust WebRoot.auth.patient.viewPrescriptionRecords.jsp
l

libest RQ35.txt
libest est_client.c
libest RQ51.txt
itrust src.edu.ncsu.csc.itrust.dao.mysql.AuthDAO.java
itrust UC23S1.txt
itrust src.edu.ncsu.csc.itrust.dao.mysql.OfficeVisitDAO.java
libest us896.c
libest RQ39.txt
itrust src.edu.ncsu.csc.itrust.action.SendMessageAction.java
itrust src.edu.ncsu.csc.itrust.action.GetUserNameAction.java
libest us900.c

libest us748.c
libest us901.c

libest est.c
itrust UC21S1.txt
itrust src.edu.ncsu.csc.itrust.dao.mysql.AuthDAO.java
libest RQ18.txt
itrust UC23E2.txt
libest us3496.c
libest us899.c
libest us3512.c
libest RQ49.txt
itrust src.edu.ncsu.csc.itrust.action.ViewMyAccessLogAction.java
libest RQ34.txt
itrust src.edu.ncsu.csc.itrust.dao.mysql.LOINCDAO.java
itrust WebRoot.auth.admin.hospitalListing.jsp
libest est_client.c
itrust UC15S1.txt
itrust WebRoot.auth.hcp.reply.jsp
libest RQ32.txt
libest RQ35.txt
libest RQ29.txt
libest RQ26.txt
libest RQ49.txt
libest RQ18.txt
itrust UC24.txt
libest RQ13.txt
libest RQ11.txt
libest RQ51.txt
itrust UC34S5.txt
l

In [None]:

def get_non_gt(path, language, gt):
    """Scratch draft: build non-links by pairing `.bpe` files found under `path`.

    All `*.bpe` files below `path` are collected recursively. The name of a
    file's parent directory is taken as its artifact type and the name of the
    directory above that as its system. Files of type 'req' (at most the
    first 500 after a shuffle) are paired with files of any other type (again
    at most 500 after a fresh shuffle), skipping every pair listed in `gt`.

    Parameters
    ----------
    path : pathlib.Path
        Root directory searched for `.bpe` files.
    language
        Not used.
    gt : pandas.DataFrame
        Known links; only the columns `from_file` and `to_file` are read.

    Returns
    -------
    pandas.DataFrame
        Columns `sys`, `from_type`, `to_type`, `from_file`, `to_file`,
        `from_doc`, `to_doc`; the two `*_doc` columns hold the file content
        split on single spaces. `sys` is the system of the 'req' file.
    """
    # A set gives O(1) look-ups inside the double loop below
    existing_links = {
        '->'.join(link)
        for link in zip(gt['from_file'].to_list(), gt['to_file'].to_list())
    }

    all_non_links = []
    bpe_files = list(path.glob('**/*.bpe'))
    random.shuffle(bpe_files)
    # The slices are copies, so re-shuffling inside the loop is safe
    for i in bpe_files[:500]:
        sys = i.parent.parent.name
        from_type = i.parent.name
        if str(from_type) != 'req': continue
        with open(i) as f:
            i_content = f.read().split(' ')
        random.shuffle(bpe_files)
        for j in bpe_files[:500]:
            if i == j: continue
            if '->'.join([i.name, j.name]) in existing_links: continue
            to_type = j.parent.name
            if str(to_type) == 'req': continue
            with open(j) as f:
                j_content = f.read().split(' ')
            all_non_links.append([sys, from_type, to_type, i.name, j.name, i_content, j_content])

    all_non_links = pd.DataFrame(all_non_links, columns = [
        'sys', 'from_type', 'to_type', 'from_file', 'to_file', 'from_doc', 'to_doc'
    ])
    return all_non_links

In [None]:
def get_ground_truth(path, language):
    """Scratch draft: load the ground-truth links together with the BPE
    content of both linked artifacts.

    Every `*.txt` file directly inside `path` is parsed. The part of the file
    name before the first '.' has its first and last character removed and is
    split on '-'; fields 0, 2 and 4 become the system, the source type and
    the target type. Each non-blank line is a space-separated list: the first
    token is the source artifact, the rest are its targets. Artifact names
    are mapped to `.bpe` file names (targets keep only their last two
    dot-separated parts) and read from
    `path.parent.parent/'bpe'/language/<sys>/<type>/`.

    Parameters
    ----------
    path : pathlib.Path
        Directory with the ground-truth `*.txt` files.
    language : str
        Sub-directory of the `bpe` folder to read the artifacts from.

    Returns
    -------
    pandas.DataFrame
        Columns `sys`, `from_type`, `to_type`, `from_file`, `to_file`,
        `from_doc`, `to_doc`, one row per link; the `*_doc` columns hold the
        file content split on single spaces.

    Raises
    ------
    FileNotFoundError
        If a referenced `.bpe` file does not exist.
    """
    columns = ['sys', 'from_type', 'to_type', 'from_file', 'to_file', 'from_doc', 'to_doc']
    bpe_root = path.parent.parent/'bpe'/language

    # Collect plain records and build the frame once: `DataFrame.append` was
    # removed in pandas 2.0 and was quadratic when used row by row.
    records = []
    for fn in path.glob('*.txt'):
        content = str(fn.name).split('.')[0][1:-1]
        content = content.split('-')

        sys, from_type, to_type = content[0], content[2], content[4]

        with open(fn) as f:
            # Skip blank lines instead of blindly cutting the last element,
            # so a file without a trailing newline keeps its last link.
            links = [line for line in f.read().splitlines() if line.strip()]

        for link in links:
            link = link.split(' ')
            root, children = link[0], link[1:]
            root = Path(root).with_suffix('.bpe').name
            with open(bpe_root/sys/from_type/root) as f:
                root_content = f.read().split(' ')
            children = [Path(child).with_suffix('.bpe').name for child in children]
            # Keep only '<name>.bpe' of dotted (package-style) target names
            children = [Path('.'.join(str(child).split('.')[-2:])) for child in children]
            for child in children:
                with open(bpe_root/sys/to_type/child) as f:
                    child_content = f.read().split(' ')
                records.append({'sys': sys,
                                'from_type': from_type,
                                'to_type': to_type,
                                'from_file': root,
                                'to_file': str(child),
                                'from_doc': root_content,
                                'to_doc': child_content})

    all_links = pd.DataFrame(records, columns = columns)
    return all_links

In [None]:
def get_non_gt(path, language, gt, n = 0.5):
    """Scratch draft: sample artifact pairs that are NOT ground-truth links.

    The source side and the target side of `gt` are shuffled independently
    (via `DataFrame.sample(frac = n)`) and paired up position by position.
    A pair is kept when both sides belong to the same system and the
    resulting row is not one of the rows of `gt`. A pair that is drawn more
    than once shows up more than once in the result.

    Parameters
    ----------
    path, language
        Not used.
    gt : pandas.DataFrame
        Ground-truth links with the columns `sys`, `src_type`, `trgt_type`,
        `src_file` and `trgt_file`.
    n : float
        Fraction of `gt` rows sampled for each side (passed as `frac`).

    Returns
    -------
    pandas.DataFrame
        Non-links with the same five columns and a 0..k-1 index.
    """
    columns = ['sys', 'src_type', 'trgt_type', 'src_file', 'trgt_file']

    # Exact membership test. Comparing `len(gt)` with the length of a
    # de-duplicated concat let real links through whenever `gt` itself
    # contained a repeated row.
    gt_links = set(gt[columns].itertuples(index = False, name = None))

    # Sample from the dataframe to introduce randomness and reduce number of links
    srcs = gt[['sys', 'src_type', 'src_file']].sample(frac = n)
    trgts = gt[['sys', 'trgt_type', 'trgt_file']].sample(frac = n)

    non_links = []
    for src, trgt in zip(srcs.itertuples(name = None), trgts.itertuples(name = None)):
        src_id, src_sys, src_type, src_file = src
        trgt_id, trgt_sys, trgt_type, trgt_file = trgt
        # Same index label means both halves come from one gt row; pairs that
        # cross systems are never valid candidates.
        if src_id == trgt_id or src_sys != trgt_sys: continue

        candidate = (src_sys, src_type, trgt_type, src_file, trgt_file)
        if candidate not in gt_links:
            non_links.append(candidate)

    non_links_df = pd.DataFrame(non_links, columns = columns)
    return non_links_df

In [None]:
path = Path('../benchmarking/traceability/testbeds/nltk'); list(path.glob('*'))
se_csvs = list(path.glob('*pre*.csv'))

In [None]:
df = pd.read_csv(se_csvs[0], names=['file', 'text'], header=None, delimiter = ' ')

In [None]:
df.head()

In [None]:
for fn in path.glob('*pre*.csv'):
    df = pd.read_csv(fn, names=['file', 'text'], header=None)
    print(df.head())

In [None]:
def get_ground_truth(path, language):
    """Load the ground-truth links together with the BPE content of both
    linked artifacts.

    Every `*.txt` file directly inside `path` is parsed. The part of the file
    name before the first '.' has its first and last character removed and is
    split on '-'; fields 0, 2 and 4 become the system, the source type and
    the target type. Each non-blank line is a space-separated list: the first
    token is the source artifact, the rest are its targets. Artifact names
    are mapped to `.bpe` file names (targets keep only their last two
    dot-separated parts) and read from
    `path.parent.parent/'bpe'/language/<sys>/<type>/`.

    Parameters
    ----------
    path : pathlib.Path
        Directory with the ground-truth `*.txt` files.
    language : str
        Sub-directory of the `bpe` folder to read the artifacts from.

    Returns
    -------
    pandas.DataFrame
        Columns `sys`, `from_type`, `to_type`, `from_file`, `to_file`,
        `from_doc`, `to_doc`, one row per link; the `*_doc` columns hold the
        file content split on single spaces.

    Raises
    ------
    FileNotFoundError
        If a referenced `.bpe` file does not exist.
    """
    columns = ['sys', 'from_type', 'to_type', 'from_file', 'to_file', 'from_doc', 'to_doc']
    bpe_root = path.parent.parent/'bpe'/language

    # Collect plain records and build the frame once: `DataFrame.append` was
    # removed in pandas 2.0 and was quadratic when used row by row.
    records = []
    for fn in path.glob('*.txt'):
        content = str(fn.name).split('.')[0][1:-1]
        content = content.split('-')

        sys, from_type, to_type = content[0], content[2], content[4]

        with open(fn) as f:
            # Skip blank lines instead of blindly cutting the last element,
            # so a file without a trailing newline keeps its last link.
            links = [line for line in f.read().splitlines() if line.strip()]

        for link in links:
            link = link.split(' ')
            root, children = link[0], link[1:]
            root = Path(root).with_suffix('.bpe').name
            with open(bpe_root/sys/from_type/root) as f:
                root_content = f.read().split(' ')
            children = [Path(child).with_suffix('.bpe').name for child in children]
            # Keep only '<name>.bpe' of dotted (package-style) target names
            children = [Path('.'.join(str(child).split('.')[-2:])) for child in children]
            for child in children:
                with open(bpe_root/sys/to_type/child) as f:
                    child_content = f.read().split(' ')
                records.append({'sys': sys,
                                'from_type': from_type,
                                'to_type': to_type,
                                'from_file': root,
                                'to_file': str(child),
                                'from_doc': root_content,
                                'to_doc': child_content})

    all_links = pd.DataFrame(records, columns = columns)
    return all_links

In [None]:
def get_non_ground_truth(path, language, gt):
    """Build non-links by pairing `.bpe` files found under `path`.

    All `*.bpe` files below `path` are collected recursively. The name of a
    file's parent directory is taken as its artifact type and the name of the
    directory above that as its system. Files of type 'req' (at most the
    first 500 after a shuffle) are paired with files of any other type (again
    at most 500 after a fresh shuffle), skipping every pair listed in `gt`.

    Parameters
    ----------
    path : pathlib.Path
        Root directory searched for `.bpe` files.
    language
        Not used.
    gt : pandas.DataFrame
        Known links; only the columns `from_file` and `to_file` are read.

    Returns
    -------
    pandas.DataFrame
        Columns `sys`, `from_type`, `to_type`, `from_file`, `to_file`,
        `from_doc`, `to_doc`; the two `*_doc` columns hold the file content
        split on single spaces. `sys` is the system of the 'req' file.
    """
    # A set gives O(1) look-ups inside the double loop below
    existing_links = {
        '->'.join(link)
        for link in zip(gt['from_file'].to_list(), gt['to_file'].to_list())
    }

    all_non_links = []
    bpe_files = list(path.glob('**/*.bpe'))
    random.shuffle(bpe_files)
    # The slices are copies, so re-shuffling inside the loop is safe
    for i in bpe_files[:500]:
        sys = i.parent.parent.name
        from_type = i.parent.name
        if str(from_type) != 'req': continue
        with open(i) as f:
            i_content = f.read().split(' ')
        random.shuffle(bpe_files)
        for j in bpe_files[:500]:
            if i == j: continue
            if '->'.join([i.name, j.name]) in existing_links: continue
            to_type = j.parent.name
            if str(to_type) == 'req': continue
            with open(j) as f:
                j_content = f.read().split(' ')
            all_non_links.append([sys, from_type, to_type, i.name, j.name, i_content, j_content])

    all_non_links = pd.DataFrame(all_non_links, columns = [
        'sys', 'from_type', 'to_type', 'from_file', 'to_file', 'from_doc', 'to_doc'
    ])
    return all_non_links

In [None]:
def gen_gt_ngt(path, lang):
    """Return the ground-truth links and the sampled non-links of a testbed.

    The links are read from `path/'groundtruth'/lang` with `get_ground_truth`;
    the non-links are generated from `path/'bpe'/lang` with
    `get_non_ground_truth`, which receives those links as the pairs to avoid.
    The result is the tuple `(gt, ngt)`.
    """
    gt_dir = path/'groundtruth'/lang
    gt = get_ground_truth(gt_dir, lang)

    bpe_dir = path/'bpe'/lang
    return gt, get_non_ground_truth(bpe_dir, lang, gt)

In [None]:
from nbdev.export import notebook2script
notebook2script()

Converted 00_mgmnt.prep.i.ipynb.
Converted 01_exp.i.ipynb.
Converted 02_mgmnt.db.mongo.ipynb.
Converted 03_repr.i.ipynb.
Converted 04_mining.ir.model.ipynb.
Converted 05_mining.ir.i.ipynb.
Converted 06_benchmark.traceability.ipynb.
Converted 07_repr.roberta.train.ipynb.
Converted 08_exp.info.ipynb.
Converted 09_desc.stats.ipynb.
Converted 10_vis.ipynb.
Converted 11_mgmnt.prep.conv.ipynb.
Converted 12_repr.roberta.eval.ipynb.
Converted 14_mgmnt.prep.bpe.ipynb.
Converted 15_desc.metrics.se.ipynb.
Converted 16_repr.word2vec.train.ipynb.
Converted 17_repr.doc2vec.train.ipynb.
Converted 18_repr.doc2vec.eval.ipynb.
Converted 19_repr.word2vec.eval.ipynb.
Converted 20_benchmark.codegen.ipynb.
Converted 21_inf.i.ipynb.
Converted 22_inf.bayesian.ipynb.
Converted 23_inf.causal.ipynb.
Converted 24_mgmnt.corpus.ipynb.
Converted 25_mgmnt.prep.traceability.ipynb.
Converted 26_mgmnt.prep.nltk.ipynb.
Converted aa_blog.example.ipynb.
Converted ab_templates.example.ipynb.
Converted ac_emp.eval.pp1.rq1.ip