## Детекция названий препаратов

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import re
import time
import nltk
import sklearn
from collections import Counter
from nltk import word_tokenize
import _pickle

In [2]:
#nltk.download('punkt')

In [3]:
data = pd.read_csv('clitical_trials_100000.tsv', delimiter='\t', encoding='utf-8').values

In [4]:
def words_by_coord (inform):
    coords = re.findall("(\d+, \d+)", inform[0][1:-1])
    drugs_coord = []
    names = []
    for i in coords:
        drugs_coord.append(re.split(", ", i))
    for k in drugs_coord:
        names.append(inform[1][int(k[0]):int(k[1])])
    return names

Создадим список всех известных препаратов.

In [5]:
buf = []
for i in data:
    buf += words_by_coord(i)
known_drugs = set (buf)
known_drugs

{'virtually',
 'escherichia',
 'triplet',
 'effectiveness',
 'dimerization',
 'albumin-bound',
 'nontaxane',
 'gun',
 'targretin',
 'arabinoside',
 'ciprofibrate',
 'anti-angiogenetic',
 'hormonotherapy',
 'quinazoline',
 'chemo-sensitive',
 'repurpose',
 'a1',
 'calmodulin',
 'erk-mapk',
 'amiloride',
 'cognitive-behavioural',
 'engineer',
 'el',
 'bexarotene',
 'non-effective',
 'ile',
 'ipl',
 'pyrazolo',
 'anthracycline-and',
 'daylight-photodynamic',
 'her2-directed',
 'boron',
 'standard-fluence',
 'il-21',
 '2.3',
 'h-1',
 'topoisomerase-1',
 'uk',
 'fragment-derived',
 'tumorogenesis',
 'antihormonal',
 'plegridy',
 'dissociation',
 'pan',
 'prognostication',
 'girentuximab-based',
 '53',
 'tomodirect',
 'emollient',
 'electro',
 'epithelium-derived',
 'huvec',
 'mini-pulse',
 'p53-inducible',
 'lv',
 'postimmunotherapy',
 'decoy',
 'multiple-targeted',
 're-treatment',
 'rm.s-e6c10',
 "'uvb",
 'non-invasive',
 'mri-based',
 'rtk',
 'their',
 'dpp-iv',
 'alendronate',
 'fiber',

Видим, что очень много странных меток. Посмотрим на частотность.

In [6]:
Drugs_fr = Counter(buf)

Для начала попробуем простые модели.
Сделаем фичи: первая буква, вторая, предпоседняя и последняя в названии препаратов. Не будем делать поправку на то, сколько раз слово встречается в исходных данных. Это позволит нам выявить закономерности в названиях самих препаратов, и избавиться от неверных слов типа терапия. Контекст пока никак не учитывается.
Изучив остальные слова обнаруживаем, что в названиях препаратов не содержатся некоторые символы.  
Итак: выделяем признаки из названий препаратов, затем повторяем это для всех остальных слов, предварительно удалив заведомо плохие.

In [7]:
list_drugs = list(known_drugs)
features = np.empty((len(list_drugs),4), dtype=str)
for i, word in enumerate(np.array(list_drugs)):
    if (len(word) > 2):
        features[i,0] = word[0]
        features[i,1] = word[1]
        features[i,2] = word[-2] 
        features[i,3] = word[-1]
encoder = sklearn.preprocessing.OneHotEncoder()
X_drugs = encoder.fit_transform(features)

Теперь добавим длину слова - названия препаратов очень часто большие. Также найдем количество букв.

In [8]:
features = np.empty((len(list_drugs), 2))
for i, word in enumerate(np.array(list_drugs)):
    features[i, 0] = len(word)
    features[i, 1] = len(re.findall('[a-zA-Z]', word))
X_drugs = sp.sparse.hstack((X_drugs,features))

In [9]:
X_drugs.shape

(10842, 171)

Попробуем линейную модель и, несмотря на разрежеенность матрицы признаков, случайный лес. Но для этого нам необходимо выбрать функцию потерь и набрать множество слов, которые не являются названиями препаратов. После этого мы будем иметь задачу бинарной классификации.

In [29]:
for i in data:
    if (re.findall('\*', i[1])):
        print (i[1])

chronic idiopathic diarrhea be the passage of loose stool > 3 time daily , or a stool weight > 200 g d , persist for > 4 week without clear clinical cause . patient refractory to standard anti-diarrhetic have limit treatment option . somatostatin analogue have the ability to reduce gastrointestinal secretion and motility . this study evaluate the efficacy and safety of lanreotide autogel ( * ) 120 mg in chronic idiopathic diarrhea . other anti-diarrhetic be not allow during the study and be stop at screening . patient receive lanreotide autogel 120 mg at baseline and day 28 . stool frequency and consistency ( bristol stool scale ) be record ; quality of life ( qol ) be assess use the 36 item short form health survey and irritable bowel syndrome qol questionnaire ; adverse event be monitor . the primary outcome be the proportion of patient with a reduction of ?50 % or normalization to a mean of ?3 stool d at day 28 . thirty-three patient with > 3 stool d at baseline be include ; mean ( 

pregnancy-associated plasma protein-a ( pappa ) , also know as pappalysin , be a member of the insulin-like growth factor ( igf ) family . pappa act as a protease , cleave igf inhibitor , i.e. , igf binding protein ( igfbp ) , thereby set free igf . the insulin igf-axis be involve in cancer in general and in ewing sarcoma ( es ) in particular . es be a highly malignant bone tumor characterize by early metastatic spread . pappa be associate with various cancer . it be overexpresse and require for proliferation in es . pappa also stimulate normal bone growth . we isolate hla-a*02:01 ( + ) peptide-restricted t cell from a*02:01 ( - ) healthy donor direct against pappa , generate by prim with a*02:01 ( + ) pappa peptide load dendritic cell . after tcr identification , retrovirally tcr transduce cd8 ( + ) t cell be assess for their in vitro specificity and in vivo efficacy in human es bear rag2 ( - - ) ?c ( - - ) mouse . engraftment in mouse and tumor infiltration of tcr transgenic t cell i

to evaluate the effect of genetic polymorphism of drug metabolize enzyme on the pharmacokinetic of cyclophosphamide and its active metabolite , 4 hydroxycyclophosphamide , and on the pharmacodynamic . one hundred and three japanese patient with malignant lymphoma or breast cancer treat with cyclophosphamide ( 500 750 mg m ) participate in this study . the plasma concentration of cyclophosphamide and 4 hydroxycyclophosphamide be determine by high-performance liquid chromatography , and pharmacokinetic parameter be calculate . the genotype of cyp2b6 , cyp2c19 , cyp3a4 , cyp3a5 , aldh1a1 , gst gene be determine by allele-specific polymerase chain reaction or polymerase chain reaction-restriction-fragment length polymorphism . a large interindividual difference ( 54 fold ) be observe in the area under the curve ratio of 4 hydroxycyclophosphamide cyclophosphamide calculate as the metabolic index . we first prove that leukocytopenia and neutropenia be significantly ( p < 0.01 ) relate to the

## Длительное вычисление, можно просто выгрузить из файла.

In [27]:
all_words = set()
for i in data:
    tokenized_word = word_tokenize(i[1])
    for q, word in enumerate(tokenized_word):
        if(re.findall("\*", word)):
            tokenized_word.pop(q)
    all_words.update(tokenized_word)
all_words.remove(".")
all_words.remove(",")

KeyboardInterrupt: 

In [21]:
with open('all_words.txt', 'wb') as file:
    _pickle.dump(all_words, file)

In [22]:
with open('all_words.txt', 'rb') as file:
    all_words = set(_pickle.load(file))

In [23]:
non_drugs = all_words.difference(known_drugs)

In [24]:
non_drugs

{'4d-listmode-pet-based',
 'nok',
 '607',
 'rasterscanning',
 'flot',
 'cytarabine-associated',
 'sudan',
 'non-cdx',
 'complex-i',
 'eturbt',
 'cutaneoust-cell',
 'profit',
 'pcdna',
 "'sandwiche",
 '2-529',
 '2pn',
 "l'esame",
 'attribution-noncommercial-noderivs',
 '.455',
 'oligoasthenospermia',
 'leuprolide-downregulated',
 'microtubule-binding',
 'llium',
 'pcad',
 '169.31',
 'depolarise',
 'skp',
 'lrfa',
 'post-allobmt',
 'rad30',
 'palbociclib-letrozole',
 'site-to-site',
 'percutaneus',
 'dimethylpyridin',
 'cellgro',
 'cinnamon',
 'enzyme-labile',
 'pchbcec',
 'immunohistological',
 'near-tissue',
 'fh535',
 'fhp',
 'g.15582c',
 'o-atom',
 'sim-nif',
 'mrdt',
 'ethan-1-ol',
 'diseasefree',
 'mtx-containing',
 'courage',
 'nisdb',
 'slb',
 'proenkephalin',
 'eki-785',
 'serine-replaced',
 'isrctn38231611',
 'grp94-positive',
 'siepp',
 'nonactivated',
 'ostensible',
 "'pyrexia",
 'b-pet',
 'aids-relative',
 'forty-five',
 'universitario-albacete',
 'malleolar',
 'vein',
 'mot

Теперь прогоним слова не являющиеся названиями препаратов через те же преобразования и получим матрицу признаков.

In [25]:
list_non_drugs = list(non_drugs)
features = np.empty((len(list_non_drugs),4), dtype=str)
for i, word in enumerate(np.array(list_non_drugs)):
    if (len(word) > 2):
        features[i,0] = word[0]
        features[i,1] = word[1]
        features[i,2] = word[-2] 
        features[i,3] = word[-1]
    else:
        features[i,0] = ''
        features[i,1] = ''
        features[i,2] = ''
        features[i,3] = ''
X_non_drugs = encoder.transform(features).toarray()

features = np.empty((len(list_drugs), 2))
for i, word in enumerate(np.array(list_non_drugs)):
    features[i, 0] = len(word)
    features[i, 1] = len(re.findall('[a-zA-Z]', word))
X_non_drugs = sp.sparse.hstack((X_non_drugs,features))

ValueError: Found unknown categories ['*', ',', ':', '=', '_', '|', '~'] in column 0 during transform

In [None]:
with open('features_non_drugs.txt', 'wb') as file:
    _pickle.dump(X_non_drugs, file)
with open('features_non_grugs.txt', 'rb') as file:
    X_non_drugs = _pickle.load(file)