<a id='sec0'></a>
# Feature Engineering2
- Importing Data
- <a href='#sec1'>Exemplary Text Analysis</a>
  - <a href='#sec1_1'>Old way</a>
  - <a href='#sec1_2'>Some useful regex and lists</a>
  - <a href='#sec1_3'>Replace periods, commas, hyphens, brackets with a space, and then tokenize</a>


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from nltk import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

sns.set_context("paper")
%matplotlib inline

<b>Importing train_text</b>

In [2]:
class_train = pd.read_csv('train_variants')
text_train = pd.read_csv("train_text", sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])

In [3]:
print(class_train.head())
print(text_train.head())

   ID    Gene             Variation  Class
0   0  FAM58A  Truncating Mutations      1
1   1     CBL                 W802*      2
2   2     CBL                 Q249E      2
3   3     CBL                 N454D      3
4   4     CBL                 L399V      4
   ID                                               Text
0   0  Cyclin-dependent kinases (CDKs) regulate a var...
1   1   Abstract Background  Non-small cell lung canc...
2   2   Abstract Background  Non-small cell lung canc...
3   3  Recent evidence has demonstrated that acquired...
4   4  Oncogenic mutations in the monomeric Casitas B...


<a id='sec1'></a>
# More feature extractions  (<a href='#sec0'>Back To Top</a>)

In [4]:
%%time
txt = ''
for i in range(len(text_train)):
    text = text_train.loc[i, 'Text'] + ''
    txt += text

text_white = txt.encode().decode()  # copy a string?!
text_white = text_white.replace('"', ' ')
text_white = text_white.replace('.', ' ')
text_white = text_white.replace('/', ' ')
text_white = text_white.replace('\'', ' ')
text_white = text_white.replace('_', ' ')    # This should be done after _pos, etc checked
text_white = text_white.replace('-', ' ')    # This should be done after -pos, etc checked
text_white = text_white.replace('=', ' ')
text_white = text_white.replace('\n', ' ')
text_white = text_white.replace('\\n', ' ')
text_white = text_white.replace('\'', ' ')
text_white = re.sub(' +',' ', text_white) 
text_white = text_white.replace('\'', ' ')
text_white = text_white.replace('(', ' ')
text_white = text_white.replace(')', ' ')
text_white = text_white.replace('[', ' ')
text_white = text_white.replace(']', ' ')
text_white = text_white.replace('{', ' ')
text_white = text_white.replace('}', ' ')    

word_tokens = word_tokenize(text_white)

stop_words = set(stopwords.words('english'))
word_tokens = [word for word in word_tokens if word not in stop_words]

CPU times: user 2min 2s, sys: 3.4 s, total: 2min 5s
Wall time: 2min 5s


<b>Some useful lists</b>

In [30]:
races = ['asian', 'hispanic', 'african', 'caucasian', 'native american', 'indian',
         'pacific islander', 'black', 'white', 'latino']

biowords = ['promoter', 'enhancer', 'neuron', 'marrow',
            'loss-of-heterozygosity', 'loss of heterozygosity', 'progenitor', 'pluripotent']

organs = ['brain', 'liver','skin', 'stomach', 'gastric', 'intestine', 'intestinal', 'colon', 'rectum', 'rectal',
          'prostate', 'breast', 'ovary', 'ovarian', 'kidney', 'renal', 'adrenal', 'gland',
          'thyroid', 'esophogus', 'esophogal', 'bone', 'spinal', 'heart', 'cardiac', 'pancreas', 'pancreatic',
          'spleen', 'splenic', 'bladder', 'gallbladder', 'lung', 'parathyroid', 'pituitary',
          'cervix', 'cervical', 'skeletal']

In [8]:
commoners = ['RT', 'PCR', 'RT-PCR', 'DNA', 'cDNA', 'RNA', 'mRNA', 'siRNA', 'shRNA', 'protein', 'basis'
             'cell', 'cancer', 'CHIP', 'FISH', 'SDS-PAGE', 'UK', 'USA', 'GST', 'GFP', 'SDS', 'PAGE',
             'qPCR', 'PBS', 'TBS', 'DTT', 'BSA', 'HSA', 'HCl', 'NCBI', 'PBST', 'ANOVA', 'RIKEN',
             'COHORT', 'OUTCOME', 'AIRWAY', 'EMSA', 'analysis', 'electrophoresis', 'hypothesis', 'hypothetical',
             'nonparametric', 'Malaysia', 'Asia', 'Indonesia', 'Pan-Asia', 'Russia', 'Romania', 'Media', 'media',
             'australasia', 'tunisia']

<b>Words for races</b>

In [31]:
%%time
txt_for_races = [word.lower() for word in word_tokens if word.lower() in races]
txt_for_races = [re.sub(r's$', '', word) for word in txt_for_races]
print(set(txt_for_races))

{'black', 'african', 'caucasian', 'white', 'latino', 'indian', 'hispanic', 'asian'}
CPU times: user 4.57 s, sys: 0 ns, total: 4.57 s
Wall time: 4.57 s


<b>Words for organs</b>

In [28]:
%%time
txt_for_organs = [word.lower() for word in word_tokens if word.lower() in organs]
txt_for_organs = [re.sub(r's$', '', word) for word in txt_for_organs]
print(set(txt_for_organs))

{'ovary', 'pancrea', 'intestinal', 'liver', 'adrenal', 'stomach', 'renal', 'spleen', 'skin', 'heart', 'cervix', 'pancreatic', 'gastric', 'intestine', 'rectal', 'lung', 'thyroid', 'splenic', 'bone', 'rectum', 'prostate', 'cervical', 'cardiac', 'skeletal', 'spinal', 'parathyroid', 'ovarian', 'gallbladder', 'colon', 'brain', 'pituitary', 'breast', 'bladder', 'kidney', 'gland'}
CPU times: user 11 s, sys: 1 ms, total: 11 s
Wall time: 11 s


<b>Words that end with -oma(s)</b>

In [12]:
%%time
pattern1 = r"[A-Za-z]+omas?$"
txt_with_omas = [word.lower() for word in word_tokens if re.search(pattern1, word)]
txt_with_omas = [re.sub(r's$', '', word) for word in txt_with_omas]
print(set(txt_with_omas))

{'lymphoma', 'neurofibrosarcoma', 'granuloma', 'tumour–stroma', 'luteoma', 'insulinoma', 'nonmelanoma', 'paraganglioma', 'thoma', 'medullobastoma', 'brosarcoma', 'adenolymphoma', 'acanthoma', 'mixosarcoma', 'emangioendothelioma', 'antimelanoma', 'adenoma', 'osteoblastoma', 'hamartoma', 'mela¬noma', 'glaucoma', 'enchondroma', 'meyloma', 'rhabdomyoma', 'teratocarcinoma', 'glomangiopericytoma', '211aadenocarcinoma', 'microcarcinoma', 'ganglioglioma', 'choriocarcinoma', 'hidradenoma', 'neurilemoma', 'oligodendroglioma', 'melanoma', 'schwannoma', 'epithelioma', 'osteoma', 'neuroblastoma', 'humanpapilloma', 'microadenoma', 'keratocanthoma', 'myofibroma', 'histocytoma', 'cyneuroblastoma', 'histiocytoma', 'ganglioneuroblastoma', 'foimelanoma', 'dermatofibrosarcoma', 'methodsosteosarcoma', 'hygroma', 'fibroblastoma', 'mucosalmelanoma', 'neuroblatoma', 'oklahoma', 'oncocytoma', 'arcinoma', 'keratoadenoma', 'medulloblastoma', 'chondrosarcoma', 'hepatoma', 'anoma', 'imelanoma', 'neuroepithelioma',

<b>Words that end with -ia(s). NEED A GOOD WAY TO REMOVE COUNTRY NAMES!</b>

In [21]:
%%time
pattern2 = r"[A-Za-z]+[sm]?ias?$"
txt_with_sias = [word.lower() for word in word_tokens \
                 if re.search(pattern2, word)\
                 if word not in commoners \
                 if word.lower() not in ias_to_remove]
txt_with_sias = [re.sub(r's$', '', word) for word in txt_with_sias]
txt_with_sias = [re.sub(r'ae', 'e', word) for word in txt_with_sias]
txt_with_sias = [re.sub(r'\W', '', word) for word in txt_with_sias]
print(set(txt_with_sias))

{'escherichia', 'albuminuria', 'hyperphoshatemia', 'salgia', 'diphtheria', 'galicia', 'osteochondrodysplasia', 'myelodysplasia', 'dyscrasia', 'kemia', 'paronychia', 'osteopenia', 'anaplasia', 'primaria', 'ommatidia', 'heterotopia', 'chothia', 'teleangiectasia', 'paraganglia', 'dysrhythmia', 'neoplasia', 'fria', 'sharifnia', 'emmetropia', 'sitia', 'dyslipidemia', 'glycosuria', 'hyperplasia', 'myasthenia', 'dysphagia', 'hyperglycemia', 'neutrophilia', 'hyperprolactinemia', 'ectasia', 'solcia', 'polymicrogyria', 'chromobacteria', 'rhizomelia', 'tachycardia', 'bia', 'achrondroplasia', 'fridericia', 'leucopenia', 'telangectasia', 'primordia', 'genecopoeia', 'substantia', 'bacteremia', 'microphtalmia', 'billeria', 'absentia', 'platybasia', 'asphyxia', 'glycemia', 'hemophilia', 'incontinentia', 'hemimelia', 'vaccinia', 'hyperinsulinemia', 'hyposplenia', 'carpenteria', 'hematuria', 'braccia', 'patisia', 'honoraria', 'hypoadiponectinemia', 'aia', 'cumbria', 'eosinophilia', 'assaymitochondria', 

In [11]:
ias_to_remove = {'bolognia', 'lithuania', 'northumbria', 'pan‑asia', 'westphalia', 'yugoslavia', 'mejia', 'damia', 'sylvia', 'bhatia', 
 'carpintenia', 'sisodia', 'bia', 'arabia', 'catalonia', 'mdia', 'cynthia', 'xia', 'victoria', 'tunisia', 'oceania', 
 'farugia', 'australasia', 'cassia', 'arteria', 'casaccia', 'youjia', 'walia', 'cornelia', 'sarkaria', 'savoia', 'rsalgia',
  'macia', 'algeria', 'rangatia', 'sequoia', 'anodontia', 'bonavia', 'sanitaria', 'center–sophia', 'mangia', 'rozovskaia', 
  'georgia', 'indústria', 'austria', 'sotoodehnia', '6australia', 'biovia', 'virginia', 'valsesia', 'gallia', 'valencia', 'perugia', 
  'silvia', 'pennsylvania', 'philadelphia', 'tartaglia', 'behzadnia', 'wikipedia', 'garcia', 'sonia', 'hafsia', 'tolia', 'pavia', 
  'slovakia', 'elia', 'mathia', 'l6czechosiovakia', 'iglesia', 'giaccia', 'belandia', 'baldia', 'materia', 'tulia', 'eurasia', 
  'santamaria', 'italia', 'academia', 'sushia', 'soravia', 'gloria', 'bagrodia', 'india', 'caria', 'ethiopia', 'ansonia', 'maria', 
  'mtartaglia', 'dahia', 'australia', 'eugenia', 'catania', 'luria', 'coria', 'titia', 'mattia', 'consortia', '9philadelphia', 
  'ghia', 'sardinia', 'gaia', 'capinteria', 'terapia', 'faria', 'colombia', 'farrugia', 'via', 'tria', 'nigeria', '1ia', 'emilia',
  'vallania', 'california', 'sophia', 'farma´cia', 'patricia', 'santarpia', 'rumania', 'cristália', 'bosottia', 'mongia', 'gradia', 
  '177030criteria', 'aria', 'scotia', 'slovenia', 'lucia', 'bavaria', 'griffonia', 'nia', 'alia', 'jia', 'tasmania', 'scalia', 'natalia',
  'maia', 'tobia', 'garcdia', 'estonia', 'columbia', 'candia', 'criteria', 'monteia', 'attia', 'provia', 'sanabria', 'matthia', 'sibilia',
  'iberia', 'sebia', 'beria', 'ilia', 'tapia'}

<b>Words that end with -sis and -tis</b>

In [18]:
%%time
pattern3 = r"[A-Za-z]+i[sc]$"
txt_with_sis_tis = [word.lower() for word in word_tokens \
                    if re.search(pattern3, word) \
                    if word not in commoners \
                    if word.lower() not in ics_to_remove]
print(set(txt_with_sis_tis))

{'epigenesis', 'atrophic', 'kountourakis', 'osteolytic', 'prototypic', 'thrombosis', 'mll–af9–specific', 'ristic', 'radiographic', 'pallis', 'dysgenesis', 'oncologic', 'congenic', 'strouboulis', 'acetylneuraminic', 'noncrystallographic', 'agonistic', 'bourouis', 'psoriasis', 'antagonistic', 'organic', 'idiopathic', 'luikenhuis', 'heteromeric', 'nonpleomorphic', 'callelic', 'toxicogenetic', 'dnaanalysis', 'rac‐specific', 'kottaridis', 'hypophysitis', 'semilogarithmic', 'rhinitis', 'narcotic', 'mediastinic', 'mitotic', 'oligodeoxythymidylic', 'ectropic', 'macrocytic', 'sideroblastic', 'astrogenetic', 'mycophenolic', 'nonisotopic', 'mesoblastic', 'antiepileptic', 'nonhematopoietic', 'antic', 'tetrasomic', 'polyhedrosis', 'stoichiometric', 'histiocytosis', 'anis', 'fishanalysis', 'inotropic', 'asomatic', 'methodsgenomic', 'clinicopathologic', 'prosthetic', 'heterotopic', 'nic', 'epidemic', 'transgenomic', 'astrocytic', 'ysis', 'gametic', 'phase–specific', 'electrophillic', 'atherosclerotic

In [16]:
ics_to_remove = {'antarctic', 'electric', 'specific', 'nonspecific', 'this', 'volumetric', 'thesis', 'terrific', 'horrific', 'graphic',
                 'mechanic', 'mechanistic', 'basis', 'analysis'}

<b>Words that end with -cyte(s)</b>

In [25]:
%%time
pattern4 = r"[A-Za-z]+cytes?$"
txt_with_cytes = [word.lower() for word in word_tokens if re.search(pattern4, word)]
txt_with_cytes = [re.sub(r's$', '', word) for word in txt_with_cytes]
print(set(txt_with_cytes))

{'promonocyte', 'epitheliocyte', 'oligodrendrocyte', 'kocyte', 'nonkeratinocyte', 'thyrocyte', 'centrocyte', 'oncocyte', 'innocyte', 'poikilocyte', 'nevocyte', 'siderocyte', 'histiocyte', 'myelocyte', 'promyelocyte', 'adipocyte', 'plasmocyte', 'leucocyte', 'pneumocyte', 'nonlymphocyte', 'minigemistocyte', 'enterocyte', 'reticulocyte', 'podocyte', 'neurocyte', 'oocyte', 'osteocyte', 'cardiomiocyte', 'chondrocyte', 'granulocyte–monocyte', 'spleenocyte', 'erythromegakaryocyte', 'astrocyte', 'gonocyte', 'phocyte', 'incucyte', 'hepatocyte', 'pericyte', 'preadipocyte', 'splenocyte', 'leukocyte', 'cardiomyocyte', 'erythrocyte', 'monocyte', 'prolymphocyte', 'micromegakaryocyte', 'cholangiocyte', 'keratinocyte', 'synoviocyte', 'andlymphocyte', 'hemocyte', 'pneumonocyte', 'sebocyte', 'phagocyte', 'immunocyte', 'thrombocyte', 'metamyelocyte', 'colonocyte', 'anti–hepatocyte', 'myocyte', 'oligodendrocyte', 'incyte', 'megakarocyte', 'melanocyte', 'unit–granulocyte', 'granulocyte', 'megakaryocyte', '

<b>Words that end with -nib or -mab</b>

In [99]:
%%time
pattern5 = r"\w+nib$|\w+mab$"
txt_with_nib = [word.lower() for word in word_tokens if re.search(pattern5, word)]
print(set(txt_with_nib))

{'brentuximab', 'cellswasimatinib', 'dasitinib', 'preimatinib', 'gentuzumab', 'suntinib', '2—entrectinib', 'fostamatinib', 'tinib', '—gefitinib', 'crizotiinib', 'sulfatinib', 'lenvatinib', 'canertinib', 'post–erlotinib', '1—entrectinib', 'lucitanib', '+pertuzumab', 'motesanib', '+cobimetinib', 'geftinib', 'merestinib', 'trametinib', 'entrecitnib', 'imitinib', 'altiratinib', 'tremelimumab', '+imatinib', 'nintedanib', 'tipifarnib', 'neratinib+trastuzumab', 'egfr–afatinib', 'selumetinib', 'tivantinib', 'ceritinib', 'sorafenib+sunitinib', '+erlotinib', 'lumretuzumab', 'fedratinib', 'cobimetinib', 'tivatinib', 'tasocitinib', 'crenolanib', 'vatalanib', 'duligotuzumab', 'lestaurtinib', 'withimatinib', 'arms—erlotinib', 'dovitinib', 'toimatinib', 'trastuzumab+neratinib', 'postimatinib', 'desminib', 'imatinib', 'mgimatinib', 'rutuximab', 'dabrafenib–trametinib', 'pertuzumab', 'dacomitnib', 'alemtuzumab', 'crioztinib', 'erlotinib', 'g1202rcrizotinib', 'seluteminib', 'withdasatinib', 'tmab', 'ent

<b>Words that end with -ase(s)</b>

In [26]:
%%time
pattern8= r"\w+ases?$"
txt_with_ases = [word.lower() for word in word_tokens if re.search(pattern8, word)]
txt_with_ases = [re.sub(r's$', '', word) for word in txt_with_ases]
print(set(txt_with_ases))

{'γ‐secretase', 'glycosyltransferase', 'telophase', 'acetylglucosaminetransferase', 'asparaginase', 'integrase', 'sds–proteinase', '25–base', 'tryptase', 'lyase', 'acetylglucosaminidase', 'sialyltransferase', 'reductase', 'cyclooxygenase', 'single–base', 'optimase', '370base', '9–base', 'rho‐gtpase', 'hydroxlase', 'g1phase', 'retrotranscriptase', 'topoisomerase', 'aphase', 'fumarase', 'pseudouridylase', '1case', 'endoribonuclease', 'cyclase', 'mataphase', 'encase', 'mesophase', 'aconitase', 'ikinase', 'multikinase', 'taqpolymerase', 'ar–luciferase', 'yanase', 'translocase', 'anincrease', 'geranylgeranylprenyltransferase', 'ferrochelatase', 'transcarbamoylase', 'transketolase', 'endoglycosidase', 'phosphoglycerolkinase', 'antiperoxidase', '5′phosphatase', 'glycolsylase', 'amylase', 'bensonase', 'mutationspolymerase', 'brother—case', 'phosphokinase', 'allcase', 'hydroxlylase', 'transciptase', 'vbase', 'tease', 'database', 'thedatabase', 'anhydrase', 'desaturase', 'myeloperoxidase', 'exon

<b>Words that end with -blast(s)</b>

In [27]:
%%time
pattern9= r"\w+blasts?$"
txt_with_blasts = [word.lower() for word in word_tokens if re.search(pattern9, word)]
txt_with_blasts = [re.sub(r'\W', '', word) for word in txt_with_blasts]
txt_with_blasts = [re.sub(r's$', '', word) for word in txt_with_blasts]
print(set(txt_with_blasts))

{'enteroblast', 'erythroblast', 'preosteoblast', 'neuroblast', 'myeloblast', 'spongiotrophoblast', 'megablast', 'p53y220sfibroblast', 'mutantfibroblast', 'monoblast', 'sideroblast', 'conditionsfibroblast', 'myoblast', 'retinoblast', 'fractionsfibroblast', 'melanoblast', 'rhabdomyoblast', 'thesefibroblast', 'igblast', 'epiblast', 'hepatoblast', 'odontoblast', 'fibroblastmyofibroblast', 'myelomonoblast', 'osteoblast', 'lymphoblast', 'murinefibroblast', 'chondroblast', 'lipoblast', 'haemangioblast', 'myofibroblast', 'immunoblast', 'ﬁbroblast', 'broblast', 'fibroblast', 'centroblast', 'plasmablast', 'proerythroblast', 'trophoblast', 'differentfibroblast', 'cytotrophoblast'}
CPU times: user 16.3 s, sys: 977 µs, total: 16.3 s
Wall time: 16.3 s


<b>Words that stats with epiderm-, endothel-, onco-, hepato-, h(a)emato-</b>

In [23]:
%%time
prefix_pattern1= r"^epiderm|^endothel|^onco|^hepato|^ha?emato|^osteo"
txt_with_prefixes1 = [word.lower() for word in word_tokens if re.search(prefix_pattern1, word)]
print(set(txt_with_prefixes1))

{'onco', 'oncogenek', 'haematology', 'oncogeneic', 'osteolytic', 'oncogene', 'oncogenefor', 'oncomutants', 'osteodysplastic', 'osteoid', 'hepatocellular24', 'endothelial', 'hepatocarcinogenesis—ex', 'osteochondrodysplasia', 'osteopaths', 'oncogenesis,34,40,43,46,47', 'oncologic', 'endothelia', 'osteoradionecrosis', 'oncoantigens149', 'oncocytic', 'osteosarcoma134', 'osteoblastoma', 'osteopontin', 'osteopenia', 'hepatosplenomegaly', 'hepatoslenomegaly', 'epidermis', 'oncologist', 'hepato', 'oncologically', 'oncogenes2', 'osteoprogenitors', 'hepatocyte', 'oncosuppressor', 'hepatoblast', 'hematopoietic24', 'oncogenic', 'hepatomegaly', 'osteoblast', 'osteoclastogenesis', 'hematocytometer', 'oncosystem', 'oncosnp', 'oncomine', 'osteonectin', 'osteotomy', 'oncofetal', 'oncogene29,30', 'hematoxilyn', 'oncocytoma', 'oncosuppressive', 'hepatoma', 'osteogenesis', 'osteocytes', 'oncogens', 'oncogeni', 'osteoporotic', 'osteomas', 'oncogenethe', 'osteoporosis', 'hepatocellular', 'osteoclasts', 'onc

In [103]:
%%time
prefix_pattern2 = r"^glyco|^phospho|^ubiquityl|^ubiquitinat|^acetyl|^methyl|^deamin|^oxydat|^hyper|^hypo"
txt_with_prefixes2 = [word for word in word_tokens \
                      if re.search(prefix_pattern2, word) \
                      if word not in commoners]
print(set(txt_with_prefixes2))

{'phosphoEGFR', 'hypomineralization', 'phosphorylate', 'methylsulfonylethylamino', 'phosphorylation4', 'hyporexia', 'hypophysial', 'hypochromic', 'hypochondrium', 'phosphoFLT3', 'phosphoin', 'hypomethylator', 'phosphoepitopes', 'hypodiploid', 'phosphorylation23', 'phospholipid', 'hypocholesterolemia', 'phosphomimicry', 'hypothyroidism,19', 'glycosyl', 'methylazoxymethanol', 'phospholipase', 'glycoslylated', 'methylhisFIGURE', 'phosphotidlyserine', 'hypokalemia', 'phosphoresidue', 'phospho–ERK1', 'acetylases', 'phosphothioate', 'hypopigmentation', 'methylating', 'phospho‐', 'methyldeoxycytidine', 'methylationsensitive', 'methylphenyl', 'methylum', 'phosphory\xad', 'hyposecretory', 'phosphoThr', 'phosphorylation8', 'methyltransferases', 'ubiquitylate', 'phosphorylation86', 'hypothalamic', 'methylphenazinium', 'methylation', 'phosphonacetyl', 'phosphotyrosinespecific', 'phosphorelated', 'methylation35', 'acetylation17', 'phosphopeptides', 'phosphoinoistides', 'phosphor', 'ubiquitylation15