In [9]:
import os
import pickle
import pandas as pd
import spacy
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import scipy
from spacy.vectors import Vectors
from spacy.strings import StringStore
from scipy.cluster.hierarchy import dendrogram, linkage
from wordcloud import WordCloud
from nltk.corpus import stopwords

from six import iteritems
from gensim import corpora
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from retrieve_data import get_descriptive_fields

%matplotlib inline

In [10]:
os.getcwd()

'/home/ryohayama/Python/b13logy/nlp'

In [11]:
# Retrive with sql query
df = get_descriptive_fields(num_rows=6000, db_file='../database_files/b13logy.ini')

In [12]:
df.head(2)

Unnamed: 0,dataset_id,sample_protocol,data_protocol,description,instruments,exp_types,quant_methods,labhead_fullname
0,PXD000001,Not available,Two extra files have been added post-publicati...,Expected reporter ion ratios: Erwinia peptides...,"LTQ Orbitrap Velos, instrument model",Bottom-up proteomics,,
1,PXD000002,Not available,Not available,Human saliva samples from a 36 year old male w...,"instrument model, LTQ Velos",Bottom-up proteomics,,


In [13]:
len(df)

5452

In [14]:
# Get descriptive text
sample_proto = df['sample_protocol'].dropna()
sample_proto = sample_proto[sample_proto != 'Not available']

data_proto = df['data_protocol'].dropna()
data_proto = data_proto[data_proto != 'Not available']

descrip = df['description'].dropna()

corpus = list(sample_proto) + list(data_proto) + list(descrip)

In [15]:
corpus[:5]

['The crude membranes from 5 P56-P70 Glun1TAP/TAP mouse forebrains were re-suspended in 12.5\u2009ml buffer H and extracted with 12.5\u2009ml 2% deoxycholate, 100\u2009mM NaCl, 50\u2009mM Tris.Cl pH8 for 1\u2009h at 6\u2009°C. Total extract was centrifuged at 120,000g. for 40\u2009min at 8\u2009°C. Conditions for immuno-capture, wash and peptide-antigen exchange elution were screened using a high-throughput purification robot (MAGic sample processor, Invitrogen). For 25\u2009ml Glun1TAP/TAP extract supernatant, 80\u2009μg mouse Flag antibody was coupled to 30\u2009mg (500\u2009μl) protein G magnetic beads (Invitrogen). Receptor was captured from extract supernatant for 2\u2009h. The beads were washed three times with 5\u2009ml wash buffer (0.37% w/v sodium deoxycholate, 0.05\u2009mg.ml−1 lipids (1:1:3 POPC:POPE:POG), 150\u2009mM NaCl, 50\u2009mM Tris.Cl pH8). Flag captured complexes were eluted with 600\u2009μl wash buffer supplemented with 0.2\u2009mg.ml−1 Flag peptide for 2\u2009h at

In [16]:
nlp = spacy.load('en_core_web_lg')

In [17]:
def lemmatize_text(text):
    doc = nlp(text)
    lemm_text = [
            token.lemma_.lower() for token in doc 
            if token.text != '\u2009'
            if not token.is_stop
            if not token.is_punct
            if not token.is_bracket
            if not token.is_quote
            if token.pos_ != 'PRON'
            if token.tag_ != 'BES'
            if token.tag_ != 'IN'
            if token.tag_ !='HVS'
            if token.tag_ !='PDT'
            if token.tag_ !='TO'
            if token.tag_ !='UH'
            
    ]
    return lemm_text

def lemmatize_corpus(corpus):
    lemm_corpus = []
    for text in corpus:
        lemm_text = lemmatize_text(text)
        lemm_corpus.append(lemm_text)
    return lemm_corpus

In [18]:
%%time
lemm_corpus = lemmatize_corpus(corpus)

CPU times: user 49min 58s, sys: 47min 56s, total: 1h 37min 54s
Wall time: 12min 32s


In [19]:
corpus[:2]

['The crude membranes from 5 P56-P70 Glun1TAP/TAP mouse forebrains were re-suspended in 12.5\u2009ml buffer H and extracted with 12.5\u2009ml 2% deoxycholate, 100\u2009mM NaCl, 50\u2009mM Tris.Cl pH8 for 1\u2009h at 6\u2009°C. Total extract was centrifuged at 120,000g. for 40\u2009min at 8\u2009°C. Conditions for immuno-capture, wash and peptide-antigen exchange elution were screened using a high-throughput purification robot (MAGic sample processor, Invitrogen). For 25\u2009ml Glun1TAP/TAP extract supernatant, 80\u2009μg mouse Flag antibody was coupled to 30\u2009mg (500\u2009μl) protein G magnetic beads (Invitrogen). Receptor was captured from extract supernatant for 2\u2009h. The beads were washed three times with 5\u2009ml wash buffer (0.37% w/v sodium deoxycholate, 0.05\u2009mg.ml−1 lipids (1:1:3 POPC:POPE:POG), 150\u2009mM NaCl, 50\u2009mM Tris.Cl pH8). Flag captured complexes were eluted with 600\u2009μl wash buffer supplemented with 0.2\u2009mg.ml−1 Flag peptide for 2\u2009h at

In [20]:
lemm_corpus[:2]

[['the',
  'crude',
  'membrane',
  '5',
  'p56-p70',
  'glun1tap',
  'tap',
  'mouse',
  'forebrain',
  'be',
  're',
  'suspend',
  '12.5',
  'ml',
  'buffer',
  'h',
  'and',
  'extract',
  '12.5',
  'ml',
  '2',
  'deoxycholate',
  '100',
  'mm',
  'nacl',
  '50',
  'mm',
  'tris',
  'cl',
  'ph8',
  '1',
  'h',
  '6',
  '°',
  'c',
  'total',
  'extract',
  'be',
  'centrifuge',
  '120,000',
  'g',
  '40',
  'min',
  '8',
  '°',
  'c',
  'condition',
  'immuno',
  'capture',
  'wash',
  'and',
  'peptide',
  'antigen',
  'exchange',
  'elution',
  'be',
  'screen',
  'use',
  'a',
  'high',
  'throughput',
  'purification',
  'robot',
  'magic',
  'sample',
  'processor',
  'invitrogen',
  '25',
  'glun1tap',
  'tap',
  'extract',
  'supernatant',
  '80',
  'μg',
  'mouse',
  'flag',
  'antibody',
  'be',
  'couple',
  '30',
  'mg',
  '500',
  'protein',
  'g',
  'magnetic',
  'bead',
  'invitrogen',
  'receptor',
  'be',
  'capture',
  'extract',
  'supernatant',
  '2',
  'h.',
 

In [21]:
outfile1 = open('nlp09_data/nlp09_original_corpus.pickle', 'wb')
pickle.dump(corpus, outfile1)
outfile1.close()

outfile2 = open('nlp09_data/nlp09_lemmatized_corpus.pickle', 'wb')
pickle.dump(lemm_corpus, outfile2)
outfile2.close()

In [14]:
# Create dictionary for the lemmatized_corpus
dictionary = corpora.Dictionary(lemm_corpus)

In [15]:
stop_words = stopwords.words('english')
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
stop_ids = [dictionary.token2id[stopword] for stopword in stop_words
                   if stopword in dictionary.token2id]

In [16]:
dictionary.filter_tokens(stop_ids + once_ids)
dictionary.compactify()

In [17]:
dictionary.save('serialized_data/nlp09_lemmatized_corpus.dict')

In [18]:
# Create BoW
bow_corpus = [dictionary.doc2bow(text) for text in lemm_corpus]

In [19]:
corpora.MmCorpus.serialize('serialized_data/nlp09_bow_corpus.mm', bow_corpus)

In [20]:
print(dictionary)

Dictionary(27077 unique tokens: ['0.05', '0.2', '1', '10', '100']...)
