Use the retrieval function `get_descriptive_fields`, which queries the descriptive texts and their categorical attributes from the PX database (on Postgres). Keep only the descriptive text columns, combine them, split the text into sentences, and store the sentences in a list. Finally, save the list as a pickle object.

In [1]:
import os
import pandas as pd
import spacy
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import scipy
from spacy.vectors import Vectors
from spacy.strings import StringStore
from scipy.cluster.hierarchy import dendrogram, linkage
from wordcloud import WordCloud
from nltk.corpus import stopwords

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

from retrieve_data import get_descriptive_fields

%matplotlib inline

In [2]:
# Show the current working directory; the data-loading cell passes a relative db_file path.
os.getcwd()

'/home/ryohayama/Python/b13logy/nlp'

In [3]:
# Retrieve the descriptive fields with a SQL query (num_rows=6000).
# NOTE(review): db_file presumably holds the database connection settings — confirm in retrieve_data.
df = get_descriptive_fields(num_rows=6000, db_file='../database_files/b13logy.ini')

In [4]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,dataset_id,sample_protocol,data_protocol,description,instruments,exp_types,quant_methods,labhead_fullname
0,PXD000001,Not available,Two extra files have been added post-publicati...,Expected reporter ion ratios: Erwinia peptides...,"LTQ Orbitrap Velos, instrument model",Bottom-up proteomics,,
1,PXD000002,Not available,Not available,Human saliva samples from a 36 year old male w...,"instrument model, LTQ Velos",Bottom-up proteomics,,
2,PXD000003,,,,,,,
3,PXD000004,Not available,Not available,Prefrontal cortex tissue slices from a cogniti...,"instrument model, Q Exactive",Bottom-up proteomics,,
4,PXD000005,Not available,Not available,Proteomics on HCT116 cells. 3 samples wild-typ...,LTQ Orbitrap Velos,Bottom-up proteomics,,
5,PXD000006,Not available,Not available,"15-deoxy-delta12,14-prostaglandin J2 (15d-PGJ2...","LTQ Orbitrap, instrument model",Bottom-up proteomics,,
6,PXD000008,Not available,Not available,2.3 Cell wall and secretome preparation. C. ...,"instrument model, Q-Tof ultima",Bottom-up proteomics,,
7,PXD000009,Not available,Not available,Top down identification of proteins detected b...,LTQ Orbitrap,Bottom-up proteomics,,
8,PXD000011,The crude membranes from 5 P56-P70 Glun1TAP/TA...,"Data-dependent analysis was carried out, using...","TAP-GluN1 (840 kDa and 1.5 MDa), PSD95-TAP (1....","LTQ Orbitrap, instrument model",Bottom-up proteomics,,Seth Grant
9,PXD000012,Not available,Not available,Enlarged white adipose tissue (WAT) is a featu...,instrument model,Bottom-up proteomics,,


In [5]:
# Get descriptive text
MISSING_TEXT = 'Not available'  # placeholder that appears in the text columns instead of real content


def _drop_missing(texts):
    """Return ``texts`` without NaN entries and without the placeholder value."""
    present = texts.dropna()
    return present[present != MISSING_TEXT]


sample_proto = _drop_missing(df['sample_protocol'])
data_proto = _drop_missing(df['data_protocol'])
# The description column is filtered the same way as the protocol columns so
# the placeholder string never ends up in the corpus as if it were real text.
descrip = _drop_missing(df['description'])

# Order is preserved: sample protocols, then data protocols, then descriptions.
descriptive_text = list(sample_proto) + list(data_proto) + list(descrip)

In [7]:
# Peek at the first five collected texts.
descriptive_text[:5]

['The crude membranes from 5 P56-P70 Glun1TAP/TAP mouse forebrains were re-suspended in 12.5\u2009ml buffer H and extracted with 12.5\u2009ml 2% deoxycholate, 100\u2009mM NaCl, 50\u2009mM Tris.Cl pH8 for 1\u2009h at 6\u2009°C. Total extract was centrifuged at 120,000g. for 40\u2009min at 8\u2009°C. Conditions for immuno-capture, wash and peptide-antigen exchange elution were screened using a high-throughput purification robot (MAGic sample processor, Invitrogen). For 25\u2009ml Glun1TAP/TAP extract supernatant, 80\u2009μg mouse Flag antibody was coupled to 30\u2009mg (500\u2009μl) protein G magnetic beads (Invitrogen). Receptor was captured from extract supernatant for 2\u2009h. The beads were washed three times with 5\u2009ml wash buffer (0.37% w/v sodium deoxycholate, 0.05\u2009mg.ml−1 lipids (1:1:3 POPC:POPE:POG), 150\u2009mM NaCl, 50\u2009mM Tris.Cl pH8). Flag captured complexes were eluted with 600\u2009μl wash buffer supplemented with 0.2\u2009mg.ml−1 Flag peptide for 2\u2009h at

Use spaCy to split the texts into sentences

In [8]:
# Load the spaCy 'en_core_web_lg' pipeline; process_descriptive_text uses it to split text into sentences.
nlp = spacy.load('en_core_web_lg')

In [9]:
def process_descriptive_text(corpus, nlp_model=None):
    """Split every text in ``corpus`` into sentences and return one flat list.

    Args:
        corpus: Iterable of strings to segment.
        nlp_model: Optional spaCy-style pipeline exposing ``pipe(texts)`` whose
            documents provide ``sents``. When omitted, the notebook-level
            ``nlp`` object is used, so existing calls keep working unchanged.

    Returns:
        list: The ``text`` of every sentence, in corpus order.
    """
    pipeline = nlp if nlp_model is None else nlp_model
    # pipe() streams the texts through the pipeline in batches instead of
    # invoking the full pipeline once per document.
    return [sent.text for doc in pipeline.pipe(corpus) for sent in doc.sents]

In [11]:
# Split every collected text into sentences, gathered into one flat list.
sentences = process_descriptive_text(descriptive_text)

In [19]:
# Preview the first ten sentences.
# NOTE(review): the output below shows splits inside "Tris.Cl" and after "120,000g.",
# so sentence boundaries in this kind of lab-protocol text are not always reliable.
sentences[:10]

['The crude membranes from 5 P56-P70 Glun1TAP/TAP mouse forebrains were re-suspended in 12.5\u2009ml buffer H and extracted with 12.5\u2009ml 2% deoxycholate, 100\u2009mM NaCl, 50\u2009mM Tris.',
 'Cl pH8 for 1\u2009h at 6\u2009°C.',
 'Total extract was centrifuged at 120,000g.',
 'for 40\u2009min at 8\u2009°C.',
 'Conditions for immuno-capture, wash and peptide-antigen exchange elution were screened using a high-throughput purification robot (MAGic sample processor, Invitrogen).',
 'For 25\u2009ml Glun1TAP/TAP extract supernatant, 80\u2009μg mouse Flag antibody was coupled to 30\u2009mg (500\u2009μl)',
 'protein G magnetic beads (Invitrogen).',
 'Receptor was captured from extract supernatant for 2\u2009h.',
 'The beads were washed three times with 5\u2009ml wash buffer (0.37% w/v sodium deoxycholate, 0.05\u2009mg.ml−1 lipids (1:1:3',
 'POPC:POPE:POG), 150\u2009mM NaCl, 50\u2009mM Tris.']

Save the sentence list as a pickle object

In [12]:
import pickle

In [18]:
# Write the sentence list to disk; the context manager closes the file even
# if pickling raises part-way through.
with open('descriptive_text.pickle', 'wb') as outfile:
    pickle.dump(sentences, outfile)