In [1]:
import pandas as pd
import re
import numpy as np
import pickle
import gc

In [2]:
from nltk.corpus import stopwords
import spacy

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [4]:
# Load the pickled frame used for n-gram extraction.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open('nlp16/df4ngrams.pickle', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [5]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,silac,ms1_label_free,spectrum_counting,tmt,itraq,label_free,sample_protocol,data_protocol,description
0,0,0,0,0,0,0,"breast cancer tissue lysate, reduction, alkyla...",proteomic datum analysis : proteome discoverer...,current prognostic factor be insufficient for ...
1,0,0,0,0,0,0,protein extraction from catheter biofilm small...,mass spectrometric analyse the tryptic digest ...,long term-catheterization lead inevitably to a...
2,0,0,0,0,0,0,generation of mdia2-based immunocomplexes and ...,ms data analysis peak list be generate from ...,mdia2 be an auto-inhibit formin influence acti...
3,0,0,0,0,0,0,naïve cd4 + t cell be isolate use magnetic bea...,ms datum be process use proteome discoverer 1....,"technological advance in genomic, epigenomic, ..."
4,0,0,0,0,0,0,cell be culture for 24 hour in serum free dmem...,raw datum be process use maxquant v1.5 and per...,the project profile the expression pattern in ...


In [6]:
# Split the frame: the three free-text columns are the features,
# every remaining column is treated as a label.
text_columns = ['sample_protocol', 'data_protocol', 'description']
text_features = df[text_columns]
labels = df.drop(columns=text_features.columns)

In [7]:
# Preview the first two rows of the text columns.
text_features.head(n=2)

Unnamed: 0,sample_protocol,data_protocol,description
0,"breast cancer tissue lysate, reduction, alkyla...",proteomic datum analysis : proteome discoverer...,current prognostic factor be insufficient for ...
1,protein extraction from catheter biofilm small...,mass spectrometric analyse the tryptic digest ...,long term-catheterization lead inevitably to a...


In [8]:
# Preview the first two rows of the label columns.
labels.head(n=2)

Unnamed: 0,silac,ms1_label_free,spectrum_counting,tmt,itraq,label_free
0,0,0,0,0,0,0
1,0,0,0,0,0,0


#### Transform into Count and Tf-Idf vector spaces 1: sample_protocol 

In [9]:
# Both vectorizers share one configuration: word-level 1- to 3-grams,
# ASCII accent stripping and the built-in English stop-word list.
vectorizer_options = dict(
    analyzer='word',
    strip_accents='ascii',
    ngram_range=(1, 3),
    stop_words='english',
)
count_vectorizer = CountVectorizer(**vectorizer_options)
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)

In [10]:
%%time
count_sample = count_vectorizer.fit_transform(np.array(text_features['sample_protocol']))
tfidf_sample = tfidf_vectorizer.fit_transform(np.array(text_features['sample_protocol']))

CPU times: user 4.08 s, sys: 65.3 ms, total: 4.15 s
Wall time: 4.16 s


In [11]:
# Read the shape straight from the sparse matrices; calling .toarray() first
# would allocate two full dense copies only to discard them.
count_sample.shape, tfidf_sample.shape

((2387, 436721), (2387, 436721))

In [12]:
def vectorize_corpus(corpus, mode='count'):
    """Vectorize a text corpus into word n-gram features that contain no digits.

    The corpus is transformed with word-level 1- to 3-grams, ASCII accent
    stripping and the built-in English stop-word list. Every n-gram whose
    text contains a digit is then dropped.

    Parameters
    ----------
    corpus : iterable of str
        Documents to vectorize (the notebook passes a DataFrame text column).
    mode : str, default 'count'
        'tfidf' selects ``TfidfVectorizer``; any other value selects
        ``CountVectorizer``.

    Returns
    -------
    pandas.DataFrame
        Dense frame with one row per document and one column per retained
        n-gram, named after the n-gram.
    """
    if mode == 'tfidf':
        vectorizer_mode = 'TfidfVectorizer'
        vectorizer_cls = TfidfVectorizer
    else:
        vectorizer_mode = 'CountVectorizer'
        vectorizer_cls = CountVectorizer
    vectorizer = vectorizer_cls(analyzer='word', strip_accents='ascii', ngram_range=(1, 3), stop_words='english')
    print("Transforming corpus with %s ..." % vectorizer_mode)
    vectorized = vectorizer.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = list(vectorizer.get_feature_names())
    print("Initial shape after transformation:", vectorized.shape)

    # Remove columns that have digits
    print("Removing columns with digits ...")
    keep_mask = np.fromiter(
        (re.search(r'\d', name) is None for name in feature_names),
        dtype=bool,
        count=len(feature_names),
    )
    kept_names = [name for name, keep in zip(feature_names, keep_mask) if keep]

    # Select columns while the matrix is still sparse so that only the
    # retained columns are ever materialised as a dense array.
    features = pd.DataFrame(vectorized[:, keep_mask].toarray(), columns=kept_names)
    print("Shape after removing digit columns:", features.shape)

    return features

In [13]:
%%time
df = vectorize_corpus(text_features['sample_protocol'], mode='count')

Transforming corpus with CountVectorizer ...
Initial shape after transformation: (2387, 436721)
Removing columns with digits ...
Shape after removing digit columns: (2387, 292702)
CPU times: user 4.44 s, sys: 1.93 s, total: 6.37 s
Wall time: 6.38 s


In [14]:
%%time
df = vectorize_corpus(text_features['sample_protocol'], mode='tfidf')

Transforming corpus with TfidfVectorizer ...
Initial shape after transformation: (2387, 436721)
Removing columns with digits ...
Shape after removing digit columns: (2387, 292702)
CPU times: user 4.48 s, sys: 2.38 s, total: 6.86 s
Wall time: 6.87 s


In [15]:
# Drop the reference held by `df`, then force a garbage-collection pass.
# The cell output is the return value of gc.collect().
df = None
gc.collect()

374

### CountVectorizer

In [18]:
# Build count n-gram features for sample_protocol, append the label columns
# and write the combined frame to CSV.
sample_df = vectorize_corpus(text_features['sample_protocol'], mode='count')
# NOTE(review): concat(axis=1) aligns rows on the index; this assumes `labels`
# carries the default 0..n-1 index -- confirm.
df = pd.concat([sample_df, labels], axis=1)
df.to_csv('nlp16/ngram_sample_df.csv', index=False)

# Release both large frames; leaving `df` bound would keep the combined frame
# alive while the next corpus is vectorized.
sample_df = None
df = None
gc.collect()

Transforming corpus with CountVectorizer ...
Initial shape after transformation: (2387, 436721)
Removing columns with digits ...
Shape after removing digit columns: (2387, 292702)


KeyboardInterrupt: 

In [None]:
# Build count n-gram features for data_protocol, append the label columns
# and write the combined frame to CSV.
data_df = vectorize_corpus(text_features['data_protocol'], mode='count')
# NOTE(review): concat(axis=1) aligns rows on the index; this assumes `labels`
# carries the default 0..n-1 index -- confirm.
df = pd.concat([data_df, labels], axis=1)
df.to_csv('nlp16/ngram_data_df.csv', index=False)

# Release both large frames; leaving `df` bound would keep the combined frame
# alive while the next corpus is vectorized.
data_df = None
df = None
gc.collect()

In [None]:
# Build count n-gram features for description, append the label columns
# and write the combined frame to CSV.
descript_df = vectorize_corpus(text_features['description'], mode='count')
# NOTE(review): concat(axis=1) aligns rows on the index; this assumes `labels`
# carries the default 0..n-1 index -- confirm.
df = pd.concat([descript_df, labels], axis=1)
df.to_csv('nlp16/ngram_descriptive_df.csv', index=False)

# Release both large frames once the CSV has been written.
descript_df = None
df = None
gc.collect()

In [None]:
# Drop the last reference held by `df`, then force a garbage-collection pass.
df = None
gc.collect()