In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import spacy



In [2]:
# Plain bag of words: with the default ngram_range=(1, 1) every vocabulary
# entry is a single token. `vocabulary_` maps each term to its column index.
text = ["Dennis is drumming at the wedding"]
vec = CountVectorizer().fit(text)
vec.vocabulary_

{'dennis': 1, 'is': 3, 'drumming': 2, 'at': 0, 'the': 4, 'wedding': 5}

In [3]:
# Bigrams only: ngram_range=(2, 2) keeps just pairs of adjacent tokens.
text = ["Dennis is drumming at the wedding"]
vec = CountVectorizer(ngram_range=(2, 2)).fit(text)
vec.vocabulary_

{'dennis is': 1,
 'is drumming': 3,
 'drumming at': 2,
 'at the': 0,
 'the wedding': 4}

In [4]:
# Unigrams and bigrams together: ngram_range=(1, 2).
text = ["Dennis is drumming at the wedding"]
vec = CountVectorizer(ngram_range=(1, 2)).fit(text)
vec.vocabulary_

{'dennis': 2,
 'is': 6,
 'drumming': 4,
 'at': 0,
 'the': 8,
 'wedding': 10,
 'dennis is': 3,
 'is drumming': 7,
 'drumming at': 5,
 'at the': 1,
 'the wedding': 9}

In [5]:
# ngram_range=(1, 3) covers every n from 1 to 3, so the vocabulary holds
# unigrams, bigrams AND trigrams (not only unigrams plus trigrams).
text = ["Dennis is drumming at the wedding"]
vec = CountVectorizer(ngram_range=(1, 3)).fit(text)
vec.vocabulary_

{'dennis': 3,
 'is': 9,
 'drumming': 6,
 'at': 0,
 'the': 12,
 'wedding': 14,
 'dennis is': 4,
 'is drumming': 10,
 'drumming at': 7,
 'at the': 1,
 'the wedding': 13,
 'dennis is drumming': 5,
 'is drumming at': 11,
 'drumming at the': 8,
 'at the wedding': 2}

In [6]:
# Small toy corpus: one raw sentence per entry.
doc = [
    "David drank wine",
    "Mike is young",
    "Phillip is drinking wine",
]

In [7]:
# Load the spaCy pipeline 'en_core_web_sm' once; preprocess() below relies on it
# to tokenise text, flag stop words / punctuation and supply lemmas.
nlp = spacy.load('en_core_web_sm')
def preprocess(text: str) -> str:
    """Return ``text`` reduced to the lemmas of its content tokens.

    The text is run through the notebook-level spaCy pipeline ``nlp``.
    Stop words, punctuation and whitespace-only tokens are dropped; every
    remaining token contributes its ``lemma_``.

    Args:
        text: Raw input string.

    Returns:
        The surviving lemmas joined by single spaces, or an empty string
        when no token survives the filter.
    """
    # Local name kept distinct from the notebook-level `doc` list of sentences.
    parsed = nlp(text)
    lemmas = [
        token.lemma_
        for token in parsed
        # is_space: whitespace-only tokens are neither stop words nor
        # punctuation, so without this check they would end up in the
        # joined output.
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(lemmas)
    

In [8]:
# Spot check: no stop words here; 'drank' should come back as its lemma 'drink'.
preprocess('David drank wine')

'David drink wine'

In [9]:
# Spot check: the stop word 'is' should be removed.
preprocess('Mike is young')

'Mike young'

In [10]:
# Spot check: 'is' is dropped and 'drinking' is lemmatised to 'drink'.
# NOTE: in the recorded output the lemma for 'Phillip' comes back lower-cased,
# unlike 'David' and 'Mike' above; casing is whatever spaCy's lemma_ returns.
preprocess('Phillip is drinking wine')

'phillip drink wine'

In [11]:
# Run every sentence of the corpus through preprocess(), keeping the order.
doc_processed = list(map(preprocess, doc))
doc_processed

['David drink wine', 'Mike young', 'phillip drink wine']

In [12]:
# Bag of n-grams over the preprocessed corpus: ngram_range=(1, 2) learns
# unigrams as well as bigrams (not bigrams alone).
v = CountVectorizer(ngram_range=(1, 2)).fit(doc_processed)
v.vocabulary_

{'david': 0,
 'drink': 2,
 'wine': 8,
 'david drink': 1,
 'drink wine': 3,
 'mike': 4,
 'young': 9,
 'mike young': 5,
 'phillip': 6,
 'phillip drink': 7}

In [13]:
# Convert one preprocessed sentence to a count vector with the fitted n-gram
# model; the result is a sparse matrix (1 row x 10 vocabulary columns here).
v.transform(['David drink wine'])

<1x10 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [14]:
# Same vector, converted to a dense array so the individual counts are visible.
v.transform(['David drink wine']).toarray()

array([[1, 1, 1, 1, 0, 0, 0, 0, 1, 0]], dtype=int64)

In [15]:
# Vectorise each preprocessed sentence on its own: the result is a list with
# one dense 1 x 10 count array per sentence, not a single stacked matrix.
doc_vector = [
    v.transform([sentence]).toarray()
    for sentence in doc_processed
]
doc_vector

[array([[1, 1, 1, 1, 0, 0, 0, 0, 1, 0]], dtype=int64),
 array([[0, 0, 0, 0, 1, 1, 0, 0, 0, 1]], dtype=int64),
 array([[0, 0, 1, 1, 0, 0, 1, 1, 1, 0]], dtype=int64)]