In [1]:
import nltk
import string
from contractions import CONTRACTION_MAP
import re 
from nltk.stem import WordNetLemmatizer

# Text Normalization


In [2]:
# Expanding contractions
# Text standardization through lemmatization
# Removing special characters and symbols
# Removing stopwords

In [3]:
# English stopword list from NLTK's stopwords corpus; read by remove_stopwords.
# NOTE(review): presumably requires the NLTK 'stopwords' data to be downloaded
# beforehand -- confirm in a fresh environment.
stopword_list = nltk.corpus.stopwords.words('english')
# Single shared WordNet lemmatizer instance; read by lemmatize_text.
wnl = WordNetLemmatizer()

In [4]:
# function tokenizes and removes any extraneous whitespace from the tokens

In [5]:
def tokenize_text(text):
    """Split ``text`` into word tokens and trim whitespace around each one.

    Tokenization is delegated to ``nltk.word_tokenize``; every resulting
    token is passed through ``str.strip``.

    Returns:
        list of token strings, in the order the tokenizer produced them.
    """
    return [raw_token.strip() for raw_token in nltk.word_tokenize(text)]

In [6]:
# function for expanding contractions

In [7]:
def expand_contractions(text,contraction_mapping):
    """Replace contractions in ``text`` with their expanded forms.

    Args:
        text: input string.
        contraction_mapping: dict mapping a contraction (e.g. ``"can't"``)
            to its expansion (e.g. ``"cannot"``). Keys are treated as
            literal text, matched case-insensitively.

    Returns:
        ``text`` with every known contraction expanded and all remaining
        apostrophes removed.

    The first letter of an expansion is upper-cased when the matched
    contraction started with an upper-case letter; otherwise the expansion
    is inserted exactly as given in the mapping.
    """
    # Lower-cased view of the mapping so that any case-insensitive match
    # (e.g. "i'm" against the key "I'm") can always be resolved.
    lowercase_mapping = {key.lower(): value
                         for key, value in contraction_mapping.items()}

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (contraction_mapping.get(match)
                                or lowercase_mapping.get(match.lower()))
        if not expanded_contraction:
            # Nothing usable to substitute: leave the matched text alone.
            return match
        # Only transfer capitalisation. Splicing the matched first character
        # onto the expansion corrupts entries whose expansion starts with a
        # different letter (e.g. "ain't" -> "is not").
        if match[0].isupper():
            expanded_contraction = (expanded_contraction[0].upper()
                                    + expanded_contraction[1:])
        return expanded_contraction

    # Longest keys first: regex alternation takes the first alternative that
    # matches, so "he'd" must not be tried before "he'd've". Empty keys are
    # dropped because they would match at every position.
    alternatives = sorted((key for key in contraction_mapping if key),
                          key=len, reverse=True)
    expanded_text = text
    if alternatives:
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in alternatives)),
            flags = re.IGNORECASE|re.DOTALL)
        expanded_text = contractions_pattern.sub(expand_match,text)
    # Drop any apostrophes that were not part of a known contraction.
    expanded_text = re.sub("'","",expanded_text)
    return expanded_text

In [8]:
# function for lemmatization

In [9]:
from pattern.en import tag
from nltk.corpus import wordnet as wn

In [10]:
def pos_tag_text(text):
    """Tag ``text`` with parts of speech, translated to WordNet POS constants.

    ``tag`` (imported from ``pattern.en``) is called on the raw text and is
    expected to yield ``(word, pos_tag)`` pairs.

    Returns:
        list of ``(lowercased_word, wordnet_pos_or_None)`` tuples.
    """
    def penn_to_wn_tags(pos_tag):
        """Map a tag by its first letter to a WordNet POS constant, else None."""
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None

    # These statements must live in pos_tag_text itself: placed after the
    # helper's final ``return`` they are unreachable and the function would
    # always return None.
    tagged_text = tag(text)
    tagged_lower_text = [(word.lower(),penn_to_wn_tags(pos_tag)) for word, pos_tag in tagged_text]
    return tagged_lower_text


def lemmatize_text(text):
    """Lemmatize every word of ``text`` using its part of speech.

    Words whose tag has no WordNet equivalent are kept unchanged (lowercased
    by ``pos_tag_text``).

    Returns:
        the lemmatized tokens joined by single spaces.
    """
    pos_tagged_text = pos_tag_text(text)
    lemmatized_tokens = [wnl.lemmatize(word,pos_tag) if pos_tag else word for word,pos_tag in pos_tagged_text]
    # Join with a space: an empty separator would glue all words together
    # and make the text impossible to tokenize again.
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text
    

In [11]:
# function to remove special symbols and characters

In [12]:
def remove_special_characters(text):
    """Strip punctuation characters from every token of ``text``.

    The text is tokenized, every character listed in ``string.punctuation``
    is removed from each token, and tokens that become empty are dropped.

    Returns:
        the surviving tokens joined by single spaces.
    """
    tokens = tokenize_text(text)
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    stripped_tokens = (pattern.sub('',token) for token in tokens)
    filtered_tokens = [token for token in stripped_tokens if token]
    # Join with a space: an empty separator would merge all tokens into a
    # single word.
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [13]:
# function to remove stopwords

In [14]:
def remove_stopwords(text):
    """Drop every token of ``text`` that appears in ``stopword_list``.

    The comparison is an exact string match against the module-level
    ``stopword_list``.

    Returns:
        the remaining tokens joined by single spaces.
    """
    tokens = tokenize_text(text)
    # Set membership avoids scanning the whole stopword list per token.
    stopwords = set(stopword_list)
    filtered_tokens = [token for token in tokens if token not in stopwords]
    # Join with a space: an empty separator would merge all tokens into a
    # single word.
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [15]:
# text normalization pipeline

In [16]:
def normalize_corpus(corpus,tokenize=False):
    """Run the full normalization pipeline over every document in ``corpus``.

    Each document goes through, in order: contraction expansion,
    lemmatization, special-character removal and stopword removal.

    Args:
        corpus: iterable of document strings.
        tokenize: when True, each normalized document is returned as a list
            of tokens instead of a string.

    Returns:
        list with exactly one entry per input document: a string, or a token
        list when ``tokenize`` is True.
    """
    normalized_corpus = []
    for text in corpus:
        text = expand_contractions(text,CONTRACTION_MAP)
        text = lemmatize_text(text)
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        # Append exactly one representation per document so the output stays
        # aligned with the input corpus.
        if tokenize:
            normalized_corpus.append(tokenize_text(text))
        else:
            normalized_corpus.append(text)
    # Return only after the loop so every document is processed, not just
    # the first one.
    return normalized_corpus

# Feature Extraction

# Bag of Words

In [17]:
# Toy corpus of four short documents; the vectorizers in the following cells
# are fitted on it.
CORPUS = [
'the sky is blue',
'sky is blue and sky is beautiful',
'the beautiful sky is so blue',
'i love blue cheese'
]
# One extra document (kept as a one-element list) that is only ever passed to
# transform(), never to fit.
new_doc = ['loving this blue sky today']

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
def bow_extractor(corpus, ngram_range=(1,1)):
    """Fit a bag-of-words model on ``corpus``.

    Args:
        corpus: iterable of document strings.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to CountVectorizer.

    Returns:
        ``(vectorizer, features)`` -- the fitted CountVectorizer and the
        matrix produced by ``fit_transform`` on ``corpus``.
    """
    bow_model = CountVectorizer(ngram_range=ngram_range, min_df=1)
    term_counts = bow_model.fit_transform(corpus)
    return bow_model, term_counts

In [19]:
# Fit the bag-of-words model on the toy corpus; bow_vectorizer and
# bow_features are reused by later cells.
bow_vectorizer, bow_features = bow_extractor(CORPUS)
# Dense copy of the count matrix so it prints as a full table.
features = bow_features.todense()
print( features)

[[0 0 1 0 1 0 1 0 1]
 [1 1 1 0 2 0 2 0 0]
 [0 1 1 0 1 0 1 1 1]
 [0 0 1 1 0 1 0 0 0]]


In [20]:
# Encode the unseen document with the already-fitted vectorizer (transform
# only, no refit).
new_doc_features = bow_vectorizer.transform(new_doc)
# Rebind to the dense form; later cells read this dense matrix.
new_doc_features = new_doc_features.todense()
print (new_doc_features)

[[0 0 1 0 0 0 1 0 0]]


In [21]:
# Vocabulary terms in column order; used as column labels by display_features.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out() -- confirm against the
# installed version before re-running.
feature_names = bow_vectorizer.get_feature_names()
print( feature_names)

['and', 'beautiful', 'blue', 'cheese', 'is', 'love', 'sky', 'so', 'the']


In [22]:
import pandas as pd
def display_features(features, feature_names):
    """Print ``features`` as a table whose columns are ``feature_names``.

    Args:
        features: 2-D data accepted by ``pd.DataFrame`` (one row per document).
        feature_names: column labels, one per feature column.

    Returns:
        None; the table is written to standard output.
    """
    table = pd.DataFrame(features, columns=feature_names)
    print(table)
In [23]:
# Bag-of-words counts for the four corpus documents, one row per document.
display_features(features, feature_names)

   and  beautiful  blue  cheese  is  love  sky  so  the
0    0          0     1       0   1     0    1   0    1
1    1          1     1       0   2     0    2   0    0
2    0          1     1       0   1     0    1   1    1
3    0          0     1       1   0     1    0   0    0


In [24]:
# Bag-of-words counts for the unseen document, using the corpus vocabulary.
display_features(new_doc_features, feature_names)

   and  beautiful  blue  cheese  is  love  sky  so  the
0    0          0     1       0   0     0    1   0    0


# TF-IDF

In [25]:
from sklearn.feature_extraction.text import TfidfTransformer
def tfidf_transformer(bow_matrix):
    """Fit a TF-IDF weighting on an existing bag-of-words count matrix.

    The transformer is configured with L2 normalization, smoothed IDF and
    IDF weighting enabled.

    Returns:
        ``(transformer, tfidf_matrix)`` -- the fitted TfidfTransformer and
        the result of ``fit_transform`` on ``bow_matrix``.
    """
    tfidf_model = TfidfTransformer(use_idf=True, smooth_idf=True, norm='l2')
    weighted_matrix = tfidf_model.fit_transform(bow_matrix)
    return tfidf_model, weighted_matrix

In [26]:
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
# Column labels come from the bag-of-words vocabulary.
feature_names = bow_vectorizer.get_feature_names()
# Fit TF-IDF weights on the raw counts. The name `tdidf_features` (sic) is
# kept as spelled because a later cell rebinds the same name.
tfidf_trans, tdidf_features = tfidf_transformer(bow_features)
# Densify and round to two decimals purely for display.
features = np.round(tdidf_features.todense(), 2)
display_features(features, feature_names)

    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.40    0.00  0.49  0.00  0.49  0.00  0.60
1  0.44       0.35  0.23    0.00  0.56  0.00  0.56  0.00  0.00
2  0.00       0.43  0.29    0.00  0.35  0.00  0.35  0.55  0.43
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00


In [27]:
# Apply the fitted TF-IDF weighting to the unseen document's counts
# (transform only, no refit).
nd_tfidf = tfidf_trans.transform(new_doc_features)
# Densify and round to two decimals purely for display.
nd_features = np.round(nd_tfidf.todense(), 2)
display_features(nd_features, feature_names)

   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.63     0.0  0.0   0.0  0.77  0.0  0.0


In [28]:
# generic function to directly compute the tfidf-based feature vectors

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_extractor(corpus, ngram_range=(1,1)):
    """Fit a TF-IDF vectorizer directly on raw documents.

    Args:
        corpus: iterable of document strings.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to TfidfVectorizer.

    Returns:
        ``(vectorizer, features)`` -- the fitted TfidfVectorizer and the
        matrix produced by ``fit_transform`` on ``corpus``.
    """
    vectorizer_options = dict(min_df=1,
                              norm='l2',
                              smooth_idf=True,
                              use_idf=True,
                              ngram_range=ngram_range)
    tfidf_model = TfidfVectorizer(**vectorizer_options)
    weighted_matrix = tfidf_model.fit_transform(corpus)
    return tfidf_model, weighted_matrix

In [30]:
# One-step TF-IDF on the raw corpus; rebinds `tdidf_features` (sic).
tfidf_vectorizer, tdidf_features = tfidf_extractor(CORPUS)
# NOTE(review): feature_names was taken from bow_vectorizer, not from
# tfidf_vectorizer; the labels are only right if both learned the same
# vocabulary in the same column order -- confirm.
display_features(np.round(tdidf_features.todense(), 2), feature_names)

    and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.40    0.00  0.49  0.00  0.49  0.00  0.60
1  0.44       0.35  0.23    0.00  0.56  0.00  0.56  0.00  0.00
2  0.00       0.43  0.29    0.00  0.35  0.00  0.35  0.55  0.43
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00


In [31]:
# Encode the unseen document with the fitted TF-IDF vectorizer (transform
# only); rebinds nd_tfidf from the earlier cell.
nd_tfidf = tfidf_vectorizer.transform(new_doc)
# Column labels again come from the bag-of-words vocabulary (feature_names).
display_features(np.round(nd_tfidf.todense(), 2), feature_names) 

   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.63     0.0  0.0   0.0  0.77  0.0  0.0


# Advanced Word Vectorization Models

In [32]:
import gensim

In [33]:
# Word2Vec is trained on token lists, so tokenize every corpus document.
TOKENIZED_CORPUS = [nltk.word_tokenize(sentence)
                    for sentence in CORPUS]
# Same tokenization for the unseen document (one token list inside a list).
tokenized_new_doc = [nltk.word_tokenize(sentence)
                    for sentence in new_doc]      
# Train a small Word2Vec model on the toy corpus.
# NOTE(review): `size=` is the pre-4.0 gensim keyword (renamed `vector_size`
# in gensim 4) -- confirm the installed version.
# NOTE(review): no seed is set here, so the vectors printed in later cells
# are presumably not reproducible across runs -- confirm.
model = gensim.models.Word2Vec(TOKENIZED_CORPUS, size=10, window=10,
                               min_count=2, sample=1e-3)

  "C extension not loaded, training will be slow. "


# Averaged Word Vectors

In [34]:
# Look the vector up through model.wv: indexing the Word2Vec model directly
# is deprecated in gensim and emits a DeprecationWarning.
print(model.wv['sky'])

[ 0.04257333 -0.00983948 -0.00890173  0.03999743  0.00382699 -0.01049888
  0.00469211 -0.01632833 -0.01661829  0.00136855]


  """Entry point for launching an IPython kernel.


In [35]:
# Look the vector up through model.wv: indexing the Word2Vec model directly
# is deprecated in gensim and emits a DeprecationWarning.
print(model.wv['blue'])

[-0.03865465  0.02325692 -0.04004833  0.01042387  0.04479002  0.04707708
 -0.00243584  0.02990141 -0.00487392 -0.01143104]


  """Entry point for launching an IPython kernel.


In [36]:
def average_word_vectors(words, model, vocabulary, num_features):
    """Average the vectors of all in-vocabulary words of one document.

    Args:
        words: iterable of tokens for a single document.
        model: object providing word vectors, either via a ``wv`` attribute
            (as the Word2Vec model used in this notebook does) or by being
            directly indexable by word.
        vocabulary: container of words that have a vector; other words are
            skipped.
        num_features: length of each word vector.

    Returns:
        float64 array of length ``num_features``: the mean of the matched
        word vectors, or all zeros when no word is in ``vocabulary``.
    """
    # Prefer model.wv: indexing a Word2Vec model directly is deprecated.
    # Objects without ``wv`` are indexed as-is, so existing callers that pass
    # a plain word -> vector mapping keep working.
    vectors = getattr(model, 'wv', model)
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, vectors[word])
    # Guard against division by zero when nothing matched.
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector

In [37]:
def averaged_word_vectorizer(corpus, model, num_features):
    """Build one averaged word vector per tokenized document.

    Args:
        corpus: iterable of token lists, one per document.
        model: trained model exposing ``model.wv.index2word``.
        num_features: length of each word vector.

    Returns:
        numpy array with one row per document, each row being the output of
        ``average_word_vectors`` for that document.
    """
    known_words = set(model.wv.index2word)
    document_vectors = []
    for document_tokens in corpus:
        document_vectors.append(
            average_word_vectors(document_tokens, model, known_words, num_features))
    return np.array(document_vectors)

In [38]:
# Averaged word vectors for the corpus documents; num_features=10 matches the
# size=10 the Word2Vec model was trained with.
avg_word_vec_features = averaged_word_vectorizer(corpus=TOKENIZED_CORPUS,
                                               model=model,
                                             num_features=10)
# Rounded to three decimals purely for display.
print(np.round(avg_word_vec_features, 3))

[[ 0.009  0.008 -0.011  0.026  0.009  0.009 -0.005  0.014 -0.01  -0.014]
 [ 0.015  0.002  0.005  0.011  0.002  0.013 -0.011  0.002 -0.011 -0.003]
 [ 0.002 -0.    -0.004  0.013  0.005 -0.    -0.014  0.005 -0.011 -0.011]
 [-0.039  0.023 -0.04   0.01   0.045  0.047 -0.002  0.03  -0.005 -0.011]]


  import sys


In [39]:
# Averaged word vector for the unseen document; tokens missing from the
# model's vocabulary are skipped by average_word_vectors.
nd_avg_word_vec_features = averaged_word_vectorizer(corpus=tokenized_new_doc,model=model,num_features=10)
# Rounded to three decimals purely for display.
print( np.round(nd_avg_word_vec_features, 3))

[[ 0.002  0.007 -0.024  0.025  0.024  0.018  0.001  0.007 -0.011 -0.005]]


  import sys


# TF-IDF Weighted Averaged Word Vectors