https://medium.com/@surmenok/natural-language-pipeline-for-chatbots-897bda41482

In [138]:
from autocorrect import spell
import pandas as pd
from nltk.corpus import stopwords # Import the stop word list
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import re
import nltk.data
import logging
import numpy as np

In [111]:
# Shared NLP resources for the text-cleaning helpers defined in the cells below.
# English stop-word list. Note: the helpers visible in this notebook call
# stopwords.words("english") themselves and do not read this variable.
stop = stopwords.words("english")
# Lemmatizer and stemmer instances used by clean_text and text_to_wordlist.
wordnet_lemmatizer = WordNetLemmatizer()
snowball_stemmer = SnowballStemmer("english")
# Load the punkt sentence tokenizer; parse_and_clean_sentences passes it on
# to text_to_sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [71]:
def clean_text(raw_data):
    """Normalise a raw text string into one space-separated string of tokens.

    Steps, in order: every character that is not an ASCII letter becomes a
    space; the text is lower-cased and split on whitespace; English stop
    words are dropped; each remaining token is Snowball-stemmed and the stem
    is then passed through the WordNet lemmatizer.

    Uses the module-level ``snowball_stemmer`` and ``wordnet_lemmatizer``.
    No spell correction is applied.

    Parameters
    ----------
    raw_data : str
        Text to clean.

    Returns
    -------
    str
        The processed tokens joined by single spaces ("" if none survive).
    """
    tokens = re.sub("[^a-zA-Z]", " ", raw_data).lower().split()

    # Set membership is much faster than scanning the stop-word list.
    english_stops = set(stopwords.words("english"))

    processed = []
    for token in tokens:
        if token in english_stops:
            continue
        stemmed = snowball_stemmer.stem(token)
        processed.append(wordnet_lemmatizer.lemmatize(stemmed))

    return " ".join(processed)

In [75]:
def text_preprocess(df):
    """Clean every entry of ``df["text"]`` with ``clean_text``.

    Parameters
    ----------
    df : pandas.DataFrame (or any mapping whose "text" entry is iterable)
        Source of the raw texts.

    Returns
    -------
    list of str
        One cleaned string per row, in row order.
    """
    # Iterate over the column's values instead of looking up df["text"][i]
    # for i in range(0, n): integer lookups on a Series are label-based, so
    # the positional loop fails (or reads the wrong rows) as soon as the
    # frame has a non-default index, e.g. after filtering rows.
    return [clean_text(text) for text in df["text"]]

### word2vec training

In [101]:
def text_to_wordlist(text, remove_stopwords=False):
    """Convert a text string into a list of normalised tokens.

    Uses the module-level ``snowball_stemmer`` and ``wordnet_lemmatizer``.
    No spell correction is applied.

    Parameters
    ----------
    text : str
        Text to tokenize.
    remove_stopwords : bool, default False
        When True, English stop words are dropped before stemming.

    Returns
    -------
    list of str
        The processed tokens in their original order (a list, not a joined
        string).
    """
    # 1. Replace everything that is not an ASCII letter with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", text)

    # 2. Lower-case and split into individual words.
    words = letters_only.lower().split()

    # 3. Optionally remove stop words. The stop-word list is only loaded when
    #    it is actually needed; a set makes the membership test fast.
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # 4. Stem each word, then pass the stem through the lemmatizer.
    return [wordnet_lemmatizer.lemmatize(snowball_stemmer.stem(w)) for w in words]

In [103]:
# Define a function to split a review into parsed sentences.
# We need sentences because word2vec takes sentences as input. The split relies on sentence boundary detection (SBD).
def text_to_sentences(text, tokenizer, remove_stopwords=False):
    """Split ``text`` into sentences and turn each sentence into a word list.

    Parameters
    ----------
    text : str
        Text to split; leading and trailing whitespace is stripped first.
    tokenizer : object
        Must provide ``tokenize(str)`` returning the raw sentences.
    remove_stopwords : bool, default False
        Forwarded to ``text_to_wordlist`` for every sentence.

    Returns
    -------
    list of list of str
        One word list per non-empty raw sentence, in order.
    """
    return [
        text_to_wordlist(candidate, remove_stopwords)
        for candidate in tokenizer.tokenize(text.strip())
        if len(candidate) > 0  # empty raw sentences are skipped
    ]

In [107]:
def parse_and_clean_sentences(df):
    """Collect the parsed sentences of every entry in ``df["text"]``.

    Each text is split with ``text_to_sentences`` using the module-level
    ``tokenizer``; the per-text results are concatenated into one flat list.
    Progress messages are printed before and after parsing.

    Returns
    -------
    list
        All sentences from all texts, in order.
    """
    print("Parsing sentences from training set")
    parsed = [
        sentence
        for document in df["text"]
        for sentence in text_to_sentences(document, tokenizer)
    ]
    print("parsing done!")
    return parsed

### Train and save word2vec
With the list of nicely parsed sentences, we're ready to train the model. There are a number of parameter choices that affect the run time and the quality of the final model that is produced. For details on the algorithms below, see the word2vec API documentation as well as the Google documentation. 

* Architecture: Architecture options are skip-gram (default) or continuous bag of words. We found that skip-gram was very slightly slower but produced better results.
* Training algorithm: Hierarchical softmax (default) or negative sampling. For us, the default worked well.
* Downsampling of frequent words: The Google documentation recommends values between .00001 and .001. For us, values closer to 0.001 seemed to improve the accuracy of the final model.
* Word vector dimensionality: More features result in longer runtimes, and often, but not always, result in better models. Reasonable values can be in the tens to hundreds; we used 300.
* Context / window size: How many words of context should the training algorithm take into account? 10 seems to work well for hierarchical softmax (more is better, up to a point).
* Worker threads: Number of parallel processes to run. This is computer-specific, but between 4 and 6 should work on most systems.
* Minimum word count: This helps limit the size of the vocabulary to meaningful words. Any word that does not occur at least this many times across all documents is ignored. Reasonable values could be between 10 and 100.

In [122]:
def train_word2vec(df, num_features = 300, min_word_count = 1, num_workers = 4, context = 4, downsampling = 1e-3,
                   model_name = "trainedWord2vecmodel"):
    """Train a gensim Word2Vec model on ``df["text"]`` and save it to disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "text" column; it is parsed into sentences with
        ``parse_and_clean_sentences``.
    num_features : int, default 300
        Word vector dimensionality (passed as ``size``).
    min_word_count : int, default 1
        Minimum word count (passed as ``min_count``).
    num_workers : int, default 4
        Number of worker threads (passed as ``workers``).
    context : int, default 4
        Context window size (passed as ``window``).
    downsampling : float, default 1e-3
        Downsampling setting for frequent words (passed as ``sample``).
    model_name : str, default "trainedWord2vecmodel"
        Path the trained model is saved under.

    Returns
    -------
    The trained model. It can be reloaded later with
    ``Word2Vec.load(model_name)``.

    Side effects: configures root logging at INFO level, prints progress
    messages and writes the model file(s) to disk.
    """
    # NOTE(review): ``size=`` and ``init_sims`` are the gensim 3.x spellings;
    # gensim 4 renamed ``size`` to ``vector_size`` -- confirm the installed
    # version before upgrading.
    from gensim.models import word2vec

    # Parse and clean the texts into a list of sentences (lists of words).
    sentences = parse_and_clean_sentences(df)

    # Initialize logging so gensim's training progress is shown.
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)

    # Initialize and train the model (this will take some time).
    print ("Training model...")
    model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features,
                              min_count=min_word_count, window=context,
                              sample=downsampling)

    # init_sims will make the model much more memory-efficient.
    # NOTE(review): with replace=True the model presumably cannot be trained
    # any further afterwards -- confirm against the gensim documentation.
    model.init_sims(replace=True)

    # Save the model for later use.
    model.save(model_name)
    print ("model saved as", model_name)

    return model

In [126]:
# Set values for various parameters (passed positionally to train_word2vec in a later cell)
num_features = 300    # Word vector dimensionality
min_word_count = 1    # Minimum word count; 1 keeps every word, however rare
num_workers = 4       # Number of threads to run in parallel
context = 4           # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

In [127]:
## Import the dataset: a tab-delimited file whose first row is the header.
# quoting=3 is csv.QUOTE_NONE, so quote characters are read as ordinary text.
# The helpers in this notebook read the "text" column of this frame.
df = pd.read_csv("test.csv", header=0, delimiter="\t", quoting=3)

In [128]:
# Train on df with the parameter values set above; train_word2vec also saves
# the resulting model to disk before returning it.
model = train_word2vec(df, num_features, min_word_count, num_workers, context, downsampling)

2017-11-03 20:52:43,944 : INFO : collecting all words and their counts
2017-11-03 20:52:43,945 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-11-03 20:52:43,946 : INFO : collected 20 word types from a corpus of 26 raw words and 4 sentences
2017-11-03 20:52:43,947 : INFO : Loading a fresh vocabulary
2017-11-03 20:52:43,948 : INFO : min_count=1 retains 20 unique words (100% of original 20, drops 0)
2017-11-03 20:52:43,949 : INFO : min_count=1 leaves 26 word corpus (100% of original 26, drops 0)
2017-11-03 20:52:43,950 : INFO : deleting the raw counts dictionary of 20 items
2017-11-03 20:52:43,951 : INFO : sample=0.001 downsamples 20 most-common words
2017-11-03 20:52:43,952 : INFO : downsampling leaves estimated 4 word corpus (15.9% of prior 26)
2017-11-03 20:52:43,953 : INFO : estimated required memory for 20 words and 300 dimensions: 58000 bytes
2017-11-03 20:52:43,954 : INFO : resetting layer weights
2017-11-03 20:52:43,955 : INFO : training model with

Parsing sentences from training set
parsing done!
Training model...
model saved as trainedWord2vecmodel


In [131]:
# List the vocabulary words the model scores as most similar to "hi".
# The training log above reports a corpus of only 26 raw words, so the
# scores shown below are low.
model.most_similar("hi")

[('wher', 0.12596893310546875),
 ('are', 0.08614814281463623),
 ('shim', 0.06376539170742035),
 ('tell', 0.06363306939601898),
 ('have', 0.05454188585281372),
 ('me', 0.04825294017791748),
 ('is', 0.03167020156979561),
 ('a', 0.017723508179187775),
 ('when', 0.01742667891085148),
 ('can', 0.006341244094073772)]

In [133]:
# Load the saved model back from disk; the file name matches the one
# train_word2vec saves under. This rebinds `model` to the loaded object.
from gensim.models import Word2Vec
model = Word2Vec.load("trainedWord2vecmodel")

2017-11-03 21:18:30,746 : INFO : loading Word2Vec object from trainedWord2vecmodel
2017-11-03 21:18:30,751 : INFO : loading wv recursively from trainedWord2vecmodel.wv.* with mmap=None
2017-11-03 21:18:30,752 : INFO : setting ignored attribute syn0norm to None
2017-11-03 21:18:30,754 : INFO : setting ignored attribute cum_table to None
2017-11-03 21:18:30,756 : INFO : loaded trainedWord2vecmodel


In [143]:
# model.wv.syn0.shape
# model.wv.index2word
# model["word"]

In [None]:
##A simple way to assign a word2vec vector to a document is to take a mean of its words.
def makeFeatureVec(words, model, num_features):
    """Average the word vectors of ``words`` into a single feature vector.

    Parameters
    ----------
    words : iterable of str
        Tokens of one paragraph / document.
    model : trained Word2Vec model
        Must expose ``model.wv.index2word`` (the vocabulary) and
        ``model.wv[word]`` (the vector of a word).
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray of shape (num_features,)
        Mean of the vectors of the words found in the model's vocabulary.
        If none of the words is in the vocabulary (including empty input)
        an all-zero float32 vector is returned instead of dividing by zero,
        which would yield NaNs.
    """
    # Pre-initialize the accumulator.
    featureVec = np.zeros((num_features,), dtype="float32")

    # index2word is a list that contains the names of the words in the
    # model's vocabulary. Convert it to a set, for speed.
    index2word_set = set(model.wv.index2word)

    # Keep only the words the model knows about.
    known_words = [word for word in words if word in index2word_set]
    if not known_words:
        return featureVec

    # Sum the vectors; read them through model.wv, which this function
    # already relies on for the vocabulary.
    for word in known_words:
        featureVec = np.add(featureVec, model.wv[word])

    # Divide the sum by the number of words to get the average.
    return np.divide(featureVec, float(len(known_words)))