In [1]:
# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
import pandas as pd       
tontozona = pd.read_csv("E:/ASU Energy Leadership Informatics/neptune files/Maintenance Requests/Campus CSV Files/Topic Modeling/TONTOZONA.csv", header=0, encoding="cp1252", low_memory=False)

In [2]:
import gensim
from gensim import corpora



In [3]:
# Import BeautifulSoup into workspace
from bs4 import BeautifulSoup             
import re
import nltk
# Download text data sets, including stop words
from nltk.corpus import stopwords

In [4]:
#Function to reuse the code
def maintenance(tma):
    # Function to convert a raw action request to a string of words
    # The input is a single string, and 
    # the output is a single string (a processed action request)
    #
    # 1. Remove HTML
    text = BeautifulSoup(tma,"html.parser").get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-z A-Z]", " ", text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))   

In [5]:
# Get the number of requests based on the dataframe column size
num_requests = tontozona["ActionRequested"].size

# Initialize an empty list to hold the clean requests
clean_train_requests = []

# Loop over each request; create an index i that goes from 0 to the length
# of the maintenance request list 
for i in range( 0, num_requests ):
    # Call our function for each one, and add the result to the list of
    # clean requests
    clean_train_requests.append(maintenance( tontozona["ActionRequested"][i] ) )

In [6]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in tontozona["ActionRequested"]]    

In [7]:
# Creating the term dictionary of our corpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(doc_clean)

In [8]:
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [9]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

In [10]:
# Running and Training LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=6, id2word = dictionary, passes=100, alpha=0.001)

In [11]:
print(ldamodel.print_topics(num_topics=6, num_words=6))

[(0, '0.285*"furnacebiannual" + 0.267*"equipmentbiannual" + 0.267*"dx" + 0.002*"water" + 0.002*"heaterannual" + 0.002*"walkin"'), (1, '0.187*"walkin" + 0.094*"freezerbiannual" + 0.094*"coolerbiannual" + 0.037*"contact" + 0.037*"compressorquarterly" + 0.025*"tom"'), (2, '0.220*"cooler" + 0.220*"biannual" + 0.220*"evaporative" + 0.037*"fountainannual" + 0.037*"drinking" + 0.007*"attachment"'), (3, '0.292*"air" + 0.185*"handlerbiannual" + 0.063*"compressorbiannual" + 0.018*"replaced" + 0.018*"hall" + 0.018*"adjust"'), (4, '0.280*"water" + 0.274*"heaterannual" + 0.046*"exhaust" + 0.046*"fanbiannual" + 0.035*"electrical" + 0.035*"heaterbiannual"'), (5, '0.226*"fire" + 0.220*"inspectionquarterly" + 0.220*"pump" + 0.019*"camp" + 0.013*"multiple" + 0.013*"reconstruction"')]


In [12]:
import pyLDAvis.gensim

In [13]:
vis1 =  pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  topic_term_dists = topic_term_dists.ix[topic_order]


In [14]:
pyLDAvis.display(vis1)

###                                                                                                         Word2Vec

In [15]:
#to train Word2Vec it is better not to remove stop words because the algorithm relies on the broader context of the sentence in order to produce high-quality word vectors. 
#Function to reuse the code
def maintenance( tma, remove_stopwords=False):
    # Function to convert a raw action request to a string of words
    # The input is a single string, and 
    # the output is a single string (a processed action request)
    #
    # 1. Remove HTML
    text = BeautifulSoup(tma,"html.parser").get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]                  
    
    # 5. Return the result.
    return( words)   

In [16]:
# Download the punkt tokenizer for sentence splitting
import nltk.data   

In [17]:
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [18]:
# Define a function to split an action request into parsed sentences
def request_to_sentences(tma, tokenizer, remove_stopwords=False):
    # Function to split a request into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(tma.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call maintenance to get a list of words
            sentences.append(maintenance( raw_sentence,\
              remove_stopwords))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

In [20]:
%%time
sentences = []  # Initialize an empty list of sentences

print("Parsing sentences from training set")
for tma in tontozona["ActionRequested"]:
    sentences += request_to_sentences(tma, tokenizer)

Parsing sentences from training set
Wall time: 57 ms


Training the model
With the list of parsed sentences, we're ready to train the model. There are a number of parameter choices that affect the run time and the quality of the final model that is produced. For details on the algorithms below, see the word2vec API documentation as well as the Google documentation.

Architecture: Architecture options are skip-gram (default) or continuous bag of words. We found that skip-gram was very slightly slower but produced better results. Training algorithm: Hierarchical softmax (default) or negative sampling. For us, the default worked well. Downsampling of frequent words: 

The Google documentation recommends values between .00001 and .001. For us, values closer 0.001 seemed to improve the accuracy of the final model. Word vector dimensionality: More features result in longer runtimes, and often, but not always, result in better models. Reasonable values can be in the tens to hundreds; we used 300. Context / window size: How many words of context should the training algorithm take into account? 

10 seems to work well for hierarchical softmax (more is better, up to a point). Worker threads: Number of parallel processes to run. This is computer-specific, but between 4 and 6 should work on most systems. Minimum word count: This helps limit the size of the vocabulary to meaningful words. Any word that does not occur at least this many times across all documents is ignored. Reasonable values could be between 10 and 100. 

In [23]:
# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

In [24]:
# Set values for various parameters
num_features = 10000    # Word vector dimensionality                      
min_word_count = 10   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

In [25]:
%%time
# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)


2017-07-12 15:01:11,132 : INFO : collecting all words and their counts
2017-07-12 15:01:11,135 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-07-12 15:01:11,138 : INFO : collected 129 word types from a corpus of 896 raw words and 260 sentences
2017-07-12 15:01:11,140 : INFO : Loading a fresh vocabulary
2017-07-12 15:01:11,142 : INFO : min_count=10 retains 17 unique words (13% of original 129, drops 112)
2017-07-12 15:01:11,144 : INFO : min_count=10 leaves 685 word corpus (76% of original 896, drops 211)
2017-07-12 15:01:11,147 : INFO : deleting the raw counts dictionary of 129 items
2017-07-12 15:01:11,154 : INFO : sample=0.001 downsamples 17 most-common words
2017-07-12 15:01:11,156 : INFO : downsampling leaves estimated 96 word corpus (14.2% of prior 685)
2017-07-12 15:01:11,158 : INFO : estimated required memory for 17 words and 10000 dimensions: 1368500 bytes
2017-07-12 15:01:11,206 : INFO : resetting layer weights
2017-07-12 15:01:11,284 : INFO : t

Training model...


2017-07-12 15:01:11,366 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-07-12 15:01:11,371 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-07-12 15:01:11,379 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-07-12 15:01:11,486 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-07-12 15:01:11,488 : INFO : training on 4480 raw words (427 effective words) took 0.2s, 2378 effective words/s


Wall time: 361 ms


In [48]:
model.most_similar("fire")

[('inspection', 0.02311844937503338),
 ('equipment', 0.013965419493615627),
 ('quarterly', 0.012161565013229847),
 ('handler', 0.011318184435367584),
 ('water', 0.010789703577756882),
 ('biannual', 0.010148197412490845),
 ('air', 0.006586567964404821),
 ('evaporative', 0.004941080696880817),
 ('dx', 0.004720899276435375),
 ('in', 0.0017127208411693573)]