In [95]:
import pandas as pd
import numpy as np
import nltk
import gensim
import re
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from gensim import corpora
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
df = pd.read_excel('hvac_all_issues.xlsx')

In [4]:
len(df)

43041

In [8]:
def combine_columns(df,cols):
    all_docs = []
    for index, row in df.iterrows():
        tempstr = ''
        for col in cols:
            try:
                tempstr += str(row[col]) + ' '
            except:
                print('Column Not Found : ', col)
        all_docs.append(tempstr)
    return all_docs        

In [10]:
issue_text = combine_columns(df, ['Action.Requested', 'General.Comments', 'Task.Comments'])

In [11]:
issue_text[0:5]

['Portable Generator Maintenance-Monthly nan nan ',
 'Roof Drain Quarterly Maintenance nan nan ',
 'Marcos with SACS called to report the ballrooms are too warm for an event going on until 9pm. Spoke to Don at CP. nan 2017-05-19 - 18290 - Donald Miller-Checked la sala and made adjustments to temperature setpoint.  Followed up, ok - Donald Miller ',
 'The sink in our lobby area is not working. The sensor turns water on but it does not stay on. nan 2017-05-18 - 23070 - Tim Jowers-pulled it down and cleaned the strainers also reset the sensor that controls the faucet. called the factory to lean how to reset the control to set time of run, not covered in the paperwork. - Tim Jowers ',
 'Please adjust the temperature in the lab to approx 75 Degrees. They say its tooooo cold. Thank you. nan 2017-05-18 - 18300 - Chad T Ball-Adjusted set point to 75. Damper was found in operator control. Released damper to automatic control. - Chad T Ball ']

In [20]:
def remove_punctutation(all_docs):
    no_punctuation_docs = []
    for doc in all_docs:
        doc = doc.replace("-", " ")
        no_punctuation_docs.append(re.sub(r'[^\w\s]','',doc))
    return no_punctuation_docs

In [14]:
def tokenizer(all_docs):
    tokenized_docs = []
    for doc in all_docs:
        tokenized_docs.append(word_tokenize(doc))
    return tokenized_docs

In [15]:
def lemmatizer(all_docs):
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized_docs = []
    for doc in all_docs:
        temp = []
        for token in doc:
            if token.isalpha():
                temp.append(wordnet_lemmatizer.lemmatize(token,"v"))
        lemmatized_docs.append(temp)
    return lemmatized_docs

In [16]:
def stemmer(all_docs):
    porter_stemmer = PorterStemmer()
    stemmed_docs = []
    for doc in all_docs:
        temp = []
        for token in doc:
            temp.append(porter_stemmer.stem(token))
        stemmed_docs.append(temp)
    return stemmed_docs   

In [17]:
def untokenizer(all_docs):
    untokenized_docs = []
    for doc in all_docs:
        untokenized_docs.append(" ".join(doc))
    return untokenized_docs

In [30]:
def create_dtm(all_docs,stopwords,ngram):
    vectorizer = CountVectorizer(lowercase=True,min_df=5,ngram_range=(1, ngram),stop_words=stopwords)
    dtm = vectorizer.fit_transform(all_docs)
    return vectorizer,dtm

In [31]:
def tfidf_transformer(dtm):
    tf_transformer = TfidfTransformer()
    tfidf = tf_transformer.fit_transform(dtm)
    return tf_transformer,tfidf

In [24]:
#REMOVE PUNCTUATION
punctuation_issues = remove_punctutation(issue_text)

In [25]:
#Tokenize each issue text
tokenized_issues = tokenizer(punctuation_issues)

In [26]:
#lemmatize the tokens
lemmatized_issues = lemmatizer(tokenized_issues)

In [27]:
#Stem the words
stemmed_issues = stemmer(lemmatized_issues)

In [28]:
#Untokenize the tokens to form sentence again
untokenized_issues = untokenizer(stemmed_issues)

In [29]:
stop_words = stopwords.words('english')
stop_words.append('nan')

In [32]:
vectorizer, dtm = create_dtm(untokenized_issues,stop_words,1)

In [33]:
tf_transformer, tfidf = tfidf_transformer(dtm)

In [34]:
np.shape(tfidf)

(43041, 4334)

In [45]:
vectorizer = TfidfVectorizer(stop_words=stop_words, use_idf=True, ngram_range=(1,1))
# tokenize and build vocab
tfidf_matrix = vectorizer.fit_transform(untokenized_issues)

In [46]:
np.shape(tfidf_matrix)

(43041, 14562)

In [48]:
tfidf_dense = tfidf_matrix.todense()

In [49]:
vectorizer.vocabulary_

{'portabl': 9639,
 'gener': 5252,
 'mainten': 7584,
 'monthli': 8148,
 'roof': 10869,
 'drain': 3894,
 'quarterli': 10054,
 'marco': 7665,
 'sac': 11075,
 'call': 1820,
 'report': 10544,
 'ballroom': 1001,
 'warm': 13964,
 'event': 4459,
 'go': 5341,
 'spoke': 12019,
 'cp': 3040,
 'donald': 3804,
 'miller': 8007,
 'check': 2158,
 'la': 6986,
 'sala': 11101,
 'make': 7593,
 'adjust': 186,
 'temperatur': 12691,
 'setpoint': 11437,
 'follow': 4960,
 'ok': 8774,
 'sink': 11715,
 'lobbi': 7364,
 'area': 667,
 'work': 14318,
 'sensor': 11376,
 'turn': 13290,
 'water': 14005,
 'stay': 12182,
 'tim': 12950,
 'jower': 6688,
 'pull': 9938,
 'clean': 2369,
 'strainer': 12275,
 'also': 457,
 'reset': 10621,
 'control': 2866,
 'faucet': 4670,
 'factori': 4592,
 'lean': 7144,
 'set': 11431,
 'time': 12953,
 'run': 11034,
 'cover': 3026,
 'paperwork': 9177,
 'pleas': 9508,
 'lab': 6987,
 'approx': 631,
 'degre': 3435,
 'say': 11185,
 'tooooo': 13078,
 'cold': 2543,
 'thank': 12793,
 'chad': 2082,
 'b

In [64]:
# Train Word2Vec Model from all the tokenized documents after stemming
model = gensim.models.Word2Vec(stemmed_issues, min_count=1, size=100)
model.save('word2vec_model')

In [63]:
stemmed_issues[0]

['portabl', 'gener', 'mainten', 'monthli', 'nan', 'nan']

In [65]:
#TODO: Need to remove stop words
len(model.wv.vocab)

15026

In [69]:
np.shape(model.wv['mainten'])

(100,)

In [87]:
def generate_sentence_vector(tokens, model, vectorizer, tfidf_dense):
    vector = np.zeros(model.vector_size)
    for token in tokens:
        if token in model.wv.vocab and token in vectorizer.vocabulary_:
            vector = vector + model.wv[token] * tfidf_dense[0,vectorizer.vocabulary_[token]]
    return vector
            
    

In [88]:
issue_features = []
for i in range(0,len(stemmed_issues)):
    
    issue_features.append(generate_sentence_vector(stemmed_issues[i], model, vectorizer, tfidf_dense[i]))
np.shape(issue_features)

(43041, 100)

In [90]:
list(df)

['Work.Order',
 'WO.Type',
 'Charge.Type',
 'Reference.Number',
 'Project',
 'Action.Requested',
 'Priority',
 'Status',
 'Requestor',
 'Req.Phone',
 'Department',
 'Dept.Number',
 'Account.Number.s.',
 'Campus',
 'Building',
 'Floor',
 'Area',
 'Area.Function',
 'Area.Type',
 'Object.Type',
 'WO.Object',
 'Date.Requested',
 'Date.Estimate.Began',
 'Date.Estimate.Was.Completed',
 'Days.to.Estimate.Completed',
 'Date.WO.Issued',
 'Days.to.Issue',
 'Scheduled.Start.Date',
 'Date.Funded',
 'Funded.Amount',
 'First.Charge',
 'Days.to.Start',
 'Scheduled.Completion.Date',
 'Date.Finished',
 'Days.to.Finish',
 'Date.Completed',
 'Days.to.Completion',
 'Labor.Hours',
 'FM.Labor',
 'Flat.Bill',
 'Warehouse.Parts',
 'NonStock.Received.and.Invoiced',
 'NonStock.Received.Not.Invoiced',
 'NonStock.On.Order.Not.Received',
 'Total',
 'Assigned.Trades',
 'General.Comments',
 'Task.Comments',
 'Topics',
 'filter',
 'total',
 'vent',
 'Gauges',
 'Thermometers',
 'thermocouple',
 'Thermostat',
 'cage',


In [92]:
issue_vector_df = pd.DataFrame()
issue_vector_df['Work.Order'] = df['Work.Order']
issue_vector_df['Action.Requested'] = df['Action.Requested']
issue_vector_df['General.Comments'] = df['General.Comments']
issue_vector_df['Task.Comments'] = df['Task.Comments']
issue_vector_df['Building'] = df['Building']
issue_vector_df['Campus'] = df['Campus']
issue_vector_df['Total'] = df['Total']
issue_vector_df['Text.Vector'] = issue_features

In [94]:
issue_vector_df.head()

Unnamed: 0,Work.Order,Action.Requested,General.Comments,Task.Comments,Building,Campus,Total,Text.Vector
1,WEST-111222,Portable Generator Maintenance-Monthly,,,906 - CENTRAL SERVICES COMPLEX,West Campus,0.0,"[-0.223442912102, 0.775393560529, 1.0544666517..."
2,WEST-111219,Roof Drain Quarterly Maintenance,,,903 - UNIVERSITY CENTER - 903,West Campus,0.0,"[-1.36261639744, 2.07899846137, -0.37214660644..."
3,WEST-111213,Marcos with SACS called to report the ballroom...,,2017-05-19 - 18290 - Donald Miller-Checked la ...,903 - UNIVERSITY CENTER - 903,West Campus,46.72,"[0.816895168449, -0.226713138167, 0.1999728232..."
4,WEST-111208,The sink in our lobby area is not working. The...,,2017-05-18 - 23070 - Tim Jowers-pulled it down...,903 - UNIVERSITY CENTER - 903,West Campus,93.44,"[-1.29352224525, 0.387565576471, -2.3586586981..."
5,WEST-111206,Please adjust the temperature in the lab to ap...,,2017-05-18 - 18300 - Chad T Ball-Adjusted set ...,905 - CLASSROOM/LAB/COMPUTER CLASSROOM,West Campus,0.0,"[-0.398334584788, -3.6127059297, 1.25289469102..."


In [96]:
issue_vector_df.to_pickle('issue_text_vector.pkl')