In [1]:
# Load the labeled training set of maintenance requests into a DataFrame.
# The first CSV row supplies the column names and the file is read as cp1252.
import pandas as pd

train = pd.read_csv(
    "E:/ASU Energy Leadership Informatics/neptune files/Maintenance Requests/train.csv",
    header=0,
    encoding="cp1252",
    low_memory=False,
)

In [2]:
# Dimensions of the training DataFrame as (rows, columns)
train.shape

(20676, 7)

In [3]:
# Display the column labels of the training DataFrame as an array
train.columns.values

array(['Unnamed: 0', 'WorkOrder', 'WOType', 'ActionRequested', 'Priority',
       'WO', 'WOPriority'], dtype=object)

### Data Cleaning and Text Preprocessing

In [4]:
# Import BeautifulSoup into your workspace
from bs4 import BeautifulSoup             
import re
import nltk
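# Stop words are read from NLTK's "stopwords" corpus. This import does not
# download it; run nltk.download("stopwords") once if the corpus is missing.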
from nltk.corpus import stopwords


In [5]:
#Function to reuse the code
def maintenance( tma ):
    """Convert one raw action request into a cleaned, space-separated string.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lower-case, split on whitespace, drop English stop words, and
    join the remaining words with single spaces.

    Parameters
    ----------
    tma : str
        Raw text of a single action request. ``None`` and float NaN (the
        value pandas uses for an empty CSV cell) are treated as an empty
        request; any other non-text value is converted with ``str()``.

    Returns
    -------
    str
        The remaining words joined by single spaces, or ``""`` when no
        words are left.
    """
    # Missing cells carry no text; return early instead of handing a
    # non-string to BeautifulSoup. NaN is the only float unequal to itself.
    if tma is None or (isinstance(tma, float) and tma != tma):
        return ""
    if not isinstance(tma, (str, bytes)):
        tma = str(tma)

    # 1. Remove HTML
    text = BeautifulSoup(tma, "html.parser").get_text()

    # 2. Replace everything that is not an ASCII letter with a space
    letters_only = re.sub("[^a-zA-Z]", " ", text)

    # 3. Convert to lower case and split into individual words
    words = letters_only.lower().split()

    # 4. Build the stop-word set once and cache it on the function object;
    #    this function is called once per request, and re-reading the corpus
    #    on every call is loop-invariant work. A set gives fast membership
    #    tests.
    stops = getattr(maintenance, "_stops", None)
    if stops is None:
        stops = maintenance._stops = frozenset(stopwords.words("english"))

    # 5. Remove stop words
    meaningful_words = [w for w in words if w not in stops]

    # 6. Join the words back into one string separated by spaces
    return " ".join(meaningful_words)

In [6]:
# Number of training requests (one per entry of the "ActionRequested" column)
num_requests = train["ActionRequested"].size

# Clean every request, in row order. Iterating over the column's values
# replaces the per-row train["ActionRequested"][i] lookup, which is
# label-based (it only works while the index is the default 0..n-1 range)
# and is much slower than plain iteration.
clean_train_requests = [maintenance(request) for request in train["ActionRequested"]]

### Creating Features from a Bag of Words

In [7]:
print( "Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# scikit-learn's bag-of-words tool, analysing at word level and capped at
# 5000 features; tokenizer, preprocessor and stop-word list are left at None.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# fit_transform() learns the vocabulary from the cleaned training requests
# (a list of strings) and turns them into feature vectors in one step; the
# result is then converted to a NumPy array.
train_data_features = vectorizer.fit_transform(clean_train_requests).toarray()

Creating the bag of words...



In [10]:
#tdf = pd.DataFrame(train_data_features)
#tdf.to_csv("E:/ASU Energy Leadership Informatics/neptune files/Maintenance Requests/wordvectors.csv")

In [8]:
# Shape of the training feature array
print(train_data_features.shape)

(20676, 5000)


In [10]:
import numpy as np

# Column-wise total of the training feature array, i.e. the summed count of
# each vocabulary word over all training requests
dist = train_data_features.sum(axis=0)


### Random Forest

In [13]:
print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Train one forest per target. fit() returns the estimator it was called on,
# so fitting a single instance twice would leave forest1 and forest2 bound to
# the same object, holding only the model from the second fit (priority).
# Each target therefore gets its own classifier. Both use 100 trees and
# max_features=70.

# Work order type model: bag-of-words features -> "WO" labels
forest1 = RandomForestClassifier(n_estimators = 100, max_features=70).fit(
    train_data_features, train["WO"] )

# Priority model: bag-of-words features -> "WOPriority" labels
forest2 = RandomForestClassifier(n_estimators = 100, max_features=70).fit(
    train_data_features, train["WOPriority"] )

# This cell previously left `forest` bound to the most recently fitted
# estimator (the priority model); keep that name available.
forest = forest2

Training the random forest...


### Making predictions on the test set

In [14]:
# Load the test set of maintenance requests; the first CSV row supplies the
# column names and the file is read as cp1252.
test = pd.read_csv(
    "E:/ASU Energy Leadership Informatics/neptune files/Maintenance Requests/test.csv",
    header=0,
    encoding="cp1252",
    low_memory=False,
)


In [15]:
# Dimensions of the test DataFrame as (rows, columns)
print(test.shape)

(8055, 7)


In [16]:
# Start with an empty list for the cleaned test requests and record how many
# requests the test set contains
clean_test_requests = []
num_requests = len(test["ActionRequested"])


In [17]:
print("Cleaning and parsing the test set maintenance requests...\n")
# Build the cleaned list from scratch, in row order, instead of appending to
# a list created elsewhere: re-running this cell then yields the same list
# rather than duplicating its entries and breaking the one-row-per-request
# alignment with the test set.
clean_test_requests = [maintenance(request) for request in test["ActionRequested"]]

Cleaning and parsing the test set maintenance requests...



In [18]:
# Vectorise the cleaned test requests with the already-fitted vectorizer
# (transform only, no re-fitting) and convert the result to a NumPy array
test_data_features = vectorizer.transform(clean_test_requests).toarray()

In [15]:
##tedf = pd.DataFrame(test_data_features)
##tedf.to_csv("E:/ASU Energy Leadership Informatics/neptune files/Maintenance Requests/wordvectorstest.csv")

In [22]:
#Obtaining predictions by cross-validation
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

In [23]:
# 10-fold cross-validated predictions of "WO" on the training features,
# followed by their accuracy against the true "WO" labels
predicted1 = cross_val_predict(
    forest1,
    train_data_features,
    train["WO"],
    cv=10,
)
metrics.accuracy_score(train["WO"], predicted1)

0.84213580963435863

In [24]:
# 10-fold cross-validated predictions of "WOPriority" on the training
# features, followed by their accuracy against the true "WOPriority" labels
predicted2 = cross_val_predict(
    forest2,
    train_data_features,
    train["WOPriority"],
    cv=10,
)
metrics.accuracy_score(train["WOPriority"], predicted2)

0.80494292899980657

In [25]:
# Run both fitted forests on the test features: result1 holds forest1's
# predictions and result2 holds forest2's
result1, result2 = (
    model.predict(test_data_features) for model in (forest1, forest2)
)

In [30]:
# Pair each test work order id with its predicted and actual value, one
# DataFrame for work order type and one for priority
output1 = pd.DataFrame(
    data={
        "id": test["WorkOrder"],
        "Predicted_WO": result1,
        "Actual_WO": test["WO"],
    }
)
output2 = pd.DataFrame(
    data={
        "id": test["WorkOrder"],
        "Predicted_Priority": result2,
        "Actual_Priority": test["WOPriority"],
    }
)

In [31]:
# Write both prediction tables as comma-separated files without the index
# column. quoting=3 is the value of csv.QUOTE_NONE, so fields are never quoted.
output1.to_csv(
    "E:/ASU Energy Leadership Informatics/neptune files/Maintenance Requests/TestSetPredictions/WOtypes.csv",
    index=False,
    quoting=3,
)
output2.to_csv(
    "E:/ASU Energy Leadership Informatics/neptune files/Maintenance Requests/TestSetPredictions/Prioritytypes.csv",
    index=False,
    quoting=3,
)
