In [8]:
import numpy as np
import pandas as pd
import re
!pip install nltk
import nltk
from sklearn.datasets import load_files
# Fetch the NLTK stop-word corpus; stopwords.words('english') is passed to
# CountVectorizer in the bag-of-words cell further down.
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
# Fetch the WordNet corpus; presumably required by the WordNetLemmatizer used
# in the text-cleaning cell (confirm against the NLTK docs).
nltk.download('wordnet')



[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Load the challenge dataset. The path is relative, so the CSV must be in the
# notebook's working directory.
data = pd.read_csv("mindbody_nlp_challenge_data.csv")

In [10]:
# Model input is the `message` column; the prediction target is the `intent` column.
X = data["message"]
y = data["intent"]

In [11]:
from nltk.stem import WordNetLemmatizer

# NOTE: despite its name, `stemmer` is a WordNet lemmatizer, not a stemmer.
# The name is kept because it is a notebook-level variable.
stemmer = WordNetLemmatizer()


def clean_message(text):
    """Normalize one raw message into a lowercase, lemmatized string.

    Parameters
    ----------
    text : object
        Raw message. It is converted with ``str()``, so non-string values
        (e.g. NaN for a missing message) are processed as their string form
        instead of raising.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    # Replace every non-word character (punctuation, symbols) with a space.
    cleaned = re.sub(r'\W', ' ', str(text))

    # Drop single letters standing alone between whitespace.
    cleaned = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned)

    # Drop a single letter at the very start of the text. The anchor must be
    # an unescaped '^': the previous pattern r'\^...' looked for a literal
    # caret, which the \W substitution above has already removed, so this
    # step never matched anything. This also covers the old "prefixed 'b'"
    # removal, which could no longer match after this step and was dropped.
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)

    # Collapse runs of whitespace into a single space.
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # Lowercase, tokenize on whitespace (which also discards any leading or
    # trailing space), and lemmatize each token.
    tokens = cleaned.lower().split()
    return ' '.join(stemmer.lemmatize(token) for token in tokens)


# Iterate over the values positionally rather than via X[i]: label-based
# lookup raises KeyError (or picks the wrong row) when the Series index is
# not the default 0..n-1 range, e.g. after filtering rows.
documents = [clean_message(message) for message in X]

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: represent each cleaned document as a vector of per-term counts.
# NOTE(review): the vocabulary is fitted on all documents, before the
# train/test split performed in a later cell.
english_stop_words = stopwords.words('english')
vectorizer = CountVectorizer(
    max_features=1500,  # keep only the 1500 most frequent terms
    min_df=5,           # ignore terms that appear in fewer than 5 documents
    max_df=0.7,         # ignore terms that appear in more than 70% of documents
    stop_words=english_stop_words,
)
# Densify the count matrix and store it in X, replacing the raw message Series.
X = vectorizer.fit_transform(documents).toarray()

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

# TF-IDF (term frequency-inverse document frequency) re-weights the raw counts
# so that a term's weight reflects how important it is to a document relative
# to the whole corpus.
# NOTE: this cell overwrites X with a transform of itself, so it is not
# idempotent; re-run the count-vectorizer cell before re-running this one.
tfidfconverter = TfidfTransformer()
tfidf_matrix = tfidfconverter.fit_transform(X)
X = tfidf_matrix.toarray()

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold   #For K-fold cross validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics

In [16]:
# Random forest of 10 trees with a fixed seed for reproducibility.
classifier = RandomForestClassifier(
    n_estimators=10,
    random_state=0,
)
# Left as a bare final expression so the cell still displays the fitted estimator.
classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=10, random_state=0)

In [17]:
# Predict an intent label for every sample in the held-out test split.
y_pred = classifier.predict(X_test)

In [18]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Print, in order, the confusion matrix, the per-class precision/recall/F1
# report, and the overall accuracy on the held-out test split.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

[[3602    4    2    5  177    4   12   40   42   96   21    0]
 [   9    5    0    0    8    0    0    1    1    2    8    0]
 [  14    1  662    0   34    0    0    2    0   20   21    1]
 [   5    0    1  239   40    0    0    1    3    6    0    2]
 [ 171    2   21   33 2778   56  116   50   96  285   25    8]
 [  13    0    1    0   46  150    1    3    3    5    2    0]
 [   6    0    0    1  107    0 1600    2    4    9    1    0]
 [  92    0    3    0   85    2    1  136    1   18    8    1]
 [  58    1    1    0  145    1    8    0  768   26    1    1]
 [ 108    3   22   16  531    2   13   14   25 1019   14   17]
 [  44    3   26    0   52    0    1   10    1   29  915    2]
 [   1    0    3    1    7    0    2    0    0   23    3  255]]
                         precision    recall  f1-score   support

                   book       0.87      0.90      0.89      4005
     book_previous_appt       0.26      0.15      0.19        34
            cancel_appt       0.89      0.88   

In [19]:
#Random forest was chosen for the following reasons:
#- Random forests work well with high-dimensional data.
#- They are parallelizable, meaning that the process can be split across multiple machines.
#- They can handle binary, categorical, and numerical features.
#- Very little pre-processing is needed: the data does not need to be rescaled or transformed.

In [22]:
#- Because the process can be split across multiple machines, computation time is reduced.
#- Each decision tree has high variance but low bias. Because a random forest averages over all of its trees,
#  the variance is averaged as well, which yields a model with low bias and moderate variance.
#- Prediction is significantly faster than training because the generated forest can be saved for future use.