In [2]:
import pandas as pd
import numpy as np

In [3]:
np.random.seed(0)
data = pd.read_excel('CSAT_final_data_for_category.xlsx')
data.columns = ['label', 'comment_text']
data['label']=data['label'].str.lower()
data['comment_text']=data['comment_text'].str.lower()
data.dropna(inplace=True)

In [4]:
data.head(3)

Unnamed: 0,label,comment_text
0,account,your site told me my address was an invalid ad...
1,checkout,it told me my address is not valid.guess you d...
2,business ops / customer service & stores,it's the most poorly designed website i have e...


In [5]:
data.label.value_counts()

other                                       3048
business ops / customer service & stores    2227
search & navigation                         1446
browse                                      1398
checkout                                     969
inventory                                    966
post purchase                                631
account                                      577
assortment                                   147
coupons & rewards                             80
other                                         38
positive promoters                            13
pricing                                        2
Name: label, dtype: int64

In [6]:
## Since coupons , others, promotors and pricing are very few in count , we will merge it into other category
data['label'] = np.where(data.label == 'coupons & rewards','other', data.label)

In [7]:
data['label'] = np.where(data.label == 'positive promoters','other', data.label)
data['label'] = np.where(data.label == 'pricing','other', data.label)
data['label'] = np.where(data.label == 'other ','other', data.label)

In [8]:
data.label.value_counts()

other                                       3181
business ops / customer service & stores    2227
search & navigation                         1446
browse                                      1398
checkout                                     969
inventory                                    966
post purchase                                631
account                                      577
assortment                                   147
Name: label, dtype: int64

In [12]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re

In [13]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text

In [14]:
# apply the preprocess function to all reviews
data['comment_text'] = data['comment_text'].apply(preprocessor)

In [11]:
data.head(3)

Unnamed: 0,label,comment_text
0,account,your site told me my address was an invalid ad...
1,checkout,it told me my address is not valid guess you d...
2,business ops / customer service & stores,it s the most poorly designed website i have e...


In [15]:
#import and instantiate Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vect=CountVectorizer()

In [16]:
#Defining X and y variables
X=data.comment_text
y=data.label
print (X.shape)
print (y.shape)

(11542,)
(11542,)


In [17]:
#Splitting X, y into training and test  datasets
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=1)

In [18]:
#Learning the vocab of entire data
vect.fit(X)
#transforming the vocab into Document term matrix
X_train_dtm=vect.transform(X_train)

In [19]:
#Examine the Document Term Matrix
X_train_dtm
X_train_dtm.shape

(8656, 13149)

In [20]:
#Transforming and creating DTM of test data
X_test_dtm=vect.transform(X_test)
X_test_dtm.shape

(2886, 13149)

In [21]:
#Importing and instantiating the NB classifier
from sklearn.naive_bayes import MultinomialNB
nb=MultinomialNB()

In [22]:
#training the model using train data DTM
%time nb.fit(X_train_dtm,y_train)

CPU times: user 40 ms, sys: 0 ns, total: 40 ms
Wall time: 38.5 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
#making predictions on test data
y_pred_class=nb.predict(X_test_dtm)

In [24]:
#Examine accuracy of predictions 
from sklearn.metrics import confusion_matrix,classification_report
print (confusion_matrix(y_test,y_pred_class))


[[ 25   0   3  17   6   2  86   3   4]
 [  0   1   3  13   1   1  13   0   5]
 [  1   0 195  29  10   5  96   2  16]
 [  3   0  18 339  10  13 138   6  55]
 [  2   1  20  30  65   4 118   1  14]
 [  0   0  10  38   1 115  55   0  21]
 [  7   3  62 164  20  24 418  14  70]
 [  0   0   1  37   4   3  77  31   3]
 [  2   0  23  57   2   7  90   1 152]]


In [25]:
print (classification_report(y_test,y_pred_class))

                                          precision    recall  f1-score   support

                                 account       0.62      0.17      0.27       146
                              assortment       0.20      0.03      0.05        37
                                 browse        0.58      0.55      0.57       354
business ops / customer service & stores       0.47      0.58      0.52       582
                                checkout       0.55      0.25      0.35       255
                               inventory       0.66      0.48      0.56       240
                                   other       0.38      0.53      0.45       782
                           post purchase       0.53      0.20      0.29       156
                     search & navigation       0.45      0.46      0.45       334

                             avg / total       0.49      0.46      0.45      2886

