### import data and packages

In [1]:

import numpy as np
import pandas as pd
import nltk

In [2]:
data=pd.read_excel('D:/BankReviews.xlsx')

In [3]:
data

Unnamed: 0,Date,Stars,Reviews,BankName
0,2017-04-10,5,"Great job, Wyndham Capital! Each person was pr...",Wyndham Capital Mortgage
1,2017-02-10,5,Matthew Richardson is professional and helpful...,Wyndham Capital Mortgage
2,2017-08-21,5,We had a past experience with Wyndham Mortgage...,Wyndham Capital Mortgage
3,2017-12-17,5,We have been dealing with Brad Thomka from the...,Wyndham Capital Mortgage
4,2016-05-27,5,I can't express how grateful I am for the supp...,Wyndham Capital Mortgage
...,...,...,...,...
500,2016-02-06,1,\r\nI never write reviews but had to this time...,North American Savings Bank
501,2016-07-25,1,\r\nIt all started when Bob G ran a credit che...,North American Savings Bank
502,2017-09-27,1,\r\nWhat a horrible experience. We have excell...,North American Savings Bank
503,2017-12-24,1,"\r\nRep was extremely professional, friendly, ...",North American Savings Bank


### 1.Data Processing

In [4]:
corpus = data['Reviews']


In [5]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [6]:
# get all words in the corpus also remove punctuations
from nltk import RegexpTokenizer
tokenizer = nltk.RegexpTokenizer(r"\w+").tokenize
all_words = []
for review in corpus:
    words = tokenizer(review)
    all_words = all_words+words

In [7]:
# remove stop words
from nltk.corpus import stopwords 
stop = set(stopwords.words("english"))
all_words = [w for w in all_words if w not in stop]
all_words

['Great',
 'job',
 'Wyndham',
 'Capital',
 'Each',
 'person',
 'professional',
 'helped',
 'us',
 'move',
 'refinance',
 'process',
 'smoothly',
 'Thank',
 'Matthew',
 'Richardson',
 'professional',
 'helpful',
 'He',
 'helped',
 'us',
 'find',
 'correct',
 'product',
 'mortgage',
 'Thank',
 'much',
 'excellent',
 'service',
 'Matthew',
 'We',
 'past',
 'experience',
 'Wyndham',
 'Mortgage',
 'would',
 'without',
 'question',
 'use',
 'needed',
 'Wyndham',
 'went',
 'beyond',
 'extra',
 'mile',
 'right',
 'wrong',
 'encountered',
 'servicer',
 'dealing',
 'previous',
 'loan',
 'pulled',
 'together',
 'found',
 'viable',
 'option',
 'us',
 'ultimately',
 'saved',
 'us',
 'money',
 'We',
 'would',
 'highly',
 'recommend',
 'Brad',
 'Thomka',
 'Wyndham',
 'Capital',
 'Mortgage',
 'team',
 'mortgage',
 'needs',
 'Sincerest',
 'thanks',
 'Wyndham',
 'Ed',
 'Lind',
 'We',
 'dealing',
 'Brad',
 'Thomka',
 'beginning',
 'started',
 'stressful',
 'time',
 'us',
 'help',
 'Brad',
 'entire',
 'Wy

In [8]:
# lemmatize the words
from nltk.stem import WordNetLemmatizer 
lmtzr = WordNetLemmatizer() 


all_words_lmtzr = []

for word in all_words:
    all_words_lmtzr.append(lmtzr.lemmatize(word))

In [9]:
len(all_words_lmtzr)

22071

In [10]:
# get frequency count for all words
from collections import Counter
df_freq = pd.DataFrame({"Words": list(Counter(all_words).keys()), "Counts": list(Counter(all_words).values())})


In [11]:
df_freq.head()

Unnamed: 0,Words,Counts
0,Great,31
1,job,25
2,Wyndham,16
3,Capital,11
4,Each,2


### 2.Key Positive/Negative words 

In [12]:
import nltk.sentiment.vader as senti


In [13]:
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\prash\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [14]:
list_senti=[]
for i in df_freq.Words:
    list_senti.append(sid.polarity_scores(i))

In [15]:
list_senti=pd.DataFrame(list_senti)

In [16]:
list_senti
final_senti=pd.concat((df_freq,list_senti),axis=1)

In [17]:
final_senti

Unnamed: 0,Words,Counts,neg,neu,pos,compound
0,Great,31,0.0,0.0,1.0,0.6249
1,job,25,0.0,1.0,0.0,0.0000
2,Wyndham,16,0.0,1.0,0.0,0.0000
3,Capital,11,0.0,1.0,0.0,0.0000
4,Each,2,0.0,1.0,0.0,0.0000
...,...,...,...,...,...,...
2895,grandson,1,0.0,1.0,0.0,0.0000
2896,snail,1,0.0,1.0,0.0,0.0000
2897,kindest,1,0.0,1.0,0.0,0.0000
2898,empathetic,1,0.0,0.0,1.0,0.4019


#### Most frequent positive words

In [18]:
final_senti.sort_values(by=['pos','Counts'],ascending=[False,False])

Unnamed: 0,Words,Counts,neg,neu,pos,compound
54,recommend,152,0.0,0.0,1.0,0.3612
273,great,108,0.0,0.0,1.0,0.6249
245,best,88,0.0,0.0,1.0,0.6369
148,easy,63,0.0,0.0,1.0,0.4404
16,helpful,56,0.0,0.0,1.0,0.4215
...,...,...,...,...,...,...
2894,illiterate,1,0.0,1.0,0.0,0.0000
2895,grandson,1,0.0,1.0,0.0,0.0000
2896,snail,1,0.0,1.0,0.0,0.0000
2897,kindest,1,0.0,1.0,0.0,0.0000


#### Most frequent negative words

In [19]:
final_senti.sort_values(by=['neg','Counts'],ascending=[False,False])

Unnamed: 0,Words,Counts,neg,neu,pos,compound
639,hard,42,1.0,0.0,0.0,-0.1027
1220,pay,25,1.0,0.0,0.0,-0.1027
65,stressful,24,1.0,0.0,0.0,-0.5106
423,lower,23,1.0,0.0,0.0,-0.2960
750,problem,18,1.0,0.0,0.0,-0.4019
...,...,...,...,...,...,...
2895,grandson,1,0.0,1.0,0.0,0.0000
2896,snail,1,0.0,1.0,0.0,0.0000
2897,kindest,1,0.0,1.0,0.0,0.0000
2898,empathetic,1,0.0,0.0,1.0,0.4019


## 3 Clssification of Reviews

In [20]:
corpus

0      Great job, Wyndham Capital! Each person was pr...
1      Matthew Richardson is professional and helpful...
2      We had a past experience with Wyndham Mortgage...
3      We have been dealing with Brad Thomka from the...
4      I can't express how grateful I am for the supp...
                             ...                        
500    \r\nI never write reviews but had to this time...
501    \r\nIt all started when Bob G ran a credit che...
502    \r\nWhat a horrible experience. We have excell...
503    \r\nRep was extremely professional, friendly, ...
504    \r\nI was working with a loan consultant from ...
Name: Reviews, Length: 505, dtype: object

In [21]:
from nltk import RegexpTokenizer

In [22]:
from nltk import RegexpTokenizer
tokenizer = nltk.RegexpTokenizer(r"\w+").tokenize
all_words = []
for review in corpus:
    sentence = []
    sentence.append(tokenizer(review))
    all_words.append(sentence)

In [23]:
from nltk.stem import WordNetLemmatizer

In [24]:
wc=WordNetLemmatizer()

In [25]:
for i in range(len(corpus)):
    words=nltk.word_tokenize(corpus[i])
    words=[wc.lemmatize(word) for word in words]
    corpus[i]=' '.join(words)
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [26]:
corpus

0      Great job , Wyndham Capital ! Each person wa p...
1      Matthew Richardson is professional and helpful...
2      We had a past experience with Wyndham Mortgage...
3      We have been dealing with Brad Thomka from the...
4      I ca n't express how grateful I am for the sup...
                             ...                        
500    I never write review but had to this time to p...
501    It all started when Bob G ran a credit check w...
502    What a horrible experience . We have excellent...
503    Rep wa extremely professional , friendly , and...
504    I wa working with a loan consultant from NASB ...
Name: Reviews, Length: 505, dtype: object

In [27]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()


In [28]:
corpus_senti=[]
for i in corpus:
    corpus_senti.append(sid.polarity_scores(i))

In [29]:
pd.DataFrame(corpus_senti)

Unnamed: 0,neg,neu,pos,compound
0,0.000,0.690,0.310,0.8011
1,0.000,0.693,0.307,0.8516
2,0.000,0.789,0.211,0.9595
3,0.019,0.750,0.231,0.9818
4,0.040,0.863,0.097,0.5569
...,...,...,...,...
500,0.086,0.801,0.113,0.9350
501,0.036,0.905,0.059,0.4065
502,0.161,0.722,0.116,-0.7970
503,0.012,0.846,0.142,0.9805


### 4.Identifying key themes

### By Topic modelling using LDA

In [30]:
import gensim

In [31]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prash\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
stemmer=SnowballStemmer('english')

In [33]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [34]:
processed_docs = data['Reviews'].map(preprocess)
processed_docs[:10]

0    [great, wyndham, capit, person, profession, he...
1    [matthew, richardson, profession, help, help, ...
2    [past, experi, wyndham, mortgag, question, nee...
3    [deal, brad, thomka, begin, start, stress, tim...
4    [express, grate, support, zach, provid, famili...
5    [pleasur, work, wyndham, capit, septemb, famil...
6    [experi, mattison, great, profession, care, cl...
7    [patrick, answer, question, email, immedi, spe...
8    [love, work, group, peopl, laugh, phone, answe...
9    [great, interfac, loan, applic, document, uplo...
Name: Reviews, dtype: object

In [35]:
doc_sample = data.Reviews[100]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))


original document: 
['Lisa', 'Podorson', 'ha', 'set', 'my', 'closing', 'date', 'three', 'different', 'date', ',', 'which', 'she', 'ha', 'reneged', 'on', 'since', 'August', '2017', '.', 'Ms.', 'Podorson', 'ha', 'continued', 'to', 'asked', 'for', 'document', 'at', 'the', 'last', 'minuet', '.', 'She', 'doe', 'not', 'return', 'text', 'or', 'phone', 'call', 'for', 'an', 'entire', 'week', '.', 'She', 'did', 'not', 'return', 'my', 'call', 'until', 'I', 'decided', 'to', 'cancel', 'the', 'refinancing', 'and', 'Contact', 'the', 'BBB', '.', 'She', 'ha', 'been', 'very', 'unprofessional', 'during', 'this', 'process', '.', 'This', 'experience', 'wa', 'very', 'unpleasant', '...', 'Read', 'More']


 tokenized and lemmatized document: 
['lisa', 'podorson', 'close', 'date', 'differ', 'date', 'reneg', 'august', 'podorson', 'continu', 'ask', 'document', 'minuet', 'return', 'text', 'phone', 'entir', 'week', 'return', 'decid', 'cancel', 'refin', 'contact', 'unprofession', 'process', 'experi', 'unpleas', 're

### Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set.

In [36]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 capit
1 great
2 help
3 person
4 process
5 profession
6 refin
7 smooth
8 thank
9 wyndham
10 correct


# Filtering out tokens that appear in
## *less than 15 documents (absolute number) or
## *more than 0.5 documents (fraction of total corpus size, not absolute number)

In [37]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100)

### For each document we create a dictionary reporting how many words and how many times those words appear

In [38]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[50]

[(1, 2),
 (2, 1),
 (3, 1),
 (13, 1),
 (16, 1),
 (22, 1),
 (47, 1),
 (56, 1),
 (67, 1),
 (78, 1)]

In [39]:
#Preview
bow_doc_100 = bow_corpus[100]
for i in range(len(bow_doc_100)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_100[i][0], 
                                               dictionary[bow_doc_100[i][0]], 
bow_doc_100[i][1]))

Word 3 ("process") appears 1 time.
Word 5 ("refin") appears 1 time.
Word 12 ("experi") appears 1 time.
Word 21 ("entir") appears 1 time.
Word 35 ("close") appears 1 time.
Word 46 ("week") appears 1 time.
Word 51 ("phone") appears 1 time.
Word 55 ("document") appears 1 time.
Word 60 ("differ") appears 1 time.
Word 79 ("ask") appears 1 time.
Word 95 ("contact") appears 1 time.
Word 98 ("read") appears 1 time.


# Create tf-idf model object using models.TfidfModel

In [40]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.2799108301911471),
 (1, 0.2766749960867732),
 (2, 0.5031979284661511),
 (3, 0.18026193319570374),
 (4, 0.34306630631061447),
 (5, 0.3476505132900647),
 (6, 0.4834341708480756),
 (7, 0.2951853425392064)]


In [41]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [42]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.045*"work" + 0.041*"loan" + 0.037*"bank" + 0.034*"compani" + 0.032*"process" + 0.030*"time" + 0.030*"close" + 0.029*"friend" + 0.025*"help" + 0.025*"go"
Topic: 1 
Words: 0.053*"work" + 0.049*"loan" + 0.045*"process" + 0.038*"team" + 0.035*"profession" + 0.033*"time" + 0.031*"mortgag" + 0.028*"respons" + 0.027*"question" + 0.026*"great"
Topic: 2 
Words: 0.055*"recommend" + 0.052*"rate" + 0.045*"best" + 0.044*"mortgag" + 0.041*"experi" + 0.040*"process" + 0.035*"servic" + 0.030*"home" + 0.029*"compani" + 0.027*"offer"
Topic: 3 
Words: 0.064*"close" + 0.059*"time" + 0.052*"great" + 0.044*"home" + 0.040*"servic" + 0.036*"process" + 0.032*"team" + 0.028*"loan" + 0.028*"read" + 0.027*"nasb"
Topic: 4 
Words: 0.051*"help" + 0.051*"close" + 0.039*"process" + 0.035*"email" + 0.032*"loan" + 0.029*"week" + 0.029*"recommend" + 0.028*"read" + 0.027*"time" + 0.024*"work"
Topic: 5 
Words: 0.050*"close" + 0.048*"loan" + 0.036*"send" + 0.036*"lender" + 0.030*"rate" + 0.029*"servic" + 

In [43]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))


Topic: 0 Word: 0.044*"help" + 0.030*"friend" + 0.028*"thank" + 0.027*"possibl" + 0.027*"abl" + 0.026*"lender" + 0.025*"best" + 0.025*"great" + 0.025*"offer" + 0.023*"home"
Topic: 1 Word: 0.036*"respons" + 0.029*"close" + 0.027*"team" + 0.025*"time" + 0.024*"smooth" + 0.024*"question" + 0.024*"lender" + 0.023*"definit" + 0.023*"profession" + 0.021*"work"
Topic: 2 Word: 0.053*"best" + 0.053*"good" + 0.043*"deal" + 0.040*"hard" + 0.036*"work" + 0.033*"offic" + 0.031*"respons" + 0.028*"loan" + 0.020*"lender" + 0.020*"receiv"
Topic: 3 Word: 0.025*"work" + 0.025*"loan" + 0.024*"answer" + 0.022*"high" + 0.022*"contact" + 0.022*"inform" + 0.021*"process" + 0.020*"email" + 0.020*"patient" + 0.020*"hous"
Topic: 4 Word: 0.029*"pleasur" + 0.029*"communic" + 0.027*"profession" + 0.027*"document" + 0.024*"team" + 0.022*"excel" + 0.022*"question" + 0.022*"refin" + 0.021*"loan" + 0.020*"keep"
Topic: 5 Word: 0.042*"person" + 0.036*"loan" + 0.034*"team" + 0.032*"refin" + 0.029*"nasb" + 0.028*"profession

## Clustering

In [44]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [46]:
tf=TfidfVectorizer(stop_words='english')

In [47]:
tf.fit(data.Reviews.values)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [48]:
features=tf.transform(data.Reviews.values)

In [49]:
cls=KMeans(n_clusters=4,random_state=1234)

In [50]:
cls.fit(features)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=1234, tol=0.0001, verbose=0)

In [51]:
len(cls.labels_)

505

## 5.Predicting Star ratings

In [52]:
from sklearn.model_selection import train_test_split

In [53]:
train_x,test_x,train_y,test_y=train_test_split(data.Reviews,data.Stars,test_size=0.3,random_state=1234)

In [54]:
tf.fit(data.Reviews.values)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [55]:
Train_X_Tfidf = tf.transform(train_x)
Test_X_Tfidf = tf.transform(test_x)

### Using Naive Bayes 

In [56]:
from sklearn.naive_bayes import MultinomialNB

In [57]:
Naive = MultinomialNB()


In [58]:
Naive.fit(Train_X_Tfidf,train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [59]:
pred_y=Naive.predict(Test_X_Tfidf)

In [60]:
from sklearn import metrics

In [61]:
metrics.accuracy_score(pred_y,test_y)

0.8157894736842105

### Using SVM

In [62]:
from sklearn.svm import SVC

In [63]:
SVM = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,train_y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [64]:
SVM.predict(Test_X_Tfidf)

array([5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 1, 1, 5, 1, 5, 5, 5, 5, 5, 5,
       5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 1, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 1, 1, 5, 5, 5, 5, 5,
       5, 5, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5,
       1, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 5, 1, 5, 1, 5, 5, 1, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 5, 5, 5, 5, 5, 5],
      dtype=int64)

In [65]:
metrics.accuracy_score(SVM.predict(Test_X_Tfidf),test_y)

0.9210526315789473

### 6.Intent Analysis 

-- Intent Analysis and Topic modelling are same and topic modelling is done already in part 4

In [66]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [67]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.082*"loan" + 0.054*"home" + 0.052*"great" + 0.050*"close" + 0.050*"nasb" + 0.044*"know" + 0.039*"team" + 0.031*"help" + 0.031*"time" + 0.030*"read"


Topic: 1 
Words: 0.053*"question" + 0.046*"process" + 0.045*"recommend" + 0.036*"answer" + 0.034*"help" + 0.032*"work" + 0.032*"rate" + 0.030*"home" + 0.025*"time" + 0.024*"experi"


Topic: 2 
Words: 0.064*"email" + 0.040*"thank" + 0.037*"lender" + 0.036*"receiv" + 0.032*"respons" + 0.032*"rate" + 0.026*"say" + 0.026*"home" + 0.026*"phone" + 0.025*"custom"


Topic: 3 
Words: 0.087*"work" + 0.053*"process" + 0.029*"refin" + 0.028*"great" + 0.028*"time" + 0.028*"read" + 0.026*"loan" + 0.025*"close" + 0.023*"recommend" + 0.022*"respons"


Topic: 4 
Words: 0.053*"close" + 0.052*"process" + 0.051*"loan" + 0.038*"work" + 0.037*"servic" + 0.037*"recommend" + 0.030*"week" + 0.029*"home" + 0.028*"help" + 0.027*"experi"


Topic: 5 
Words: 0.048*"bank" + 0.040*"work" + 0.040*"time" + 0.035*"go" + 0.034*"close" + 0.034*"document" +