In [1]:
import spacy
import re

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = 'I always uh do the main um processing, I mean, the uh um data-processing.'

In [4]:
stats = nlp(doc)

In [5]:
for token in stats:
    print(token.text)

I
always
uh
do
the
main
um
processing
,
I
mean
,
the
uh
um
data
-
processing
.


In [6]:
doc2 = 'U.K. has a reasonable population'
stats = nlp(doc2)
for token in stats:
    print(token.text)

U.K.
has
a
reasonable
population


In [7]:
for token in re.split('\W+',doc):
    print(token)

I
always
uh
do
the
main
um
processing
I
mean
the
uh
um
data
processing



In [8]:
for token in re.split('\W+',doc2):
    print(token)

U
K
has
a
reasonable
population


In [9]:
L = list(nlp.vocab.strings)

In [10]:
len(L)

83431

In [11]:
L[50000]

'Barnabas'

In [12]:
L[60000]

'lathe'

In [13]:
L

['\t',
 'en',
 '\n',
 ' ',
 "'",
 "''",
 '"',
 "'Cause",
 'because',
 "'cause",
 'use',
 "'Xxxxx",
 'Cause',
 'cause',
 'C',
 'Xxxxx',
 "'Cos",
 "'cos",
 'Cos',
 "'Xxx",
 'cos',
 'Xxx',
 "'Coz",
 "'coz",
 'Coz',
 'coz',
 "'Cuz",
 "'cuz",
 'Cuz',
 'cuz',
 "'S",
 "'s",
 "'X",
 'S',
 's',
 "'bout",
 'about',
 'out',
 "'xxxx",
 'bout',
 'b',
 'xxxx',
 'c',
 "'xxx",
 'xxx',
 "'d",
 "'x",
 'd',
 'x',
 "'em",
 'them',
 "'xx",
 'em',
 'e',
 'xx',
 "'ll",
 'will',
 'll',
 'l',
 "'nuff",
 'enough',
 'uff',
 'nuff',
 'n',
 "'re",
 'are',
 're',
 'r',
 '(*_*)',
 '(',
 '_*)',
 ')',
 '*',
 '(-8',
 '(-d',
 '-8',
 '-',
 '-d',
 '(-:',
 ':',
 '(-;',
 ';',
 '(-_-)',
 '_-)',
 '-_-',
 '(._.)',
 '_.)',
 '.',
 '(:',
 '(;',
 '(=',
 '=',
 '(>_<)',
 '_<)',
 '>',
 '<',
 '(^_^)',
 '_^)',
 '^_^',
 '^',
 '(o:',
 '(x:',
 'o',
 '(¬_¬)',
 '_¬)',
 '¬_¬',
 '¬',
 '(ಠ_ಠ)',
 '_ಠ)',
 '(x_x)',
 'ಠ_ಠ',
 'ಠ',
 'x_x',
 '(╯°□°）╯︵┻━┻',
 '┻━┻',
 '┻',
 '╯',
 '━',
 '°',
 '□',
 '）',
 '︵',
 ')-:',
 '):',
 '-__-',
 '__-',
 '._.',
 '0.0

In [14]:
import numpy as np
import pandas as pd
from collections import Counter
import re

In [15]:
df = pd.read_csv('reviews.csv')

In [16]:
df.head()

Unnamed: 0,rating,review
0,negative,terrible place to work for i just heard a stor...
1,negative,"hours , minutes total time for an extremely s..."
2,negative,my less than stellar review is for service . w...
3,negative,i m granting one star because there s no way t...
4,negative,the food here is mediocre at best . i went aft...


In [17]:
df.tail()

Unnamed: 0,rating,review
55995,positive,"great food . wonderful , friendly service . i ..."
55996,positive,charlotte should be the new standard for moder...
55997,positive,get the encore sandwich ! ! make sure to get i...
55998,positive,i m a pretty big ice cream gelato fan . pretty...
55999,positive,where else can you find all the parts and piec...


In [18]:
vocab = {}

In [19]:
def initializeVocabulary():
    unkToken = '<UNK>'
    vocab['t_2_i'] = {}
    vocab['i_2_t'] = {}
    vocab['unkToken'] = unkToken
    idx = addToken(unkToken)
    vocab['unkTokenIdx'] = idx

In [20]:
def addToken(token):
    if token in vocab['t_2_i']:
        idx = vocab['t_2_i'][token]
    else:
        idx = len(vocab['t_2_i'])
        vocab['t_2_i'][token] = idx
        vocab['i_2_t'][idx] = token
    return idx

In [21]:
def addManyTokens(tokens):
    idxes = [addToken(token) for token in tokens]
    return idxes

In [22]:
def lookUpToken(token):
    if vocab['unkTokenIdx']>=0:
        return vocab['t_2_i'].get(token,vocab['unkTokenIdx'])
    else:
        return vocab['t_2_i'][token]

In [23]:
def lookUpIndex(idx):
    if idx not in vocab['i_2_t']:
        raise KeyError("the index (%d) is not there" % idx)
    return vocab['i_2_t'][idx]

In [24]:
def vocabularyFromDataFrame(df,cutoff=25):
    initializeVocabulary()
    wordCounts = Counter()
    for r in df.review:
        for word in re.split('\W+',r):
            wordCounts[word] += 1
    for word,count in wordCounts.items():
        if count > cutoff:
            addToken(word)

In [25]:
def vocabularyFromCorpus(Corpus,cutoff=25):
    initializeVocabulary()
    wordCounts = Counter()
    for doc in Corpus:
        for word in re.split('\W+',doc):
            wordCounts[word] += 1
    for word,count in wordCounts.items():
        if count > cutoff:
            addToken(word)

In [26]:
df = pd.read_csv('reviews.csv')

In [27]:
#vocabularyFromDataFrame(df)
Corpus = np.asarray(df['review'])
vocabularyFromCorpus(Corpus)

In [28]:
lookUpToken('the')

38

In [29]:
lookUpIndex(38)

'the'

In [30]:
len(vocab['t_2_i'])

8946

In [31]:
def oneHotVector(token,N):
    oneHot = np.zeros((N,1))
    oneHot[lookUpToken(token)] = 1
    return oneHot

In [32]:
N = len(vocab['t_2_i'])
token = 'the'
oneHot = oneHotVector(token,N)

In [33]:
oneHot[38]

array([1.])

In [34]:
def computeFeatures(doc,N):
    isFirst = True
    for token in doc:
        oneHot = oneHotVector(token,N)
        if isFirst:
            xF = oneHot
            isFirst = False
        else:
            xF = np.hstack((xF,oneHot))
    return np.mean(xF,axis=1)[:,np.newaxis]

In [68]:
def computeFeatures_fast(doc,N):
    fv = np.zeros(N)
    numTokens = 0
    for token in doc:
        fv[lookUpToken(token)] += 1
        numTokens += 1
    return fv/numTokens

In [36]:
def corpusToFeatureMatrix(Corpus,N):
    isFirst = True
    for doc in Corpus:
        fv = computeFeatures(doc,N)
        if isFirst:
            fM = fv
            isFirst = False
        else:
            fM = np.hstack((fM,fv))
    return fM.T

In [37]:
def corpusToFeatureMatrix_fast(Corpus,N):
    fM = np.zeros((N,len(Corpus)))
    i = 0
    for doc in Corpus:
        fM[:,i] = computeFeatures_fast(doc,N)
        i+=1
    return fM.T

In [38]:
%timeit fv = computeFeatures_fast(Corpus[0],len(vocab['t_2_i']))

682 µs ± 56.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [39]:
%timeit fv = computeFeatures(Corpus[0],len(vocab['t_2_i']))

11.2 s ± 2.45 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [57]:
df = pd.read_csv('reviews.csv')
X = np.asarray(df['review'])
y = np.asarray(df['rating'])

In [58]:
from sklearn.model_selection import train_test_split

In [59]:
Xtrain,Xtest,ytrain,ytest = train_test_split(X,y,test_size=0.3,shuffle=True)

In [60]:
vocabularyFromCorpus(Xtrain)

In [61]:
N = len(vocab['t_2_i'])
Xtrain_fM = corpusToFeatureMatrix_fast(Xtrain,N)
Xtest_fM = corpusToFeatureMatrix_fast(Xtest,N)

In [62]:
Xtrain_fM.shape

(39200, 7304)

In [63]:
Xtest_fM.shape

(16800, 7304)

In [73]:
#from sklearn.linear_model import LogisticRegression as clf
#from sklearn.naive_bayes import GaussianNB as clf
#from sklearn.ensemble import RandomForestClassifier as clf
from sklearn.svm import SVC as clf
from sklearn.metrics import confusion_matrix
import seaborn as sns
from matplotlib import pyplot as plt
sns.set()

In [None]:
M = clf().fit(Xtrain_fM,ytrain)

In [None]:
y_pred = M.predict(Xtest_fM)

In [None]:
mat = confusion_matrix(ytest,y_pred)
sns.heatmap(mat.T,square=True,annot=True,fmt='d',cbar=False,
           xticklabels=np.unique(y),yticklabels=np.unique(y))
plt.xlabel("True Label")
plt.ylabel("Predicted Label")

In [51]:
Xtrain.shape

(39200,)