In [7]:
# Import every package the notebook needs: standard library first, then third-party.
import re

import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

# Download the NLTK stop-word corpus (progress is printed to the cell output).
nltk.download('stopwords')

# Load the spaCy pipeline that the preprocessing cells use to tokenise and lemmatise text.
eng = spacy.load('en_core_web_lg')

[nltk_data] Downloading package stopwords to C:\Users\Vikram
[nltk_data]     Jirgale/nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [3]:
# Read the tab-separated restaurant reviews file into a DataFrame
# and preview its first rows.
ds = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')
ds.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


Let's build a corpus (a list of cleaned review texts) that holds every review in a format that can be easily analyzed.

In [8]:
# Build the corpus: one cleaned, lemmatised string per review.

# Load the stop words (words that carry little meaning in an NLP context) once,
# outside the loop, because the list is the same for every review.
stwords = stopwords.words('english')

# Keep the word 'not' out of the stop words: it carries significant meaning
# for sentiment ("not good" vs "good").
stwords.remove("not")

# Set copy for fast membership tests inside the loop.
stop_lookup = set(stwords)

corpus_spacy = []
for raw_review in ds['Review']:
    # Replace every character that is not an ASCII letter with a space.
    # The class must be 'a-zA-Z': the range 'A-z' also spans the characters
    # [ \ ] ^ _ ` that sit between 'Z' and 'a' in ASCII, so they were not removed.
    review = re.sub('[^a-zA-Z]', ' ', raw_review)

    # Lower-case the text so the same word is not counted under different casings.
    review = review.lower()

    # Run the text through the spaCy pipeline to obtain tokens.
    review = eng(review)

    # Drop the stop words and reduce each remaining token to its base form
    # (lemmatisation), e.g. "loved", "love" and "loving" all become "love".
    review = [token.lemma_ for token in review if str(token) not in stop_lookup]
    review = ' '.join(review)
    corpus_spacy.append(review)

# Preview only the first few entries instead of dumping the whole corpus.
print(corpus_spacy[:5])



In [14]:
# Sanity check: the corpus should contain exactly one entry per DataFrame row.
n_rows = len(ds)
n_entries = len(corpus_spacy)

print("Number of rows in panda dataframe= ", n_rows)
print("Number of entries in corpus = ", n_entries)


Number of rows in panda dataframe=  1000
Number of entries in corpus =  1000


Now we are done with the text form of the data. Machines cannot work with words directly, but they can work with data in numerical form.
We will therefore use a vectorizer to convert this textual data into word-count vectors.

The vectorizer assigns each unique word its own column and records how many times that word occurs in each review.


In [15]:
# Turn the cleaned corpus into a dense document-term count matrix.
cv = CountVectorizer()
X = cv.fit_transform(corpus_spacy).toarray()

# Select the sentiment labels by column name rather than by position, matching
# how the 'Review' column is accessed, so a change in column order cannot
# silently pick the wrong column.
y = ds['Liked'].values

The vectorizer learns its vocabulary from our own corpus: every distinct word found in the reviews is given a column index, and each review is then represented by the counts of those words.


Let's train the model to predict the sentiment of the reviews and
compare its predictions with the known labels to calculate the accuracy and precision.

In [18]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [19]:
# Fit a scikit-learn logistic regression model on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
classifier = LogisticRegression().fit(X_train, y_train)

# Predict the held-out reviews.
y_pred = classifier.predict(X_test)

# Compare predictions with the true labels.
cm = confusion_matrix(y_test, y_pred)
score1 = accuracy_score(y_test, y_pred)   # accuracy
score2 = precision_score(y_test, y_pred)  # precision

# Report: accuracy as a percentage, precision as a fraction.
print ("Confusion Matrix:\n", cm)
print("\n")
print("Accuracy is ", round(score1 * 100, 2), "%")
print("Precision is ", round(score2, 2))


Confusion Matrix:
 [[82 14]
 [30 74]]


Accuracy is  78.0 %
Precision is  0.84


We got 78% accuracy, which is pretty good. Let's check the model's predictions on a small set of new reviews.

To see how spaCy helps us with preprocessing, and hence with machine learning,

we will first run the test with unprocessed data and then with processed data.

In [23]:
# Four new sentences, passed to the vectorizer as-is (no spaCy preprocessing).
new_dataset = [
    'Food was really good!',
    'I am loving it',
    'I hate waiting in this restaurant',
    'Food was greasy',
]

# Encode them with the already-fitted vectorizer and predict their sentiment.
new_trans = cv.transform(new_dataset)
classifier.predict(new_trans)

array([1, 0, 0, 0], dtype=int64)

As you can see, the first two statements are positive, but the model got the prediction for the second statement wrong.

Now let's try using spaCy.

In [25]:
# Clean the new sentences: strip non-letters, lower-case, drop stop words, lemmatise.

# Load the stop words once, outside the loop, and keep 'not' out of them
# because it carries meaning for sentiment.
stwords = stopwords.words('english')
stwords.remove("not")

# Set copy for fast membership tests inside the loop.
stop_lookup = set(stwords)

test_spacy = []
for sentence in new_dataset:
    # The class must be 'a-zA-Z': the range 'A-z' also spans the characters
    # [ \ ] ^ _ ` that sit between 'Z' and 'a' in ASCII, so they were not removed.
    review = re.sub('[^a-zA-Z]', ' ', sentence)
    review = review.lower()

    # Tokenise with the spaCy pipeline, then keep the lemma of every non-stop-word token.
    review = eng(review)
    review = [token.lemma_ for token in review if str(token) not in stop_lookup]
    review = ' '.join(review)
    test_spacy.append(review)
print(test_spacy)

['food really good', 'love', 'hate wait restaurant', 'food greasy']


Lets check the results:

In [26]:
# Encode the preprocessed sentences with the already-fitted vectorizer,
# then predict their sentiment with the trained classifier.
new_trans = cv.transform(test_spacy)
classifier.predict(new_trans)

array([1, 1, 0, 0], dtype=int64)

Preprocessing with spaCy gave us the more accurate output.

Conclusion:

To recap, we did the following steps today:

1) Load a sample text dataset which consists of reviews and their sentiment values in the form 1 = positive, 0 = negative
2) Preprocess this dataset with spaCy by converting each review into a Doc object through a spaCy pipeline
3) Use this data as the input for a machine learning model using scikit-learn
4) Test the accuracy and precision of these predictions
5) Test the model with a new, unseen dataset, first without preprocessing and then with preprocessing

spaCy is a very useful package which provides us with industry-grade NLP techniques and tools.
spaCy provides many useful methods, such as tokenization and lemmatization, through its use of pipelines.
Text data preprocessed with spaCy is better data for training and testing an ML model.