In [1]:
#Let's start by importing all the necessary libraries we'll need for our model

import json as j
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

In [3]:
#Now that we have the libraries imported to our file, can start by reading data from the Yelp Review into an array

json_data = None
with open ('yelp_academic_dataset_review.json') as data_file:
    lines = data_file.readlines()
    joined_lines = "[" + ",".join(lines) + "]"
   
    json_data = j.loads(joined_lines)
    
#Put our data into a dataframe using pandas

json_data = pd.DataFrame(json_data)
    
    

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 4694: character maps to <undefined>

In [None]:
#We now have our data in a dataframe, but we should probably clean it first so we can ensure we're using useful data only

stemmer = SnowballStemmer('english') #Stems the words down to their root meaning
words = stopwords.word('english')    #Deletes words that are not commonly used throughout our data set

data['cleaned'] = data['text'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", "", x).split() if i not in words]).lower()

#Now that we've cleaned our data, let's assign it to our testing and training sets, along with their labels
X_train, X_test, y_train, y_test - train_test_split(data['cleaned'], data.stars, test_size = 0.2)

#data.stars is our target vector and data['cleaned'] is our features

In [None]:
#Let's now take our words and tokenize/vectorize them to make them usable by our model

pipeline = Pipeline([('vect', TfidfVectorizer(ngram_range = (1,2), stop_words = 'english', sublinear_tf = True)),
                    ('chi', SelectKBest(chi2, k = 10000))
                    ('clf', LinearSVC(C = 1.0, penalty = 'l1', max_iter = 3000, dual = False))])

#TfidfVectorizer generates a large matrix of our words following the "Bag of Words Model"
#Also "rates" words based upon the frequency with which the words appear in our data set as they will be considered 
#important
#ngram_range = (1,2) looks at each word and each pair of words and how they appear in our data set
#stop_words = 'english' will remove any non-englosh words as they will not be relevant to our data set


#Will now create a model by fitting it to our training data
model = pipeline.fit(X_train, y_train)

vectorizer = model.named_steps['vect']
chi = model.named_steps['chi']
clf = model.named_steps['clf']

feature_names = vectorizer.get_feature_names()
feature_names = [feature_names[i] for i in chi.get_support(indices=True)]
feature_names = np.asarray(feature_names)

target_names = ['1', '2', '3', '4', '5']
print("Top 10 keywords per class")
for i, label in enumerate(target_names):
    top10 = np.argsort(clf.coef_[i])[-10:]
    print("%s: %s" % label, " ".join(feature_names[top10]))
    
print("Accuracy score: " + str(model.score(X_test, y_test)))

print(model.predict(['that was an awesome place. Great food!']))