In [1]:
import json as j
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,chi2

In [2]:
json_data = None
with open('yelp_academic_dataset_review.json') as data_file:
    lines = data_file.readlines()
    joined_lines = "[" + ",".join(lines) + "]"

    json_data = j.loads(joined_lines)
data = pd.DataFrame(json_data)

In [3]:
stemmer = SnowballStemmer('english')
words = stopwords.words("english")

In [17]:
data['cleaned'][:10000] = data['text'][:10000].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())

In [26]:
X = data['cleaned'][:10000]
Y = data.stars[:10000]
Y.shape, X.shape

((10000,), (10000,))

In [27]:
X_train, X_test, Y_train,Y_test = train_test_split(X, Y, test_size=0.2)

In [28]:
pipeline = Pipeline(
    [('vect', TfidfVectorizer(ngram_range=(1,2),stop_words="english", sublinear_tf=True)),
    ('chi', SelectKBest(chi2, k=1000)),
    ('clf', LinearSVC(C=1.0,penalty='l1',max_iter=1000,dual=False))])

In [29]:
model = pipeline.fit(X_train,Y_train)

In [30]:
vectorizer = model.named_steps['vect']
chi = model.named_steps['chi']
clf = model.named_steps['clf']

In [31]:
feature_names = vectorizer.get_feature_names()
feature_names = [feature_names[i] for i in chi.get_support(indices=True)]
feature_names = np.asarray(feature_names)

In [32]:
target_names = ['1', '2', '3', '4', '5']
print("Top 10 keywords per class:")
for i, label in enumerate(target_names):
    top10 = np.argsort(clf.coef_[i])[-10:]
    print("%s: %s" % (label, " ".join(feature_names[top10])))

print("accuracy score: " + str(model.score(X_test, Y_test)))

Top 10 keywords per class:
1: avoid kfc refus messag aw nasti worst disgust rude horribl
2: say locat overpr good italian elsewher reason return inconsist mushi bland actual bad high hope
3: meh kid kid howev order minut okay decent eh littl overpr ok alright
4: bit nice pretti fun realli enjoy great good littl tasti enjoy
5: high recommend fantast favorit love knowledg perfect profession dr amaz best
accuracy score: 0.526
