In [13]:
import nltk 
import simplejson as json
from sklearn.feature_extraction.text import * 
from sklearn.model_selection import train_test_split 
from sklearn import linear_model 
from sklearn import metrics 
import warnings 
warnings.simplefilter('ignore')

In [14]:
# Creates a bag of words (BOW) representation from text documents, using the Vectorizer function in scikit-learn
# Inputs:
#  - file name 
#  - the min_pos and max_neg parameters
#  - all reviews with scores => min_pos = 4 are labeled "1"  
#  - all reviews with scores <= max_neg = 2 ae labeled "0" 
#  - this creates a simple set of labels for binary classification, ignoring the neutral (score = 3) reviews
# 
#  The function extracts the text and scores for each review from the JSON data
#  It then tokenizes and creates a sparse bag-of-words array using scikit-learn vectorizer function
#  The number of rows in the array is the number of reviews with scores <=2 or >=4
#  The number of columns in the array is the number of terms in the vocabulary
def create_bow_from_reviews(filename, min_pos=4, max_neg=2): 
    
    print('\nLoading the file: \n', filename) 
    with open(filename, 'r') as jfile:
        data = json.load(jfile)
    print('\nTotal number of reviews extracted =', len(data) )

    text = []
    Y = []
    lengths = []
    print('\nExtracting tokens from each review.....(can be slow for a large number of reviews)......')   
    for d in data:
        # keep only the text and label
        review = d["text"]
        stars = d["stars"]

        # simple logic to generate a binary score for each review
        if stars >= min_pos: score = 1
        elif stars <= max_neg: score = 0
        else: continue # Ignore 3 stars reviews (neutral)

        text.append(review)   
        Y.append(score)

    # creates an instance of a CountVectorizer, using
    # (1) the standard 'english' stopword set 
    # (2) only keeping terms in the vocabulary that occur in at least 1% of documents
    # (3) allowing both unigrams and bigrams in the vocabulary
    vectorizer = CountVectorizer(stop_words='english', min_df=0.01, ngram_range=(1, 2))

    # creates a sparse BOW array from 'text' using vectorizer  
    X = vectorizer.fit_transform(text)

    print('Data shape: ', X.shape)

    return X, Y, vectorizer

In [15]:
def logistic_classification(X, Y, test_fraction): 
    # This function creates and returns a Logistic Classifier for text using l2 penalty regularizer.
    # Accuracy is computed for both training and test sets to fine tune the model and to reduce bias and variance problems.
    # Parameters:
    # X: A sparse Bag of Words created from text in reviews using CountVectorizer.
    # Y: Binary labels (0 or 1) corresponding to each data in X.
    # test_fraction: a real value representing split size for test. (1-test_fraction) will be used for training.

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_fraction, random_state=42)
    #  set the state of the random number generator so that we get the same results across runs when testing our code
    random_state = 0

    print('Number of training examples: ', X_train.shape[0])
    print('Number of testing examples: ', X_test.shape[0])   
    print('Vocabulary size: ', X_train.shape[1]) 


    # Specify the logistic classifier model with an l2 penalty for regularization and with fit_intercept turned on
    classifier = linear_model.LogisticRegression(random_state = random_state, penalty="l2", fit_intercept=True)

    # Train a logistic regression classifier and evaluate accuracy on the training data
    print('\nTraining a model with', X_train.shape[0], 'examples.....')
    classifier.fit(X_train, Y_train)
    train_predictions = classifier.predict(X_train)
    train_accuracy = classifier.score(X_train, Y_train)
    print('\nTraining:')
    print(' accuracy:',format( 100*train_accuracy , '.2f') ) 

    # Compute and print accuracy and AUC on the test data
    print('\nTesting: ')
    test_predictions = classifier.predict(X_test)
    test_accuracy = classifier.score(X_test, Y_test)
    print(' accuracy:', format( 100*test_accuracy , '.2f') )

    class_probabilities = classifier.predict_proba(X_test)
    test_auc_score = metrics.roc_auc_score(Y_test, test_predictions)
    print(' AUC value:', format( 100*test_auc_score , '.2f') )

    return(classifier)

In [16]:
def most_significant_terms(classifier, vectorizer, K):
    topK_pos_weights, topK_neg_weights, topK_pos_terms, topK_neg_terms = [], [], [], []

    weights = classifier.coef_ # Get feature weights from classifier
    weight_pairs = zip(vectorizer.get_feature_names(), weights[0]) # Create pairs of (feature, weight)
    weight_pairs = sorted(weight_pairs, key=lambda x: x[1]) # Sort by weight

    # Print (term, weight) in order of largest weight first
    print("\nPositive Weights")
    for w in reversed(weight_pairs[-K:]):
        topK_pos_terms.append(w[0])
        topK_pos_weights.append(w[1])
        print(w[0], format(w[1] , '.3f'))

    # Print (term, weight) in order of most negative values first
    print("\nNegative Weights")
    for w in weight_pairs[:K]:
        topK_neg_terms.append(w[0])
        topK_neg_weights.append(w[1])
        print(w[0], format(w[1] , '.3f'))

    return(topK_pos_weights, topK_neg_weights, topK_pos_terms, topK_neg_terms)

In [17]:
# read in the review text and tokenize the text in each review
X, Y , vectorizer_BOW = create_bow_from_reviews('yelp_reviews.json')

# run a logistic classifier on the reviews, specifying the fraction to be used for testing  
test_fraction = 0.3
logistic_classifier = logistic_classification(X, Y,test_fraction) 


Loading the file: 
 yelp_reviews.json

Total number of reviews extracted = 20000

Extracting tokens from each review.....(can be slow for a large number of reviews)......
Data shape:  (17501, 849)
Number of training examples:  12250
Number of testing examples:  5251
Vocabulary size:  849

Training a model with 12250 examples.....

Training:
 accuracy: 94.52

Testing: 
 accuracy: 90.94
 AUC value: 85.73


In [18]:
# print out and return the most significant positive and negative weights (and associated terms) 
topK_pos_weights, topK_neg_weights, topK_pos_terms, topK_neg_terms = most_significant_terms(logistic_classifier, vectorizer_BOW, K=20)


Positive Weights
pleased 2.265
excellent 2.189
delicious 2.058
awesome 2.028
enjoyed 2.023
generous 1.888
yum 1.872
fantastic 1.834
amazing 1.830
gem 1.821
perfect 1.806
great service 1.693
unique 1.659
yummy 1.655
wonderful 1.597
world 1.572
fabulous 1.539
liked 1.515
favorite 1.493
highly recommend 1.485

Negative Weights
worst -3.246
soggy -2.782
horrible -2.550
disappointing -2.383
terrible -2.357
bland -2.176
rude -2.165
waste -2.017
unfortunately -1.716
dry -1.706
disappointed -1.696
excited -1.638
dirty -1.532
poor -1.432
salty -1.367
okay -1.361
happened -1.312
taking -1.293
ok -1.291
fine -1.284
