In [11]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

In [12]:
df = pd.read_csv("yelp.csv")
# df
texts = df['text']
# texts
stars = df['stars']
# stars

In [13]:
def balance_classes(xs, ys):
    
    freqs = Counter(ys)

    # the least common class is the maximum number we want for all classes
    max_allowable = freqs.most_common()[-1][1]
    num_added = {clss: 0 for clss in freqs.keys()}
    new_ys = []
    new_xs = []
    for i, y in enumerate(ys):
        if num_added[y] < max_allowable:
            new_ys.append(y)
            new_xs.append(xs[i])
            num_added[y] += 1
    return new_xs, new_ys

In [14]:
print(Counter(stars))
balanced_x, balanced_y = balance_classes(texts, stars)
print(Counter(balanced_y))

Counter({4: 3526, 5: 3337, 3: 1461, 2: 927, 1: 749})
Counter({5: 749, 4: 749, 2: 749, 3: 749, 1: 749})


In [15]:
# This vectorizer breaks text into single words and bi-grams
# and then calculates the TF-IDF representation
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# the 'fit' builds up the vocabulary from all the reviews
# while the 'transform' step turns each indivdual text into
# a matrix of numbers.
vectors = vectorizer.fit_transform(balanced_x)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(vectors, balanced_y, test_size=0.33, random_state=42)

In [17]:
# initialise the SVM classifier
classifier = LinearSVC()

# train the classifier
classifier.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [18]:
preds = classifier.predict(X_test)
print(list(preds[:10]))
print(y_test[:10])

[2, 3, 3, 1, 2, 1, 1, 4, 5, 4]
[2, 4, 3, 4, 2, 1, 4, 4, 5, 3]


In [19]:
print(accuracy_score(y_test, preds))

0.5186084142394822


In [20]:
confusion_matrix(y_test, preds)

array([[174,  53,   9,   2,   8],
       [ 75,  93,  50,  12,  13],
       [ 30,  41, 105,  49,  29],
       [  7,  10,  43,  94, 100],
       [  8,   2,  14,  40, 175]])