References:
1) https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f
2) https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [33]:
import numpy as np 
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import json
import re
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier

In [2]:
# Colab-only step: mount Google Drive at /content/drive so that the
# /content/drive/MyDrive/... dataset paths used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def json_reader(fname):
    """
        Lazily read a JSON-lines file (one JSON document per line)
        Args:
            fname: str: path of the input file
        Returns:
            generator: iterator over the decoded documents
    """
    # The context manager closes the file once the generator is exhausted or
    # closed, instead of leaving the handle open.
    with open(fname, mode="r", encoding="utf-8") as handle:
        for line in handle:
            # Skip blank lines (e.g. a trailing newline at the end of the
            # file); json.loads would raise on them.
            if line.strip():
                yield json.loads(line)
# code courtesy - Prof. Parag Singla COL774

In [4]:
# Column store for the training set: cleaned review texts and their star ratings.
dat = {'review': [], 'stars': []}

In [5]:
# Build the training corpus from the JSON-lines file: one punctuation-stripped
# review text and its star rating per record.
# The columns are collected in fresh lists and then assigned into `dat`, so
# re-running this cell replaces the data instead of appending a second copy.
punct_re = re.compile(r'[^\w\s]')  # matches anything that is neither a word character nor whitespace
train_reviews = []
train_stars = []
for entry in json_reader("/content/drive/MyDrive/train.json"):
    train_reviews.append(punct_re.sub('', entry['text']))
    train_stars.append(entry['stars'])
dat['review'] = train_reviews
dat['stars'] = train_stars

In [6]:
# Wrap the collected columns in a DataFrame and expose the feature / label Series.
df = pd.DataFrame(dat)
X_train, y_train = df['review'], df['stars']

In [7]:
# Sanity check: number of loaded reviews, followed by the frame itself.
print(len(dat['review']), df, sep='\n')

534872
                                                   review  stars
0       Perfect on off hrs like all gyms Staff isnt to...    3.0
1       Awesome  I leave rare reviews only on off the ...    5.0
2       I went to the Boulevard Mall today for the fir...    5.0
3       Came here for the Holiday Lights Train Rides a...    4.0
4       This is one of our regular restaurants And jud...    5.0
...                                                   ...    ...
534867  So Im told this place started in Minnesota and...    4.0
534868  I had ordered take away from this resturant\nI...    1.0
534869  Went here for a late dinner Loved the ambience...    2.0
534870  I love the variety of soup base\n\n1 Choose so...    4.0
534871  Vehicle Purchased  2013 Honda CRV EXL 51000 mi...    4.0

[534872 rows x 2 columns]


In [22]:
# Naive Bayes text classifier: token counts -> TF-IDF weighting -> MultinomialNB.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [23]:
# Train the Naive Bayes pipeline on the training reviews and their star
# ratings; the fitted pipeline is returned and shown as the cell output.
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [27]:
#test accuracy
# Test-set accuracy of the Naive Bayes pipeline (text_clf).
# Every test review gets the same punctuation stripping as the training text,
# then all reviews are scored with ONE batched predict() call; calling
# predict() once per review repeats the pipeline overhead for every sample.
test_texts = []
test_labels = []
for entry in json_reader("/content/drive/MyDrive/test.json"):
    test_texts.append(re.sub(r'[^\w\s]', '', entry['text']))
    test_labels.append(entry['stars'])

samples = len(test_labels)
if samples == 0:
    # Accuracy is undefined for an empty test file; say so instead of
    # failing with a ZeroDivisionError.
    correct = 0
    print("no test samples found")
else:
    pred = text_clf.predict(test_texts)
    # Elementwise comparison of predicted vs. true star ratings.
    correct = int(np.sum(pred == np.asarray(test_labels)))
    print(correct/samples)

0.5373547315993359


Setting up the linear SVC (support vector classifier) pipeline

In [30]:
# Linear SVM text classifier: token counts -> TF-IDF weighting -> LinearSVC.
# random_state is pinned because the dual solver shuffles the data with a
# pseudo-random generator; a fixed seed makes the fitted model (and the
# accuracy reported below) reproducible across runs.
clf_svc = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('linearsvc', LinearSVC(random_state=42)),
    ]
)

In [31]:
# Train the LinearSVC pipeline on the training reviews and their star
# ratings; the fitted pipeline is returned and shown as the cell output.
clf_svc.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('linearsvc',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [32]:
# Test-set accuracy of the LinearSVC pipeline (clf_svc).
# Every test review gets the same punctuation stripping as the training text,
# then all reviews are scored with ONE batched predict() call; calling
# predict() once per review repeats the pipeline overhead for every sample.
test_texts = []
test_labels = []
for entry in json_reader("/content/drive/MyDrive/test.json"):
    test_texts.append(re.sub(r'[^\w\s]', '', entry['text']))
    test_labels.append(entry['stars'])

samples = len(test_labels)
if samples == 0:
    # Accuracy is undefined for an empty test file; say so instead of
    # failing with a ZeroDivisionError.
    correct_svm = 0
    print("no test samples found")
else:
    pred = clf_svc.predict(test_texts)
    # Elementwise comparison of predicted vs. true star ratings.
    correct_svm = int(np.sum(pred == np.asarray(test_labels)))
    print(correct_svm/samples)

0.6696031947830509


In [35]:
# SGD-trained linear classifier: token counts -> TF-IDF weighting -> SGDClassifier.
# SGD shuffles the training data between epochs, so without a seed every fit
# yields a slightly different model; random_state is pinned to make the
# accuracy reported below reproducible across runs.
clf_sgd = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('sgd', SGDClassifier(random_state=42)),
    ]
)

In [36]:
# Train the SGDClassifier pipeline on the training reviews and their star
# ratings; the fitted pipeline is returned and shown as the cell output.
clf_sgd.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000,

In [37]:
# Test-set accuracy of the SGDClassifier pipeline (clf_sgd).
# Every test review gets the same punctuation stripping as the training text,
# then all reviews are scored with ONE batched predict() call; calling
# predict() once per review repeats the pipeline overhead for every sample.
test_texts = []
test_labels = []
for entry in json_reader("/content/drive/MyDrive/test.json"):
    test_texts.append(re.sub(r'[^\w\s]', '', entry['text']))
    test_labels.append(entry['stars'])

samples = len(test_labels)
if samples == 0:
    # Accuracy is undefined for an empty test file; say so instead of
    # failing with a ZeroDivisionError.
    correct_sgd = 0
    print("no test samples found")
else:
    pred = clf_sgd.predict(test_texts)
    # Elementwise comparison of predicted vs. true star ratings.
    correct_sgd = int(np.sum(pred == np.asarray(test_labels)))
    print(correct_sgd/samples)

0.6324877727755426
