
### OBJECTIVE: USING TEXT CLASSIFICATION AND A LOGISTIC REGRESSION MODEL TO PREDICT RATINGS ON YELP USING JUST REVIEWS
#### METHOD 1: UNIGRAM
* Without removing stop words and without stemming
    * Binary
    * Counts
    * TFIDF
* Removing stop words and performing stemming
    * Binary
    * Counts
    * TFIDF

#### Author : Sonal Mendiratta

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import logging

In [2]:
review_restaurant = pd.read_json("review_restaurant.json")

In [3]:
sample_data = review_restaurant.sample(frac = 0.05,random_state=2000) 

In [4]:
def compute_accuracy(eval_items:list):
    """Return the fraction of items with at least one correct prediction.

    Args:
        eval_items: list of items where ``item[0]`` is an iterable of true
            labels and ``item[1]`` is an iterable of (hashable) predicted
            labels.

    Returns:
        The share of items for which any true label appears among the
        predicted labels, as a float. An empty ``eval_items`` yields ``0.0``
        rather than raising ``ZeroDivisionError``.
    """
    total = len(eval_items)
    if total == 0:
        return 0.0

    correct = 0
    for item in eval_items:
        # A set gives O(1) membership checks for each true label.
        machine_pred = set(item[1])
        # Each item counts at most once, however many true labels match.
        if any(cat in machine_pred for cat in item[0]):
            correct += 1

    return correct / float(total)

In [5]:
def collect_preds(Y_test,Y_preds):
    """Collect all predictions and ground truth.

    Pairs the i-th ground-truth label with the i-th prediction list by
    position, so the result is the same whether ``Y_test`` is a list, a NumPy
    array or a pandas Series with an arbitrary index (integer indexing on a
    Series is label-based and would mis-pair or fail).

    Args:
        Y_test: sized iterable of ground-truth labels.
        Y_preds: iterable of predictions (one entry per row, typically a list
            of top-k labels).

    Returns:
        A list of ``[[true_label], prediction]`` entries, one per prediction.

    Raises:
        IndexError: if there are fewer ground-truth labels than predictions.
    """
    preds = list(Y_preds)
    if len(Y_test) < len(preds):
        raise IndexError(
            "Y_test has {0} labels but Y_preds has {1} predictions".format(len(Y_test), len(preds))
        )

    # zip() walks both sequences positionally and stops after the last prediction.
    pred_gold_list = [[[truth], pred] for truth, pred in zip(Y_test, preds)]
    return pred_gold_list

In [6]:


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def extract_features(df,field,training_data,testing_data,type="binary"):
    """Extract features using different methods.

    The vectorizer is fitted on the training split only and then applied to
    the test split, so no test vocabulary leaks into the model.

    Args:
        df: unused; kept so existing callers keep working.
        field: name of the text column to vectorize.
        training_data: DataFrame holding the training rows.
        testing_data: DataFrame holding the test rows.
        type: feature representation. A value containing ``"binary"`` selects
            presence/absence features, one containing ``"counts"`` selects raw
            term counts, and anything else selects TF-IDF. (The name shadows
            the builtin but is part of the keyword interface callers use.)

    Returns:
        Tuple ``(train_feature_set, test_feature_set, vectorizer)`` where the
        vectorizer is the fitted ``CountVectorizer`` or ``TfidfVectorizer``.
    """

    logging.info("Extracting features and creating vocabulary...")

    if "binary" in type:
        # BINARY FEATURE REPRESENTATION
        vectorizer = CountVectorizer(binary=True, max_df=0.95)
    elif "counts" in type:
        # COUNT BASED FEATURE REPRESENTATION
        vectorizer = CountVectorizer(binary=False, max_df=0.95)
    else:
        # TF-IDF BASED FEATURE REPRESENTATION
        vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)

    # fit_transform() already returns the training matrix; reusing it avoids
    # a second full tokenization pass over the training text.
    train_feature_set = vectorizer.fit_transform(training_data[field].values)
    test_feature_set = vectorizer.transform(testing_data[field].values)

    return train_feature_set,test_feature_set,vectorizer



In [7]:
def get_top_k_predictions(model,X_test,k):
    """Return the k most probable class labels for each row of ``X_test``.

    Args:
        model: fitted classifier exposing ``predict_proba`` and ``classes_``.
        X_test: feature matrix accepted by ``model.predict_proba``.
        k: number of labels to return per row. ``0`` yields an empty list per
            row; a value larger than the number of classes yields all classes.

    Returns:
        A list with one list per row, holding labels ordered from most to
        least probable.

    Raises:
        ValueError: if ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative, got {0}".format(k))

    # get probabilities instead of predicted labels, since we want the top k
    probs = model.predict_proba(X_test)

    # Sort ascending, flip to descending, then keep the first k column indices.
    # Slicing from the front (rather than ``[:, -k:]``) makes k=0 return no
    # labels instead of every label.
    best_n = np.argsort(probs, axis=1)[:, ::-1][:, :k]

    # Map column indices back to the class labels they stand for.
    preds = [[model.classes_[predicted_cat] for predicted_cat in prediction] for prediction in best_n]

    return preds

In [15]:
def train_model(df,field,feature_rep,top_k):
    """Train and evaluate a logistic-regression rating classifier.

    Splits ``df`` into train and test sets, vectorizes ``field`` with the
    requested representation, fits the classifier and scores it on the test
    split.

    Args:
        df: DataFrame containing the text column ``field`` and the label
            column ``'stars_reviews'``.
        field: name of the text column to build features from.
        feature_rep: feature representation passed to ``extract_features``
            (``'binary'``, ``'counts'`` or anything else for TF-IDF).
        top_k: number of top predictions per row considered when scoring.

    Returns:
        Tuple ``(model, feature_transformer, accuracy, mrr_at_k, matrix)``:
        the fitted classifier, the fitted vectorizer, top-k accuracy, mean
        reciprocal rank at k, and the confusion matrix (true labels as rows,
        top-1 predictions as columns).
    """

    logging.info("Starting model training...")

    # GET A TRAIN TEST SPLIT (set seed for consistent results)
    training_data, testing_data = train_test_split(df, random_state=2000)

    # GET LABELS
    Y_train=training_data['stars_reviews'].values
    Y_test=testing_data['stars_reviews'].values

    # GET FEATURES
    # Pass the frame this function was given, not the notebook-level global,
    # so the function works for any caller-supplied DataFrame.
    X_train,X_test,feature_transformer=extract_features(df,field,training_data,testing_data,type=feature_rep)

    # INIT LOGISTIC REGRESSION CLASSIFIER
    logging.info("Training a Logistic Regression Model...")
    scikit_log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)
    model=scikit_log_reg.fit(X_train,Y_train)

    # CONFUSION MATRIX - built from the single best prediction, so it does not
    # depend on top_k.
    predicted_values = model.predict(X_test)
    matrix = pd.crosstab(Y_test,predicted_values)

    # GET TOP K PREDICTIONS
    preds=get_top_k_predictions(model,X_test,top_k)

    # GET PREDICTED VALUES AND GROUND TRUTH INTO A LIST OF LISTS - for ease of evaluation
    eval_items=collect_preds(Y_test,preds)

    # GET EVALUATION NUMBERS ON TEST SET -- HOW DID WE DO?
    logging.info("Starting evaluation...")
    accuracy=compute_accuracy(eval_items)
    mrr_at_k=compute_mrr_at_k(eval_items)

    logging.info("Done training and evaluation.")

    return model,feature_transformer,accuracy,mrr_at_k,matrix

In [9]:
def _reciprocal_rank(true_labels: list, machine_preds: list):
    """Compute the reciprocal rank of the first correct prediction.

    Args:
        true_labels: collection of acceptable labels.
        machine_preds: predicted labels, ordered best first.

    Returns:
        ``1 / rank`` (1-based) of the first prediction found in
        ``true_labels``, or ``0`` when none of the predictions is correct.
    """
    for rank, pred in enumerate(machine_preds, start=1):
        if pred in true_labels:
            # Only the first hit matters for reciprocal rank.
            return 1 / float(rank)

    return 0

def compute_mrr_at_k(items:list):
    """Compute the MRR (average reciprocal rank) at cutoff k.

    Args:
        items: list of items where ``item[0]`` holds the true labels and
            ``item[1]`` holds the top-k predictions, best first.

    Returns:
        The mean of the per-item reciprocal ranks as a float. An empty
        ``items`` yields ``0.0`` (previously an ``UnboundLocalError``).
    """
    if len(items) == 0:
        return 0.0

    rr_total = 0
    for item in items:
        rr_total = rr_total + _reciprocal_rank(item[0],item[1])

    # Average once, after every item has been accumulated.
    mrr = rr_total / float(len(items))

    return mrr

# Without removing Stopwords and without performing Stemming 

## Binary

#### Accuracy based on top 1 prediction

In [16]:
field='text'
feature_rep='binary'
top_k = 1


model,transformer,accuracy,mrr_at_k,matrix=train_model(sample_data,field=field,feature_rep=feature_rep,top_k=top_k)
print("\nAccuracy={0}".format(accuracy))

2019-11-28 09:39:14,717 : INFO : Starting model training...
2019-11-28 09:39:14,947 : INFO : Extracting features and creating vocabulary...
2019-11-28 09:39:45,955 : INFO : Training a Logistic Regression Model...


[LibLinear]

2019-11-28 09:48:45,592 : INFO : Starting evaluation...
2019-11-28 09:48:45,697 : INFO : Done training and evaluation.



Accuracy=0.5893642542982807


In [17]:
#Confusion Matrix
matrix

col_0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4408,951,412,209,275
2,1208,1412,1260,571,296
3,439,976,2424,2320,879
4,176,303,1465,6027,5695
5,117,123,409,3483,16683


In [18]:
test_features=transformer.transform(["I did not like it!"])
get_top_k_predictions(model,test_features,2)

[[1, 5]]

In [19]:
test_features=transformer.transform(["I loved it so much"])
get_top_k_predictions(model,test_features,2)

[[5, 4]]

In [20]:
test_features=transformer.transform(["it sucked"])
get_top_k_predictions(model,test_features,2)

[[1, 5]]

#### Accuracy based on top 2 predictions

In [21]:
field='text'
feature_rep='binary'
top_k = 2


model,transformer,accuracy,mrr_at_k,matrix=train_model(sample_data,field=field,feature_rep=feature_rep,top_k=top_k)
print("\nAccuracy={0}".format(accuracy))

2019-11-28 09:49:11,179 : INFO : Starting model training...
2019-11-28 09:49:11,420 : INFO : Extracting features and creating vocabulary...
2019-11-28 09:49:42,079 : INFO : Training a Logistic Regression Model...


[LibLinear]

2019-11-28 09:59:14,638 : INFO : Starting evaluation...
2019-11-28 09:59:14,750 : INFO : Done training and evaluation.



Accuracy=0.8408636545381847


In [22]:
#Confusion Matrix
matrix

col_0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4408,951,412,209,275
2,1208,1412,1260,571,296
3,439,976,2424,2320,879
4,176,303,1465,6027,5695
5,117,123,409,3483,16683


In [23]:
test_features=transformer.transform(["I did not like it!"])
get_top_k_predictions(model,test_features,2)

[[1, 5]]

In [24]:
test_features=transformer.transform(["I loved it so much"])
get_top_k_predictions(model,test_features,2)

[[5, 4]]

In [25]:
test_features=transformer.transform(["it sucked"])
get_top_k_predictions(model,test_features,2)

[[1, 5]]

## Counts

#### Accuracy based on top 1 prediction

In [27]:
field='text'
feature_rep='counts'
top_k = 1

model,transformer,accuracy,mrr_at_k,matrix=train_model(sample_data,field=field,feature_rep=feature_rep,top_k=top_k)
print("\nAccuracy={0}".format(accuracy))

2019-11-28 10:41:39,016 : INFO : Starting model training...
2019-11-28 10:41:39,238 : INFO : Extracting features and creating vocabulary...
2019-11-28 10:42:08,761 : INFO : Training a Logistic Regression Model...


[LibLinear]

2019-11-28 11:14:49,764 : INFO : Starting evaluation...
2019-11-28 11:14:49,842 : INFO : Done training and evaluation.



Accuracy=0.5899164143866263


In [28]:
#Confusion Matrix
matrix

col_0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4390,951,419,223,272
2,1239,1403,1265,546,294
3,460,963,2463,2290,862
4,187,310,1443,5971,5755
5,134,115,398,3412,16756


In [29]:
test_features=transformer.transform(["I did not like it!"])
get_top_k_predictions(model,test_features,2)

[[5, 4]]

In [30]:
test_features=transformer.transform(["I loved it so much"])
get_top_k_predictions(model,test_features,2)

[[5, 4]]

In [31]:
test_features=transformer.transform(["it sucked"])
get_top_k_predictions(model,test_features,2)

[[1, 5]]

#### Accuracy based on top 2 predictions

In [33]:
field='text'
feature_rep='counts'
top_k = 2

model,transformer,accuracy,mrr_at_k,matrix=train_model(sample_data,field=field,feature_rep=feature_rep,top_k=top_k)
print("\nAccuracy={0}".format(accuracy))

2019-11-28 11:29:06,409 : INFO : Starting model training...
2019-11-28 11:29:06,639 : INFO : Extracting features and creating vocabulary...
2019-11-28 11:29:36,816 : INFO : Training a Logistic Regression Model...


[LibLinear]

2019-11-28 12:06:20,253 : INFO : Starting evaluation...
2019-11-28 12:06:20,343 : INFO : Done training and evaluation.



Accuracy=0.8412634946021591


In [34]:
#Confusion Matrix
matrix

col_0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4390,951,419,223,272
2,1239,1403,1265,546,294
3,460,963,2463,2290,862
4,187,310,1443,5971,5755
5,134,115,398,3412,16756


In [35]:
test_features=transformer.transform(["I did not like it!"])
get_top_k_predictions(model,test_features,2)

[[5, 4]]

In [36]:
test_features=transformer.transform(["I loved it so much"])
get_top_k_predictions(model,test_features,2)

[[5, 4]]

In [37]:
test_features=transformer.transform(["it sucked"])
get_top_k_predictions(model,test_features,2)

[[1, 5]]

## TFIDF

#### Accuracy based on top 1 prediction

In [38]:
field='text'
feature_rep='tfidf'
top_k = 1

model,transformer,accuracy,mrr_at_k,matrix=train_model(sample_data,field=field,feature_rep=feature_rep,top_k=top_k)
print("\nAccuracy={0}".format(accuracy))

2019-11-28 12:06:20,508 : INFO : Starting model training...
2019-11-28 12:06:20,724 : INFO : Extracting features and creating vocabulary...
2019-11-28 12:06:50,970 : INFO : Training a Logistic Regression Model...


[LibLinear]

2019-11-28 12:07:46,167 : INFO : Starting evaluation...
2019-11-28 12:07:46,248 : INFO : Done training and evaluation.



Accuracy=0.625559300089488


In [39]:
#Confusion Matrix
matrix

col_0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4831,789,300,142,193
2,1347,1421,1277,479,223
3,440,760,2604,2530,704
4,144,146,1055,6671,5650
5,96,47,207,3137,17328


In [40]:
test_features=transformer.transform(["I did not like it!"])
get_top_k_predictions(model,test_features,2)

[[1, 2]]

In [41]:
test_features=transformer.transform(["I loved it so much"])
get_top_k_predictions(model,test_features,2)

[[5, 4]]

In [42]:
test_features=transformer.transform(["it sucked"])
get_top_k_predictions(model,test_features,2)

[[1, 2]]

#### Accuracy based on top 2 predictions

In [43]:
field='text'
feature_rep='tfidf'
top_k = 2

model,transformer,accuracy,mrr_at_k,matrix=train_model(sample_data,field=field,feature_rep=feature_rep,top_k=top_k)
print("\nAccuracy={0}".format(accuracy))

2019-11-28 12:07:46,410 : INFO : Starting model training...
2019-11-28 12:07:46,633 : INFO : Extracting features and creating vocabulary...
2019-11-28 12:08:20,262 : INFO : Training a Logistic Regression Model...


[LibLinear]

2019-11-28 12:09:13,794 : INFO : Starting evaluation...
2019-11-28 12:09:13,883 : INFO : Done training and evaluation.



Accuracy=0.8813427010148321


In [44]:
#Confusion Matrix
matrix

col_0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4831,789,300,142,193
2,1347,1421,1277,479,223
3,440,760,2604,2530,704
4,144,146,1055,6671,5650
5,96,47,207,3137,17328


In [45]:
test_features=transformer.transform(["I did not like it!"])
get_top_k_predictions(model,test_features,2)

[[1, 2]]

In [46]:
test_features=transformer.transform(["I loved it so much"])
get_top_k_predictions(model,test_features,2)

[[5, 4]]

In [47]:
test_features=transformer.transform(["it sucked"])
get_top_k_predictions(model,test_features,2)

[[1, 2]]

# Removing Stop words and Performing Stemming 

In [48]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stop = stopwords.words('english')
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to /Users/sonal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
# Build the stop-word set once: testing membership against the original list
# scans it for every token of every review.
stop_words = set(stop)

# Drop stop words, stem what remains, and re-join into a single string per review.
# NOTE(review): the stop-word test runs on the raw whitespace-split token, so
# capitalized or punctuated forms (e.g. "The", "it!") are presumably not
# filtered - confirm whether that is intended.
sample_data['text1'] = sample_data['text'].apply(
    lambda x: ' '.join(ps.stem(word) for word in x.split() if word not in stop_words)
)

## Binary

#### Accuracy based on top 1 prediction

In [50]:
field='text1'
feature_rep='binary'
top_k = 1


model,transformer,accuracy,mrr_at_k,matrix=train_model(sample_data,field=field,feature_rep=feature_rep,top_k=top_k)
print("\nAccuracy={0}".format(accuracy))

2019-11-28 12:14:16,619 : INFO : Starting model training...
2019-11-28 12:14:17,392 : INFO : Extracting features and creating vocabulary...
2019-11-28 12:14:36,912 : INFO : Training a Logistic Regression Model...


[LibLinear]

2019-11-28 12:20:09,594 : INFO : Starting evaluation...
2019-11-28 12:20:09,676 : INFO : Done training and evaluation.



Accuracy=0.584813693570191


In [51]:
#Confusion Matrix
matrix

col_0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4394,905,406,238,312
2,1237,1343,1221,591,355
3,469,940,2356,2345,928
4,188,303,1453,5914,5808
5,115,117,432,3443,16708


In [52]:
test_features=transformer.transform(["I did not like it!"])
get_top_k_predictions(model,test_features,2)

[[5, 1]]

In [53]:
test_features=transformer.transform(["I loved it so much"])
get_top_k_predictions(model,test_features,2)

[[5, 3]]

In [54]:
test_features=transformer.transform(["it sucked"])
get_top_k_predictions(model,test_features,2)

[[1, 3]]

#### Accuracy based on top 2 predictions

In [55]:
field='text1'
feature_rep='binary'
top_k = 2


model,transformer,accuracy,mrr_at_k,matrix=train_model(sample_data,field=field,feature_rep=feature_rep,top_k=top_k)
print("\nAccuracy={0}".format(accuracy))

2019-11-28 12:20:09,840 : INFO : Starting model training...
2019-11-28 12:20:10,063 : INFO : Extracting features and creating vocabulary...
2019-11-28 12:20:29,065 : INFO : Training a Logistic Regression Model...


[LibLinear]

2019-11-28 12:31:28,244 : INFO : Starting evaluation...
2019-11-28 12:31:28,335 : INFO : Done training and evaluation.



Accuracy=0.8322766131642582


In [56]:
#Confusion Matrix
matrix

col_0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4394,905,406,238,312
2,1237,1343,1221,591,355
3,469,940,2356,2345,928
4,188,303,1453,5914,5808
5,115,117,432,3443,16708


In [57]:
test_features=transformer.transform(["I did not like it!"])
get_top_k_predictions(model,test_features,2)

[[5, 1]]

In [58]:
test_features=transformer.transform(["I loved it so much"])
get_top_k_predictions(model,test_features,2)

[[5, 3]]

In [59]:
test_features=transformer.transform(["it sucked"])
get_top_k_predictions(model,test_features,2)

[[1, 3]]

## Counts

#### Accuracy based on top 1 prediction

In [60]:
field='text1'
feature_rep='counts'
top_k = 1

model,transformer,accuracy,mrr_at_k,matrix=train_model(sample_data,field=field,feature_rep=feature_rep,top_k=top_k)
print("\nAccuracy={0}".format(accuracy))

2019-11-28 12:31:28,500 : INFO : Starting model training...
2019-11-28 12:31:28,718 : INFO : Extracting features and creating vocabulary...
2019-11-28 12:31:47,391 : INFO : Training a Logistic Regression Model...


[LibLinear]

2019-11-28 12:49:02,684 : INFO : Starting evaluation...
2019-11-28 12:49:02,765 : INFO : Done training and evaluation.



Accuracy=0.5842044134727061


In [61]:
#Confusion Matrix
matrix

col_0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4364,922,405,248,316
2,1256,1332,1218,583,358
3,472,963,2355,2279,969
4,204,303,1407,5869,5883
5,131,131,423,3367,16763


In [62]:
test_features=transformer.transform(["I did not like it!"])
get_top_k_predictions(model,test_features,2)

[[5, 4]]

In [63]:
test_features=transformer.transform(["I loved it so much"])
get_top_k_predictions(model,test_features,2)

[[5, 3]]

In [65]:
test_features=transformer.transform(["it sucked"])
get_top_k_predictions(model,test_features,2)

[[1, 3]]

#### Accuracy based on top 2 predictions

In [66]:
field='text1'
feature_rep='counts'
top_k = 2

model,transformer,accuracy,mrr_at_k,matrix=train_model(sample_data,field=field,feature_rep=feature_rep,top_k=top_k)
print("\nAccuracy={0}".format(accuracy))

2019-11-28 12:49:45,560 : INFO : Starting model training...
2019-11-28 12:49:45,784 : INFO : Extracting features and creating vocabulary...
2019-11-28 12:50:04,968 : INFO : Training a Logistic Regression Model...


[LibLinear]

2019-11-28 13:08:45,191 : INFO : Starting evaluation...
2019-11-28 13:08:45,282 : INFO : Done training and evaluation.



Accuracy=0.8341044534567126


In [67]:
#Confusion Matrix
matrix

col_0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4364,922,405,248,316
2,1256,1332,1218,583,358
3,472,963,2355,2279,969
4,204,303,1407,5869,5883
5,131,131,423,3367,16763


In [68]:
test_features=transformer.transform(["I did not like it!"])
get_top_k_predictions(model,test_features,2)

[[5, 4]]

In [69]:
test_features=transformer.transform(["I loved it so much"])
get_top_k_predictions(model,test_features,2)

[[5, 3]]

In [71]:
test_features=transformer.transform(["it sucked"])
get_top_k_predictions(model,test_features,2)

[[1, 3]]

## TFIDF


#### Accuracy based on top 1 prediction

In [72]:
field='text1'
feature_rep='tfidf'
top_k = 1

model,transformer,accuracy,mrr_at_k,matrix=train_model(sample_data,field=field,feature_rep=feature_rep,top_k=top_k)
print("\nAccuracy={0}".format(accuracy))

2019-11-28 13:09:30,073 : INFO : Starting model training...
2019-11-28 13:09:30,294 : INFO : Extracting features and creating vocabulary...
2019-11-28 13:09:50,812 : INFO : Training a Logistic Regression Model...


[LibLinear]

2019-11-28 13:10:29,537 : INFO : Starting evaluation...
2019-11-28 13:10:29,626 : INFO : Done training and evaluation.



Accuracy=0.6159821785571485


In [73]:
#Confusion Matrix
matrix

col_0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4780,741,328,171,235
2,1339,1300,1258,535,315
3,458,745,2469,2545,821
4,155,158,1072,6533,5748
5,112,53,229,3151,17270


In [74]:
test_features=transformer.transform(["I did not like it!"])
get_top_k_predictions(model,test_features,2)

[[1, 2]]

In [75]:
test_features=transformer.transform(["I loved it so much"])
get_top_k_predictions(model,test_features,2)

[[5, 3]]

In [76]:
test_features=transformer.transform(["it sucked"])
get_top_k_predictions(model,test_features,2)

[[1, 3]]

#### Accuracy based on top 2 predictions

In [77]:
field='text1'
feature_rep='tfidf'
top_k = 2

model,transformer,accuracy,mrr_at_k,matrix=train_model(sample_data,field=field,feature_rep=feature_rep,top_k=top_k)
print("\nAccuracy={0}".format(accuracy))

2019-11-28 13:10:29,787 : INFO : Starting model training...
2019-11-28 13:10:30,099 : INFO : Extracting features and creating vocabulary...
2019-11-28 13:10:50,975 : INFO : Training a Logistic Regression Model...


[LibLinear]

2019-11-28 13:11:29,165 : INFO : Starting evaluation...
2019-11-28 13:11:29,260 : INFO : Done training and evaluation.



Accuracy=0.8720511795281887


In [78]:
#Confusion Matrix
matrix

col_0,1,2,3,4,5
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4780,741,328,171,235
2,1339,1300,1258,535,315
3,458,745,2469,2545,821
4,155,158,1072,6533,5748
5,112,53,229,3151,17270


In [79]:
test_features=transformer.transform(["I did not like it!"])
get_top_k_predictions(model,test_features,2)

[[1, 2]]

In [80]:
test_features=transformer.transform(["I loved it so much"])
get_top_k_predictions(model,test_features,2)

[[5, 3]]

In [81]:
test_features=transformer.transform(["it sucked"])
get_top_k_predictions(model,test_features,2)

[[1, 3]]