In [1]:
%run -i "../util/util_simple_classifier.ipynb"

In [2]:
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

In [14]:
# Load cleaned data from disk
train_df = pd.read_json("../data/rotten_tomatoes_train.json")
test_df = pd.read_json("../data/rotten_tomatoes_test.json")

In [4]:
# Create the positive and negative word lists
# Only from training data
positive_train_words = train_df[train_df["label"] == 1].text.sum()
negative_train_words = train_df[train_df["label"] == 0].text.sum()
word_intersection = set(positive_train_words) & set(negative_train_words)
positive_filtered = list(set(positive_train_words) - word_intersection)
negative_filtered = list(set(negative_train_words) - word_intersection)

In [5]:
def create_vectorizers(word_lists):
    vectorizers = []
    for word_list in word_lists:
        vectorizer = CountVectorizer(vocabulary=word_list)
        vectorizers.append(vectorizer)
    return vectorizers

In [6]:
vectorizers = create_vectorizers([negative_filtered, positive_filtered])

In [7]:
def vectorize(text_list, vectorizers):
    text = " ".join(text_list)
    scores = []
    for vectorizer in vectorizers:
        output = vectorizer.transform([text])
        output_sum = sum(output.todense().tolist()[0])
        scores.append(output_sum)
    return scores

In [8]:
def classify(score_list):
    return max(enumerate(score_list),key=lambda x: x[1])[0]

In [9]:
train_df["prediction"] = train_df["text"].apply(lambda x: classify(vectorize(x, vectorizers)))
print(train_df)

                                                   text  label lang  \
0     [rock, destined, 21st, century, new, conan, go...      1   en   
1     [gorgeously, elaborate, continuation, lord, ri...      1   en   
2                        [effective, too-tepid, biopic]      1   en   
3     [sometimes, like, go, movies, fun, wasabi, goo...      1   en   
4     [emerges, something, rare, issue, movie, hones...      1   en   
...                                                 ...    ...  ...   
8525  [enjoyment, hinge, personal, threshold, watchi...      0   en   
8526  [legendary, shlockmeister, ed, wood, ever, mad...      0   en   
8527  [hardly, nuanced, portrait, young, woman, brea...      0   en   
8528        [interminably, bleak, say, nothing, boring]      0   en   
8529  [things, really, get, weird, though, particula...      0   en   

      prediction  
0              1  
1              1  
2              0  
3              0  
4              1  
...          ...  
8525          

In [10]:
# Training metrics
print(classification_report(train_df['label'], train_df['prediction']))

              precision    recall  f1-score   support

           0       0.79      0.99      0.88      4194
           1       0.99      0.74      0.85      4170

    accuracy                           0.87      8364
   macro avg       0.89      0.87      0.86      8364
weighted avg       0.89      0.87      0.86      8364



In [11]:
# Test metrics
test_df["prediction"] = test_df["text"].apply(lambda x: classify(vectorize(x, vectorizers)))
print(classification_report(test_df['label'], test_df['prediction']))

              precision    recall  f1-score   support

           0       0.59      0.81      0.68       523
           1       0.70      0.43      0.53       524

    accuracy                           0.62      1047
   macro avg       0.64      0.62      0.61      1047
weighted avg       0.64      0.62      0.61      1047



In [12]:
# Compare to performance on uncleaned data
(train_df, test_df) = load_train_test_dataset_pd("train", "test")
train_df["text"] = train_df["text"].apply(word_tokenize)
test_df["text"] = test_df["text"].apply(word_tokenize)
positive_train_words = train_df[train_df["label"] == 1].text.sum()
negative_train_words = train_df[train_df["label"] == 0].text.sum()
word_intersection = set(positive_train_words) & set(negative_train_words)
positive_filtered = list(set(positive_train_words) - word_intersection)
negative_filtered = list(set(negative_train_words) - word_intersection)
vectorizers = create_vectorizers([negative_filtered, positive_filtered])
train_df["prediction"] = train_df["text"].apply(lambda x: classify(vectorize(x, vectorizers)))
test_df["prediction"] = test_df["text"].apply(lambda x: classify(vectorize(x, vectorizers)))
print(classification_report(train_df['label'], train_df['prediction']))
print(classification_report(test_df['label'], test_df['prediction']))

              precision    recall  f1-score   support

           0       0.79      0.99      0.88      4265
           1       0.99      0.74      0.84      4265

    accuracy                           0.86      8530
   macro avg       0.89      0.86      0.86      8530
weighted avg       0.89      0.86      0.86      8530

              precision    recall  f1-score   support

           0       0.59      0.81      0.68       533
           1       0.70      0.44      0.54       533

    accuracy                           0.62      1066
   macro avg       0.65      0.62      0.61      1066
weighted avg       0.65      0.62      0.61      1066

