In [1]:
import pandas as pd
import os
from nltk.corpus import stopwords
import nltk.data
import logging
import numpy as np  # Make sure that numpy is imported
import fastText

from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [2]:
def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,), dtype="float32")
    #
    nwords = 0.
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        nwords = nwords + 1.
        featureVec = np.add(featureVec, model.get_word_vector(word))

    if nwords == 0.:
        print(nwords)
        print(words)
    #
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec, nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate
    # the average feature vector for each one and return a 2D numpy array
    #
    # Initialize a counter
    counter = 0.
    #
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")
    #
    # Loop through the reviews
    for review in reviews:
        #
        # Print a status message every 1000th review
        if counter % 5000. == 0.:
            print "Review %d of %d" % (counter, len(reviews))
        #
        # Call the function (defined above) that makes average feature vectors
        reviewFeatureVecs[int(counter)] = makeFeatureVec(review, model, num_features)
        #
        # Increment the counter
        counter = counter + 1.
    return reviewFeatureVecs

In [3]:
# Load clean reviews
BASE_DIR = '../input/'
LABELED_TRAIN_DF = BASE_DIR + 'labeled_train_clean_reviews.csv'
TEST_DF = BASE_DIR + 'test_clean_reviews.csv'
labeled_train = pd.read_csv(LABELED_TRAIN_DF, header = 0)
test = pd.read_csv(TEST_DF, header = 0)
# Prevent 'float' object has no attribute 'lower' error in keras tokenizer
# https://stackoverflow.com/questions/34724246/attributeerror-float-object-has-no-attribute-lower
labeled_train["review"] = labeled_train["review"].astype(str)
test["review"] = test["review"].astype(str)
print "Read %d labeled train reviews and %d test reviews" % (labeled_train["review"].size, test["review"].size)

Read 25000 labeled train reviews and 25000 test reviews


In [4]:
labeled_train_sentiment = labeled_train["sentiment"].tolist()
print(labeled_train_sentiment[0:10])

[1, 1, 0, 0, 1, 1, 0, 0, 0, 1]


In [5]:
def reviews_to_wordlist(reviews):
    wordlist = []
    for review in reviews:
        words = review.split()
        words = [word.strip() for word in words]
        wordlist.append(words)
    return wordlist

In [6]:
labeled_train_reviews = labeled_train["review"].tolist()
test_reviews = test["review"].tolist()
labeled_train_reviews = reviews_to_wordlist(labeled_train_reviews)
test_reviews = reviews_to_wordlist(test_reviews)
print(labeled_train_reviews[0][10])
print(test_reviews[0][10])

mj
mortality


In [7]:
num_features = 300

In [8]:
pretrained_model = fastText.load_model('../input/cc.en.300.bin')
print "Creating average feature vecs for training reviews"
pretrained_train_vecs = getAvgFeatureVecs(labeled_train_reviews, pretrained_model, num_features)

print "Creating average feature vecs for test reviews"
pretrained_test_vecs = getAvgFeatureVecs(test_reviews, pretrained_model, num_features)

Creating average feature vecs for training reviews
Review 0 of 25000
Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000
Creating average feature vecs for test reviews
Review 0 of 25000
Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000


In [9]:
my_model = fastText.load_model('../input/fasttext_300features_40minwords_10context.bin')
print "Creating my average feature vecs for training reviews"
my_train_vecs = getAvgFeatureVecs(labeled_train_reviews, my_model, num_features)

print "Creating my average feature vecs for test reviews"
my_test_vecs = getAvgFeatureVecs(test_reviews, my_model, num_features)

Creating my average feature vecs for training reviews
Review 0 of 25000
Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000
Creating my average feature vecs for test reviews
Review 0 of 25000
Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000


In [10]:
supervised_wiki_model = fastText.load_model('../input/fasttext_300features_40minwords_10context_pretrained_wiki.bin')
print "Creating supervised average feature vecs for training reviews"
supervised_wiki_train_vecs = getAvgFeatureVecs(labeled_train_reviews, supervised_wiki_model, num_features)

print "Creating supervised average feature vecs for test reviews"
supervised_wiki_test_vecs = getAvgFeatureVecs(test_reviews, supervised_wiki_model, num_features)

Creating supervised average feature vecs for training reviews
Review 0 of 25000
Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000
Creating supervised average feature vecs for test reviews
Review 0 of 25000
Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000


In [8]:
supervised_cc_model = fastText.load_model('../input/fasttext_300features_40minwords_10context_pretrained_cc.bin')
print "Creating supervised average feature vecs for training reviews"
supervised_cc_train_vecs = getAvgFeatureVecs(labeled_train_reviews, supervised_cc_model, num_features)

print "Creating supervised average feature vecs for test reviews"
supervised_cc_test_vecs = getAvgFeatureVecs(test_reviews, supervised_cc_model, num_features)

KeyboardInterrupt: 

In [11]:
pretrained_forest = RandomForestClassifier(n_estimators=100)
pretrained_forest = pretrained_forest.fit(pretrained_train_vecs, labeled_train_sentiment)
pretrained_result = pretrained_forest.predict(pretrained_test_vecs)

# Write the test results
# output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
# output.to_csv(os.path.join(os.path.dirname(__file__), '../', 'output', "FT_AverageVectors.csv"), index=False, quoting=3)
# print "Wrote FT_AverageVectors.csv"

In [12]:
my_forest = RandomForestClassifier(n_estimators=100)
my_forest = my_forest.fit(my_train_vecs, labeled_train_sentiment)
my_result = my_forest.predict(my_test_vecs)

In [13]:
supervised_wiki_forest = RandomForestClassifier(n_estimators=100)
supervised_wiki_forest = supervised_wiki_forest.fit(supervised_wiki_train_vecs, labeled_train_sentiment)
supervised_wiki_result = supervised_wiki_forest.predict(supervised_wiki_test_vecs)

In [None]:
supervised_cc_forest = RandomForestClassifier(n_estimators=100)
supervised_cc_forest = supervised_cc_forest.fit(supervised_cc_train_vecs, labeled_train_sentiment)
supervised_cc_result = supervised_cc_forest.predict(supervised_cc_test_vecs)

In [14]:
test["sentiment"] = test["id"].map(lambda x: 1 if int(x.strip('"').split("_")[1]) >= 5 else 0)
test_sentiment = test["sentiment"].tolist()

In [15]:
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
def print_auc_score(model_name, test_pred):
    print("The AUC score for %s is : %.4f." % (model_name, roc_auc_score(test_sentiment, test_pred)))
    return

In [16]:
print_auc_score("Fasttext pretrained vector random forest", pretrained_result)
print_auc_score("Fasttext my vector random forest", my_result)
print_auc_score("Fasttext supervised vector random forest", supervised_wiki_result)

The AUC score for Fasttext pretrained vector random forest is : 0.7582.
The AUC score for Fasttext my vector random forest is : 0.8308.
The AUC score for Fasttext supervised vector random forest is : 0.8260.


In [None]:
print_auc_score("Fasttext supervised cc vector random forest", supervised_cc_result)