## Step 0: Library Imports
Importing libraries and modules as required in order to carry out our analysis

In [0]:
!pip install bs4
!pip install nltk
!pip install sklearn

You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Out[2]: True

In [0]:
nltk.download('stopwords')

from nltk.corpus import stopwords

stop_en = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Step 1: Data Loading and pre-processing

In [0]:
# ************ function to remove load our CSV into a dataframe ************
def load_csv_to_df(file_loc):
  # n limits # of rows loaded

  # File location and type
  file_location = file_loc
  file_type = "csv"

  # CSV options
  infer_schema = "true"
  first_row_is_header = "true"
  delimiter = ","

  # The applied options are for CSV files. For other file types, these will be ignored.
  df = spark.read.format(file_type) \
    .option("inferSchema", infer_schema) \
    .option("header", first_row_is_header) \
    .option("sep", delimiter) \
    .option("multiLine",True) \
    .option("quote", "\"") \
    .option("escape", "\"") \
    .load(file_location)
  return df

In [0]:
# ********** Function to preprocess our dataframe **********
from bs4 import BeautifulSoup
import re
from nltk.stem import SnowballStemmer
from pyspark.sql.types import StringType, ArrayType

@udf(returnType=ArrayType(StringType()))
def preprocess_body(body):
  # note: not HTML therefore beautiful soup not technically required
  body = BeautifulSoup(body)
  
  # remove any urls
  urls  =  body.find_all('a')
  if len(urls) > 0: body.a.clear()
  
  # remove code snippets
  codes = body.find_all('code')
  if len(codes) > 0: body.code.clear()
  
  # delete preformatted text
  pres = body.find_all('pre')
  if len(pres) > 0: body.pre.clear()
    
  # start with our list of words
  text = body.get_text()
  
  # blank list we will append to
  words = []
  
  # make lowercase and strip whitespace
  text = text.lower().strip()
  
  # remove punctuation
  text = re.sub(r'([^\s\w_]|_)+', '', text)
  
  # snowball stemmer object
  snowball = SnowballStemmer("english")
  
  
  # tokenize into sentences
  sents = nltk.sent_tokenize(text)
  for sent in sents:
    # tokenize each sentance into words
    for word in nltk.word_tokenize(sent):
      if word in stop_en: continue  # remove stopwords
      if len(word) < 3: continue  # remove words < 3 characters
      if not word.isalpha(): continue  # remove numbers
      
      words.append(snowball.stem(word))  # append stemmed version of word to list
  
  return words

In [0]:
from pyspark.ml.feature import StringIndexer

# convert sentiment into numerical values
def make_target_vector_numerical(df):
  sentiment_to_num = StringIndexer(inputCol='sentiment',outputCol='label')
  model = sentiment_to_num.fit(df)
  df = model.transform(df)
  return df
  

In [0]:
# imdb dataset
file_location = "/FileStore/tables/IMDB_Dataset.csv"
df_old = load_csv_to_df(file_location)

# 1000 manually labelled rows by our group
file_location_group16 = "/FileStore/tables/group16_movie_reviews.csv"
df_new = load_csv_to_df(file_location_group16)

# our consolidated dataframe
df_all = df_old.union(df_new)

In [0]:
# preprocess the body
df_old = df_old.withColumn('filtered_body', preprocess_body(df_old['review']))
df_new = df_new.withColumn('filtered_body', preprocess_body(df_new['review']))
df_all = df_all.withColumn('filtered_body', preprocess_body(df_all['review']))

# convert sentiment into numerical values
df_old = make_target_vector_numerical(df_old)
df_new = make_target_vector_numerical(df_new)
df_all = make_target_vector_numerical(df_all)

## Step 2: Create Numerical Feature Vectors

##### Create term frequency vector using HashingTF

In [0]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# create term frequency vectors using HashingTF
# used to have numFeatures=20...
hashingTF = HashingTF(inputCol="filtered_body", outputCol="rawFeatures_htf")

df_old = hashingTF.transform(df_old)
df_new = hashingTF.transform(df_new)
df_all = hashingTF.transform(df_all)

##### Alternate version: Create term frequency vectors using CountVectorizer instead of HashingTF

In [0]:
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol="filtered_body", outputCol="rawFeatures_cv")

cvModel_old = cv.fit(df_old)
cvModel_new = cv.fit(df_new)
cvModel_all = cv.fit(df_all)

df_old = cvModel_old.transform(df_old).cache()
df_new = cvModel_new.transform(df_new).cache()
df_all = cvModel_all.transform(df_all).cache()

In [0]:
# apply the Inverse Document Frequency (IDF)
idf = IDF(inputCol="rawFeatures_cv", outputCol="features_idf")

idfModel_old = idf.fit(df_old)
idfModel_new = idf.fit(df_new)
idfModel_all = idf.fit(df_all)

df_old = idfModel_old.transform(df_old).cache()
df_new = idfModel_new.transform(df_new).cache()
df_all = idfModel_all.transform(df_all).cache()

##### Create feature vector utilizing Word2Vec

In [0]:
from pyspark.ml.feature import Word2Vec

#Create Word2Vec Model with vector size = 100 and context size = 5
word2Vec = Word2Vec(vectorSize=100, windowSize=5, inputCol="filtered_body", outputCol="word2vec")

# creates word vectors
w2vModel_old = word2Vec.fit(df_old)
w2vModel_new = word2Vec.fit(df_new)
w2vModel_all = word2Vec.fit(df_all)

# average word vectors for each review into one review vector
df_old = w2vModel_old.transform(df_old).cache()
df_new = w2vModel_new.transform(df_new).cache()
df_all = w2vModel_all.transform(df_all).cache()

In [0]:
# our 3 cleaned dataframes to work with
df_old.cache()
df_new.cache()
df_all.cache()

Out[13]: DataFrame[review: string, sentiment: string, filtered_body: array<string>, label: double, rawFeatures_htf: vector, rawFeatures_cv: vector, features_idf: vector, word2vec: vector]

## Step 3: Test our features using some models
In this step, we run our feature vectors through 2 different models, just to verify everything is working as it should be

### Random Forest Model

##### Detemine the best feature to use

In [0]:
rf_cv_df = df_all.select(['label', 'rawFeatures_cv']).limit(1000).cache()
rf_idf_df = df_all.select(['label', 'features_idf']).limit(1000).cache()
rf_w2v_df = df_all.select(['label', 'word2vec']).limit(1000).cache()

In [0]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

def RandomForestAccuracy(df, label, feature, trees, depth):
  rf = RandomForestClassifier(labelCol=label, featuresCol=feature, numTrees=trees, maxDepth=depth, seed=42)
  (training, testing) = df.randomSplit([0.7,0.3], seed=42)
  
  # fit our model to the training data
  pred = rf.fit(training)

  # apply model to test data
  test_results = pred.transform(testing)
  acc_eval = MulticlassClassificationEvaluator()
  accuracy = acc_eval.evaluate(test_results)
  return accuracy

# determine accuracy of model using each type of feature
rf_cv_acc = RandomForestAccuracy(rf_cv_df, "label", "rawFeatures_cv", 10, 5)
rf_idf_acc = RandomForestAccuracy(rf_idf_df, "label", "features_idf", 10, 5)
rf_w2v_acc = RandomForestAccuracy(rf_w2v_df, "label", "word2vec", 10, 5)
print(f"Random Forest count vectorizer accuracy: {rf_cv_acc}")
print(f"Random Forest TF-IDF accuracy: {rf_cv_acc}")
print(f"Random Forest Word2Vec accuracy: {rf_w2v_acc}")

Random Forest count vectorizer accuracy: 0.5986220595595595
Random Forest TF-IDF accuracy: 0.5986220595595595
Random Forest Word2Vec accuracy: 0.7614382691278758


Word2Vec largely out performed count vectorizer and TF-IDF

##### Hypertune with best feature

In [0]:
# using cross validation
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

train, test = rf_w2v_df.randomSplit([0.7,0.3], seed=42)

rf = RandomForestClassifier(labelCol="label", featuresCol="word2vec", seed=42)
  
paramGrid = ParamGridBuilder()\
    .addGrid(rf.numTrees, [200, 300, 400, 500])\
    .addGrid(rf.maxDepth, [5, 10, 20, 30])\
    .build()

crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3, seed=42)

cvModel = crossval.fit(train)

bestModel = cvModel.bestModel
 
print("NumTrees best param: ")
print(bestModel._java_obj.getNumTrees())
print("maxDepth best param: ")
print(bestModel._java_obj.getMaxDepth())

NumTrees best param: 
400
maxDepth best param: 
20


##### Retrain Random Forest Model with best paramaters

In [0]:
# function used to give the accuracy, f1 score, precision, recall, and confusion matrix for a model on a specified dataframe. 
from pyspark.mllib.evaluation import MulticlassMetrics

def model_evaluation(df, model, dataSourceName):
  (training, testing) = df.randomSplit([0.7,0.3], seed=42)
  
  # fit our model to the training data
  pred = model.fit(training)

  # apply model to test data
  test_results = pred.transform(testing)
  
  
  # print out accuracy
  acc_eval = MulticlassClassificationEvaluator()
  accuracy = acc_eval.evaluate(test_results)
  
  
  # other metrics
  predictionAndLabel = test_results.select("prediction", "label").rdd
  multiMetrics = MulticlassMetrics(predictionAndLabel)
  precision = multiMetrics.weightedPrecision
  recall = multiMetrics.weightedRecall
  f1 = multiMetrics.weightedFMeasure()
  matrix = multiMetrics.confusionMatrix().toArray()
  
  print(f"Using model on {dataSourceName} data")
  print(f"Accuracy: {accuracy}")
  print(f"Precision: {precision}")
  print(f"Recall: {recall}")
  print(f"F1: {f1}")
  print("Confustion Matrix: ")
  print(matrix)
  print("-------------------")

In [0]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
rf = RandomForestClassifier(labelCol="label", featuresCol="word2vec", numTrees=400, maxDepth=10, seed=42)
rf_df_old = df_old.select(['label', 'word2vec']).cache()
rf_df_new = df_new.select(['label', 'word2vec']).cache()
rf_df_all = df_all.select(['label', 'word2vec']).cache()
model_evaluation(rf_df_old, rf, "report")
model_evaluation(rf_df_new, rf, "new")
model_evaluation(rf_df_all, rf, "combined")

Using model on report data
Accuracy: 0.8143344202546854
Precision: 0.814422687974296
Recall: 0.8143544506816359
F1: 0.8143344202546854
Confustion Matrix: 
[[5994. 1450.]
 [1328. 6192.]]
-------------------
Using model on new data
Accuracy: 0.6038373439387836
Precision: 0.6140769675925926
Recall: 0.6484375
F1: 0.6038373439387836
Confustion Matrix: 
[[146.  20.]
 [ 70.  20.]]
-------------------
Using model on combined data
Accuracy: 0.8253198782689769
Precision: 0.826251544733372
Recall: 0.8254404925656644
F1: 0.8253198782689769
Confustion Matrix: 
[[6510. 1140.]
 [1525. 6092.]]
-------------------


### Naive Bayes

In [0]:
nb_cv_df = df_all.select(['label', 'rawFeatures_cv']).limit(1000).cache()
nb_idf_df = df_all.select(['label', 'features_idf']).limit(1000).cache()

In [0]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
 
def NaiveBayesAccuracy(df, label, feature):
  nb = NaiveBayes(labelCol=label, featuresCol=feature)
  (training, testing) = df.randomSplit([0.7,0.3], seed=42)
  
  # fit our model to the training data
  pred = nb.fit(training)
 
  # apply model to test data
  test_results = pred.transform(testing)
  acc_eval = MulticlassClassificationEvaluator()
  accuracy = acc_eval.evaluate(test_results)
  return accuracy

# test our model on different features to find which is best
nb_cv_acc = NaiveBayesAccuracy(nb_cv_df, "label", "rawFeatures_cv")
nb_idf_acc = NaiveBayesAccuracy(nb_idf_df, "label", "features_idf")
print(f"Naive Bayes accuracy using count vectorizer features: {nb_cv_acc} ")
print(f"Naive Bayes accuracy using TF-IDF features: {nb_idf_acc} ")

Naive Bayes accuracy using count vectorizer features: 0.782381484639005 
Naive Bayes accuracy using TF-IDF features: 0.6818625281622144 


This shows us that using the count vectorizer features yield the best results

In [0]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

train, test = nb_cv_df.randomSplit([0.7,0.3], seed=42)
 
nb = NaiveBayes(labelCol="label", featuresCol="rawFeatures_cv")
  
# Create ParamGrid for Cross Validation
nbparamGrid = (ParamGridBuilder()\
               .addGrid(nb.smoothing, [5, 7, 9, 12, 15, 17, 19])\
               .addGrid(nb.modelType, ['multinomial', 'gaussian', 'complement'])\
               .build())

# Evaluate model
nbevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# Create 5-fold CrossValidator
nbcv = CrossValidator(estimator = nb,
                    estimatorParamMaps = nbparamGrid,
                    evaluator = nbevaluator,
                    numFolds = 3)

# Run cross validations
nbcvModel = nbcv.fit(train)

bestModel = nbcvModel.bestModel 
print("smoothing best param: ")
print(bestModel._java_obj.getSmoothing())
print("modelType best param: ")
print(bestModel._java_obj.getModelType())

smoothing best param: 
12.0
modelType best param: 
complement


In [0]:
# tuned version
nb_tuned = NaiveBayes(labelCol="label", featuresCol="rawFeatures_cv", smoothing=12, modelType='complement')

# create dataframes to be used by my model (one for old, new, and combined)
nb_df_old = df_old.select(['label', 'rawFeatures_cv']).cache()
nb_df_new = df_new.select(['label', 'rawFeatures_cv']).cache()
nb_df_all = df_all.select(['label', 'rawFeatures_cv']).cache()

# evaluate model on all 3 datasets
model_evaluation(nb_df_old, nb_tuned, "report")
model_evaluation(nb_df_new, nb_tuned, "new")
model_evaluation(nb_df_all, nb_tuned, "combined")

Using model on report data
Accuracy: 0.852509537539015
Precision: 0.8526106223681558
Recall: 0.8525126971398022
F1: 0.852509537539015
Confustion Matrix: 
[[6399. 1045.]
 [1162. 6358.]]
-------------------
Using model on new data
Accuracy: 0.5190835367126936
Precision: 0.7736825980392157
Recall: 0.65234375
F1: 0.5190835367126936
Confustion Matrix: 
[[166.   0.]
 [ 89.   1.]]
-------------------
Using model on combined data
Accuracy: 0.8495263972452214
Precision: 0.8496787405067011
Recall: 0.8495447697648523
F1: 0.8495263972452214
Confustion Matrix: 
[[6578. 1072.]
 [1225. 6392.]]
-------------------


### Decision Tree

In [0]:
dt_cv_df = df_all.select(['label', 'rawFeatures_cv']).limit(1000).cache()
dt_idf_df = df_all.select(['label', 'features_idf']).limit(1000).cache()
dt_w2v_df = df_all.select(['label', 'word2vec']).limit(1000).cache()

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

def DecisionTreeAccuracy(df, label, feature, bins, depth):
  dt = DecisionTreeClassifier(labelCol=label, featuresCol=feature, maxBins=bins, maxDepth=depth, seed=42)
  (training, testing) = df.randomSplit([0.7,0.3], seed=42)
  
  # fit our model to the training data
  pred = dt.fit(training)

  # apply model to test data
  test_results = pred.transform(testing)
  acc_eval = MulticlassClassificationEvaluator()
  accuracy = acc_eval.evaluate(test_results)
  return accuracy

# determine accuracy of model using each type of feature
dt_cv_acc = DecisionTreeAccuracy(dt_cv_df, "label", "rawFeatures_cv", 10, 5)
dt_idf_acc = DecisionTreeAccuracy(dt_idf_df, "label", "features_idf", 10, 5)
dt_w2v_acc = DecisionTreeAccuracy(dt_w2v_df, "label", "word2vec", 10, 5)
print(f"Decision Tree count vectorizer accuracy: {dt_cv_acc}")
print(f"Decision Tree TF-IDF accuracy: {dt_cv_acc}")
print(f"Decision Tree Word2Vec accuracy: {dt_w2v_acc}")

Decision Tree count vectorizer accuracy: 0.6806923702193319
Decision Tree TF-IDF accuracy: 0.6806923702193319
Decision Tree Word2Vec accuracy: 0.7062911014224251


We see a slight edge with Word2Vec - since these models train quickly we can attempt some further tuning

In [0]:
# using cross validation and Word2Vec
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

train, test = dt_w2v_df.randomSplit([0.7,0.3], seed=42)

dt = DecisionTreeClassifier(labelCol="label", featuresCol="word2vec", seed=42)
  
paramGrid = ParamGridBuilder()\
    .addGrid(dt.impurity, ["entropy", "gini"])\
    .addGrid(dt.maxBins, [10, 20, 32])\
    .addGrid(dt.maxDepth, [5, 10, 20, 30])\
    .build()

crossval = CrossValidator(estimator=dt,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3, seed=42)

cvModel = crossval.fit(train)

bestModel = cvModel.bestModel
 
print("Impurity parameter: ")
print(bestModel._java_obj.getImpurity())
print("maxDepth best param: ")
print(bestModel._java_obj.getMaxDepth())

Impurity parameter: 
entropy
maxDepth best param: 
20


In [0]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
dt_tuned_w2v = DecisionTreeClassifier(labelCol="label", featuresCol="word2vec", maxBins=32, maxDepth=20, seed=42, impurity="entropy")
dt_df_old = df_old.select(['label', 'word2vec']).cache()
dt_df_new = df_new.select(['label', 'word2vec']).cache()
dt_df_all = df_all.select(['label', 'word2vec']).cache()
model_evaluation(dt_df_old, dt, "report")
model_evaluation(dt_df_new, dt, "new")
model_evaluation(dt_df_all, dt, "combined")

Using model on report data
Accuracy: 0.7385064513211624
Precision: 0.7385076898024705
Recall: 0.7385057471264368
F1: 0.7385064513211624
Confustion Matrix: 
[[5492. 1952.]
 [1961. 5559.]]
-------------------
Using model on new data
Accuracy: 0.5437450508710466
Precision: 0.5591667637465051
Recall: 0.53515625
F1: 0.5437450508710466
Confustion Matrix: 
[[96. 70.]
 [49. 41.]]
-------------------
Using model on combined data
Accuracy: 0.743561555046057
Precision: 0.7479101226352036
Recall: 0.744416060784699
F1: 0.743561555046057
Confustion Matrix: 
[[5250. 2400.]
 [1502. 6115.]]
-------------------


### Finding Outliers:

In [0]:
# get the test results of the best model to check predicitons

def get_test_results(df, model):
  (training, testing) = df.randomSplit([0.7,0.3], seed=42)
  
  # fit our model to the training data
  pred = model.fit(training)

  # apply model to test data
  test_results = pred.transform(testing)
  
  return test_results


Our best model is the Naive Bayes with Count Vectorizer - now we can find mismatched between predicitons and labels

In [0]:
#Inlcuding filtered_body to give context for misclassification
nb_df_all_with_body = df_all.select(['label', 'rawFeatures_cv', 'filtered_body']).cache()
nb_tuned_prediction = get_test_results(nb_df_all_with_body, nb_tuned)

# Filtering our data to find the predicted values and corresponding text
nb_prediction_df = nb_tuned_prediction.select("label", "prediction","filtered_body")

In [0]:
# find the sum of label and prediction, if it equals 1 we have a mismatch
summed_df = nb_prediction_df.withColumn('label_plus_prediction', nb_prediction_df.label + nb_prediction_df.prediction)
df_mismatch_predictions = summed_df.filter(summed_df.label_plus_prediction == 1)
df_mismatch_predictions.show()

+-----+----------+--------------------+---------------------+
|label|prediction|       filtered_body|label_plus_prediction|
+-----+----------+--------------------+---------------------+
|  0.0|       1.0|[giant, monster, ...|                  1.0|
|  0.0|       1.0|[wish, kid, movi,...|                  1.0|
|  0.0|       1.0|[premis, movi, ti...|                  1.0|
|  0.0|       1.0|[idiocraci, lates...|                  1.0|
|  0.0|       1.0|[think, cheesiest...|                  1.0|
|  0.0|       1.0|[rais, time, movi...|                  1.0|
|  0.0|       1.0|[mani, level, goo...|                  1.0|
|  0.0|       1.0|[perhap, one, fir...|                  1.0|
|  0.0|       1.0|[watch, cliffhang...|                  1.0|
|  0.0|       1.0|[recent, bought, ...|                  1.0|
|  0.0|       1.0|[someday, somebod...|                  1.0|
|  0.0|       1.0|[brain, blood, st...|                  1.0|
|  0.0|       1.0|[young, ladi, nam...|                  1.0|
|  0.0| 

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def array_to_string(my_list):
    return '[' + ','.join([str(elem) for elem in my_list]) + ']'

array_to_string_udf = udf(array_to_string, StringType())

df_mismatch_predictions = df_mismatch_predictions.withColumn('filtered_body', array_to_string_udf(df_mismatch_predictions["filtered_body"]))

In [0]:
df_mismatch_predictions.show()

+-----+----------+--------------------+---------------------+
|label|prediction|       filtered_body|label_plus_prediction|
+-----+----------+--------------------+---------------------+
|  0.0|       1.0|[giant,monster,fa...|                  1.0|
|  0.0|       1.0|[wish,kid,movi,st...|                  1.0|
|  0.0|       1.0|[premis,movi,tick...|                  1.0|
|  0.0|       1.0|[idiocraci,latest...|                  1.0|
|  0.0|       1.0|[think,cheesiest,...|                  1.0|
|  0.0|       1.0|[rais,time,movi,r...|                  1.0|
|  0.0|       1.0|[mani,level,good,...|                  1.0|
|  0.0|       1.0|[perhap,one,first...|                  1.0|
|  0.0|       1.0|[watch,cliffhang,...|                  1.0|
|  0.0|       1.0|[recent,bought,mo...|                  1.0|
|  0.0|       1.0|[someday,somebodi...|                  1.0|
|  0.0|       1.0|[brain,blood,star...|                  1.0|
|  0.0|       1.0|[young,ladi,name,...|                  1.0|
|  0.0| 

In [0]:
# display in order to download
display(df_mismatch_predictions)

label,prediction,filtered_body,label_plus_prediction
0.0,1.0,"[giant,monster,fan,see,yeti,absolut,must,especi,hear,much,thank,good,bootleg,market,abl,find,copi,pretti,easili,happili,surpris,upon,watch,flick,actual,dare,say,decentdec,actual,name,cheesi,giantmonst,flick,kick,pretti,quick,yeti,found,pretti,much,immedi,get,introduc,various,charact,consist,sleazi,one,good,one,girl,pretti,much,one,downright,strike,beauti,girl,cheesi,scifi,film,faryeti,look,like,longhair,guy,straight,origin,woodstock,concert,realli,hes,bad,dude,especi,introduc,world,kind,funki,cagelik,thing,godzilla,despit,rude,awaken,doesnt,even,rampag,actual,rare,destroy,anyth,whole,pictur,kinda,look,puzzl,tri,figur,thing,yeti,seem,understand,english,pretti,nice,copi,dub,english,know,good,guy,bad,guy,arehowev,want,see,giant,yeti,thing,hes,pretti,much,whole,movi,typic,lowbudget,fashion,seem,chang,size,lot,depend,scene,there,even,bunch,fake,leg,shot,stand,therey,special,effect,arent,greatest,definit,good,one,scene,yeti,smash,warehous,done,well,anoth,use,window,build,ladder,step,climb,top,shatter,window,foot,often,shock,occup,insid,one,sequenc,realli,look,much,much,better,bad,movieyeti,never,stoop,low,say,ape,actual,time,even,come,close,genuin,silli,beauti,girl,caus,yeti,nippl,becom,erect,lift,eyebrow,yeah,babi,manner,even,isnt,bad,kinda,even,get,laugh,viewerth,movi,pretti,long,kind,thing,surpris,enough,doesnt,get,bore,stori,actual,good,watch,utter,gorgeous,actress,screen,make,male,viewer,happyyeti,may,upper,echelon,giant,monster,flick,definit,better,king,kong,ripoff,like,ape,queen,kong,far]",1.0
0.0,1.0,"[wish,kid,movi,still,made,way,dark,deep,get,charact,develop,charli,epitom,dynam,charact,plot,develop,superb,anim,emot,involv,ration,relat,consist,theme,hand,songandd,routin,would,never,thought,kid,movi,give,high,rate,movi,excel,film,let,alon,kid,movi,bring,second,point,got,darkest,kid,movi,ive,seen,quit,time,come,shock,see,child,age,complet,terrifi,throughout,great,deal,latter,half,first,half,movi,end,one,saddest,end,could,ever,come,across,ala,jurass,bark,futurama,fan,make,movi,good,movi,univers,evok,emot,dont,normal,like,feel,assum,bad,make,movi,bad,fact,mean,succeed,good,funni,movi,suppos,make,laugh,good,horror,movi,suppos,make,scare,good,sad,movi,suppos,make,sad,point,good,movi,suppos,move,simpli,entertain,movi,move,mealso,movi,incred,violent,today,standard,kid,movi,contain,subject,matter,today,standard,may,suitabl,children,parent,say,watch,first,usual,one,say,anyth,kind,thing,saw,yesterday,came,surpris,even]",1.0
0.0,1.0,"[premis,movi,tickl,imagin,quit,time,weve,heard,read,kind,context,would,alon,world,would,entir,world,sudden,disappear,front,eye,fact,last,part,actual,happen,dave,andrew,two,roommat,live,rundown,hous,middl,freeway,system,andrew,nervous,wreck,say,least,dave,consid,one,biggest,loser,societi,alon,main,reason,two,guy,get,well,along,simpli,turn,comfort,need,untilstraight,begin,film,lot,lot,problem,happen,get,involv,crime,andrew,suffer,paranoia,simpli,doesnt,dare,go,hous,dave,unsuccess,job,colleagu,dont,treat,well,respect,deserv,amount,problem,face,keep,increas,one,day,may,face,inevit,deal,much,wish,everyth,would,away,cours,exact,happensth,rest,stori,place,dave,andrew,world,nothing,first,surpris,problem,understand,deal,featur,crazi,environ,later,find,everyth,want,seem,one,still,leftnoth,featur,incred,small,cast,fact,besid,first,coupl,shot,film,see,dave,david,hewlett,andrew,andrew,miller,entir,film,clear,order,pull,cast,task,world,noth,exist,noth,distract,viewer,way,vincenzo,decid,use,reason,amount,closeup,head,shot,make,interest,actual,work,quit,well,director,photographi,derek,roger,also,nice,way,teas,audienc,withhold,visual,inform,especi,time,charact,see,someth,react,dont,see,right,awayobvi,cant,event,driven,film,much,action,happen,outsid,hous,move,around,void,that,hilari,scene,take,place,especi,case,andrew,discov,candi,barnow,one,could,think,noth,look,like,well,look,like,noth,inde,entir,world,noth,white,white,matter,direct,look,weak,film,hour,less,get,extrem,bore,look,event,make,sure,interest,look,thank,god,exampl,time,two,lad,due,properti,noth,abl,jump,realli,high,noth,made,tofu,andrew,claim,fun,see,instant,abl,use,noth,becom,god,littl,societyon,best,part,film,set,product,design,jasna,stefanov,done,beauti,job,film,hous,two,guy,live,unnatur,fun,look,still,seem,right,two,live,place,like,product,design,doubt,one,power,aspect,film,realli,make,film,worth,watchinghowev,best,part,film,act,david,hewlett,andrew,miller,realli,look,like,profession,actor,camera,everi,second,film,previous,said,prop,film,realli,bare,stage,plenti,charact,develop,decent,onelin,clever,dialogu,time,hilari,stupid,work,end,realli,move,movi,away,lowbudget,area,wellcraft,handworklet,talk,littl,visual,effect,definit,worth,mention,noth,featur,digit,visual,effect,prosthet,equal,modern,horror,film,there,rather,horrifi,dream,sequenc,film,although,drew,milk,scene,complet,still,fun,watch,one,best,visual,effect,film,end,andrew,dave,sudden,discov,power,environ,abil,wish,everyth,away,way,around,make,thing,appearnoth,bright,welllit,movi,realli,help,promot,idea,probabl,dead,fact,one,theori,noth,comedi,slowli,destroy,theori,dont,know,happen,dont,know,ever,get,movi,end,see,anyth,like,end,way,good,couldv,rather,easi,predict,go,happen,still,writer,thought,incid,help,make,littl,interest,end,reason,satisfactori,onetak,hollow,man,kill,bill,cube,epoch,lot,film,noth,realli,amalgam,differ,style,still,film,least,know,noth,realli,like,peopl,rememb,origin,cube,product,commentari,dvd,may,rememb,vincenzo,natali,talk,came,stori,cube,talk,andré,bijel,roommat,time,extrem,dull,room,hope,get,noth,could,well,screen,version,origin,cube,stori,end,almost,like,one,cube,prequelswhat,say,enjoy,noth,great,movi,differ,part,movi,extrem,wellmad,ton,intellig,idea,still,feel,movi,miss,someth,problem,find,precis,mayb,noth,answer,question,noth,great,film,good,expect,befin,rate]",1.0
0.0,1.0,"[idiocraci,latest,film,come,mike,offic,space,judg,certain,follow,similar,theme,film,fact,observ,stupid,mediocr,overcom,advers,relat,speak,stori,joe,bauer,luke,wilson,quit,liter,averag,guy,exist,joe,prostitut,name,rita,maya,rudolph,becom,test,subject,militari,project,hibern,chamber,remain,suspend,one,year,due,lack,oversight,joe,rita,forgotten,accident,wake,year,futureher,scari,part,film,explain,realist,plausibl,way,entir,popul,becam,absolut,retard,natur,predat,evolut,human,speci,necessarili,favor,quickest,smartest,strongest,peopl,progress,gene,peopl,breed,unfortun,peopl,happen,welfaresuck,trailer,trash,idiot,breed,like,rabbit,abund,reproduct,stupid,peopl,caus,advers,effect,societ,growth,joe,rita,two,smartest,human,be,face,planet,help,imagin,entir,popul,hybrid,redneck,jock,cholo,hoochi,see,nightmarish,dystopia,joe,learn,attempt,track,time,machin,see,rita,get,back,came,that,basic,whole,plotbut,despit,onedimension,may,make,sound,movi,higher,brow,fathom,nuanc,everywher,anyon,see,glimps,warn,sign,modern,day,dumbcieti,permeat,facet,everyday,life,turn,train,wreck,display,idiocraci,film,truli,awesom,showcas,realist,retarded,put,pedest,dont,want,give,anyth,away,ruin,joke,let,say,pretti,thorough,see,would,say,lot,toilet,humor,odd,may,seem,purpos,show,dumb,crass,peopl,arethi,film,unfortun,destin,see,fate,predecessor,offic,space,one,see,theater,everyon,brag,discov,awesomefunni,movi,come,video,complaint,film,would,flow,narrat,sometim,get,broken,hitchhikersguidetothegalaxi,type,exposit,thing,got,necessari,evil,implement,better,good,charact,funni,joke,betterthanaverag,social,commentari,wrap,funni,bowfin,note,see,youth,becom,gangbang,wannab,act,like,redneck,ghetto,trash,proud,educ,cultur,anyway,see,countri,spiral,control,abyss,stupid,god,sake,watch,movi]",1.0
0.0,1.0,"[think,cheesiest,guilti,pleasuretyp,movi,first,thing,think,slasher,flick,realli,bad,slasher,flick,formula,type,film,script,need,part,blood,sever,part,nuditi,get,madeflash,forward,late,slasher,flick,revit,success,scream,like,film,formula,mask,lack,inspir,label,hip,tongueincheek,parodi,origin,slasher,flick,recent,blend,hip,parodi,neoslash,flick,one,worth,see,lowbudget,directtovideo,cutlik,new,slasher,flick,cut,reli,product,slasher,flick,case,fiction,film,hot,blood,make,commentari,genr,hot,blood,never,finish,product,kill,someon,wear,mask,film,killer,scarman,bald,figur,mouth,stitch,close,dark,pupilless,eye,year,later,group,film,student,whose,professor,involv,product,decid,vault,tap,origin,surviv,actress,finish,film,everi,time,film,screen,scene,shot,scarman,return,someon,die,quot,taglin,finish,film,finish,themthi,sound,realli,bad,degre,realli,thing,good,slasher,flick,charact,develop,new,director,reveal,daughter,hot,blood,origin,director,whose,life,appar,ruin,product,cancel,wouldv,perfect,detail,work,plot,yet,never,mention,like,slasher,flick,mani,bodi,care,actor,arent,great,even,directtovideo,standard,fun,charact,arent,inadvert,charact,act,sinc,none,charact,film,want,work,hot,blood,particular,whoever,lucki,enough,play,scarman,cut,climax,big,dunnit,unmask,killer,like,scream,film,doesnt,gimmick,kill,urban,legend,film,origin,interest,concept,dilut,way,write,sequel,sell,well,end,that,par,coursebi,sensibl,view,standard,horribl,movi,avoid,qualiti,make,true,root,slasher,genr,make,enjoy,neoslash,flick]",1.0
0.0,1.0,"[rais,time,movi,releas,probabl,influenc,shallow,mind,still,isnt,bad,movi,mean,movi,hostag,situat,involv,prep,school,popul,extent,endear,teenag,boy,cant,seem,get,troubl,what,wrong,doesnt,big,special,effect,need,special,effect,cinema,declin,began,around,time,special,effect,popular,coincid,think,turn,movi,potenti,good,plot,feel,turn,big,substanceless,light,show,innoc,kid,selfmed,well,know,movi,need,special,effect,three,fourth,movi,imdb,top,without,special,effect,almost,top,gross,movi,time,special,effect,think,star,war,ghostbust,etc,good,movi,rest,topgross,movi,usual,clich,tripe,nonsens,plot,lot,eye,candi,well,movi,dont,need,junkexcus,go,tangent,normal,fed,special,effect,junk,back,point,toy,soldier,simpli,great,movi,admit,content,littl,corni,rip,everi,movi,rip,anoth,extent,think,resovoir,dog,countless,appreci,site,dictat,fact,belov,quentin,tarentino,admit,like,copi,mani,mani,mani,movi,make,first,major,film,reservoir,dog,mani,say,entir,plot,rip,almost,scene,scene,japanes,chines,gangster,movi,tarentino,love,much,probabl,still,sorri,tangenttoy,soldier,fun,whole,insubordin,teenag,unwant,member,author,hostag,taker,fun,see,kid,take,theyr,held,someth,dont,want,hell,teenag,angstinspir,rebelion,key,topic,great,major,comedi,plus,there,tension,thrill,charact,use,firearm,knock,bad,guy,etc,plus,there,emot,point,film,one,charact,die,other,cope,adjust,perfect,act,beat,tripe,therein,short,toy,soldier,excit,interest,fun,dare,jade,blowhard,rate,movi,poor,shame,allperson,rate]",1.0
0.0,1.0,"[mani,level,good,fact,consid,lowbudget,british,indi,first,time,featuredirector,larg,neophyt,cast,magnific,achiev,dont,know,much,cost,figur,bandi,public,never,know,reliabl,figur,like,point,film,look,like,cost,coupl,million,quid,clear,cost,tini,fraction,great,special,effect,terrif,product,design,effect,prop,costum,excel,photographi,good,act,direct,impress,score,absolut,stun,sound,mix,even,said,much,script,great,charact,clear,identifi,someth,movi,ten,men,dress,rough,one,locat,would,easi,nameless,faceless,blank,ten,charact,most,done,dialogu,way,react,thing,throughout,middl,act,plot,develop,script,told,stori,well,show,affect,charact,whole,film,like,second,act,would,stunningbefor,ship,blow,twelv,peopl,make,individu,escap,pod,epod,blast,away,ship,theyr,much,automat,metal,coffin,poor,sod,insid,trap,cramp,real,idea,theyr,go,make,sens,like,epod,theyr,excel,idea,done,well,make,sens,nice,roomi,escap,capsul,also,like,way,specif,told,later,design,shiptoship,escap,make,planetfal,emerg,let,face,guy,bloodi,lucki,ship,blown,close,planet,said,doesnt,look,like,unus,epod,still,freighter,wonder,prison,abl,get,epod,get,occur,shouldnt,captain,gone,ship,rather,first,guy,anyway,epod,land,barren,planet,noth,sand,spars,veget,least,sandi,spars,veget,part,planet,may,ici,wast,lush,jungl,elsewher,nah,planet,scifi,movi,exact,accept,epod,come,within,mile,ten,survivor,abl,meet,fire,flare,sky,locat,otherth,captain,muscular,mountain,man,could,pretti,good,career,action,flick,get,right,agent,decid,tri,contact,captain,behan,intend,rendezv,planet,need,get,orbit,engin,say,combin,power,unit,two,epod,probabl,give,one,enough,juic,lift,antigrav,doodad,high,enough,blast,atmospher,done,automat,need,pilot,send,signal,captain,valiant,volunt,commend,sensibl,move,engin,point,put,heaviest,man,somewhat,dodgili,repair,epod,ridicul,need,lightest,member,team,that,kid,realli,like,way,point,name,david,captain,start,use,treat,digniti,respect,good,storytel,good,characteris]",1.0
0.0,1.0,"[perhap,one,first,slasher,film,came,halloween,although,made,irwin,yabalan,halloween,must,say,honest,found,tourist,trap,scarier,funner,tourist,trap,one,remark,treat,find,everi,left,enjoy,feel,surpris,destini,tell,one,night,local,blockbust,one,two,month,went,busi,noth,get,stumbl,upon,movi,think,huh,seem,like,laughabl,bmovi,rent,took,home,boy,good,scare,tourist,trap,come,bad,movi,definit,cheesi,momentsbut,end,your,much,fun,realli,careth,thing,impress,absolut,thing,made,movi,one,scariest,one,ive,seen,number,one,set,horror,movi,without,good,set,isnt,fun,love,love,love,locat,feel,like,relat,almost,feel,like,weve,make,creepier,next,charact,non,realli,stereotyp,real,person,exampl,theyr,stoner,alcohol,even,sex,obsess,peopl,feel,like,normal,young,adult,plus,look,realist,enough,chuck,connor,give,great,perform,slausen,take,instant,like,hes,real,feel,like,nice,guy,grandfath,figur,ador,last,probabl,import,thing,make,movi,scari,make,audienc,jump,half,way,seat,turn,right,right,reason,exampl,time,horror,film,main,jump,sudden,chang,music,pitch,tourist,trap,prepar,that,case,perfect,use,light,mannequin,weird,feel,utter,creep,plus,love,although,may,bit,overthetop,thing,still,feel,like,could,happen,exact,horror,movi,take,ridicul,premis,make,audienc,feel,unsaf,terrifiedoveral,far,major,problem,none,weird,may,get,littl,weird,time,howev,end,tourist,trap,hold,place,near,dear,heart,one,horror,film,make,turn,light,feel,unsaf,travel]",1.0
0.0,1.0,"[watch,cliffhang,make,nostalg,earli,time,virtual,everi,new,action,movi,could,describ,die,hard,cliffhang,die,hard,mountain,pretti,good,isbut,unlik,passeng,sieg,decent,die,hard,clone,term,cliffhang,dispens,enclos,feel,mani,action,movi,embrac,breathtak,landscap,immens,threaten,overwhelm,trivial,conflict,peopl,fight,die,among,peaksyear,movi,like,simpl,plan,fargo,dramat,crime,murder,snowbound,locat,cliffhang,director,renni,harlin,recogn,visual,impact,juxtapos,brutal,violenc,grim,struggl,surviv,cold,indiffer,natur,surroundingsth,open,sequenc,alreadi,receiv,substanti,prais,deserv,intens,allow,forget,artific,camera,actor,simpli,believ,see,actual,happen,even,harlin,shot,fall,stuf,anim,power,effect,still,threaten,becom,much,joke,repeat,deep,blue,sea,ridicul,express,ralph,wait,face,dim,sequenc,powerth,next,impress,setpiec,gunfight,heist,aboard,jet,written,stallon,michael,franc,direct,harlin,audienc,plung,action,initi,know,agent,involv,theft,bloodi,doublecross,complet,unexpect,roger,ebert,observ,stuntman,made,midair,transfer,plane,deserv,special,recognitionlat,avalanch,sequenc,one,terroriststhiev,appear,actual,fall,wall,snow,carri,mountain,far,know,one,kill,make,movi,small,miracl,consid,extrem,natur,stunt,obvious,dummi,use,shot,shot,remain,impress,left,wonder,harlin,like,one,secondunit,director,knew,exact,place,camerail,take,sli,stallon,action,hero,day,week,hes,one,movi,star,ive,ever,seen,whos,complet,convinc,someon,withstand,lot,physic,emot,pain,time,actual,feel,pain,role,gabe,walker,realli,complement,stallon,act,strength,play,older,vulner,kind,action,hero,give,impress,lowkey,perform,mountain,rescuer,must,redeem,himselfin,contrast,mani,today,postmatrix,comic,bookinspir,action,hero,stallon,walker,ordinari,man,becom,hero,without,paranorm,computerenhanc,abil,cliffhang,hero,almost,freez,death,cloth,start,show,big,tear,bare,escap,one,danger,situat,anoth,winc,hes,hit,bleed,hes,cut,particular,cavern,sequenc,take,rockystyl,pummel,one,maddog,villainsit,note,utter,despic,villain,realli,contribut,movi,effect,first,saw,movi,teenag,root,good,guy,everi,step,way,anticip,anoth,bad,guy,would,bite,dust,rather,ice,one,point,actual,cheer,one,coldblood,charact,movi,deserv,suffer,violent,demiselithgow,british,accent,unconvinc,movi,occasion,model,plane,model,helicopt,hes,fundament,good,actor,one,perfect,recit,silli,dialogu,one,scene,look,hostag,stallon,rooker,tri,decid,task,give,actual,say,stay,fetch,even,better,actor,anthoni,hopkin,might,troubl,lineeven,cliffhang,occasion,toss,credibl,asid,sake,entertain,showear,movi,exampl,lithgow,open,say,one,men,retir,stallon,come,real,crimin,mastermind,would,made,mistak,even,unconsci,careless,allow,rooker,shout,warn,sli,rock,face,precipit,grip,tugofwar,stallon,bad,guy,tri,pull,rope,tie,leglithgow,could,given,order,subtl,mean,sequenc,might,much,fun,watch,hadnt,given,rooker,opportun,open,defi,arrog,captordon,much,style,saturday,matine,serial,time,western,cliffhang,built,solid,foundat,surviv,weak,element,would,undermin,lesser,filmbesid,pain,obvious,aircraft,model,mention,weak,moment,includ,coupl,scene,shot,cheap,indoor,set,realli,fake,snow,well,two,scene,involv,bat,wolv,seem,unnecessari,alreadi,actionpack,narrat,final,harlin,decis,film,death,scene,slow,motion,seem,pointless,sinc,techniqu,contribut,noth,scenesit,shame,stallon,old,action,movi,charact,movi,seem,credibl,inevit,wonder,would,like,year,later,perhap,best,cliffhang,stand,time,without,sequel,enough,tire,obsolet,movi,franchis,alreadi,unoffici,sequel,call,vertic,limit,compar,clinker,cliffhang,belong,imdb,top,listrat,good,especi,consid,stallon,movi]",1.0
0.0,1.0,"[recent,bought,movi,three,buck,garag,sale,glad,didnt,pay,usual,dvd,pleasant,surpris,good,film,wasit,set,like,horror,antholog,broken,tale,includ,connector,stori,involv,four,teenag,whos,car,break,dark,lone,road,middl,night,appar,kiddo,like,horror,stori,that,decid,morn,tell,spooki,stori,around,campfir,charact,take,turn,tell,stori,stori,wrap,nice,littl,twistth,first,stori,hook,kinda,wast,time,bit,bland,dull,luckili,one,main,stori,last,mayb,minut,intend,mere,introduc,film,easi,overlook,unorigin,pieceth,second,stori,honeymoon,okay,guess,two,peopl,honeymoon,theyr,travel,around,head,las,vega,decid,stop,somewher,night,theyr,quick,warn,mysteri,stranger,leav,locat,risk,attack,danger,unknown,creatur,stori,pretti,good,setup,mere,deliveri,basic,fair,entertain,mysteri,till,monster,show,kinda,iffyth,third,stori,peopl,lick,person,favorit,involv,young,girl,whos,parent,go,night,whos,older,sis,ditch,parti,shes,go,alon,fact,make,known,internet,buddi,troubl,internet,buddi,exact,thirteenyearold,girl,stori,conclus,slight,less,climat,might,like,seen,still,pretti,dang,goodth,final,stori,locket,definit,one,pack,atmospher,set,creepi,mansion,dark,raini,night,tale,young,man,play,glenn,quinn,travel,around,motorcycl,stumbl,across,hous,time,conveni,problem,bike,meet,mute,girl,live,hous,fall,love,first,sight,unfortun,everyth,fine,dandi,might,someth,locket,hang,around,pretti,neckaft,stori,wrap,present,twist,involv,four,teen,car,twist,retrospect,obvious,didnt,realli,see,come,quit,pleas,conclus,filmso,good,movi,rent,unfortun,dont,realli,think,lot,rewatch,valu,next,time,your,mood,vagu,scari,litt,flick,vein,tale,darksid,someth,grab,movi,popcorn,turn,light,treat,surpris,nifti,littl,flick]",1.0
