# Post Processing: validate model with reddit data
- 1) Add VADER sentiment scores to the preprocessed Reddit posts and comments
- 2) Predict spoilers with the trained model (input: word vector + sentiment score)
- 3) Validation report
- 4) Output DB tables: posts and comments; each contains the original post, karma, sentiment, and prediction

In [17]:
import json
from pprint import pprint
from IPython.display import display
import pandas as pd
import arrow
import numpy as np

from multiprocessing import Pool

import _pickle as Pickle
def loadPickle(f):
    """Unpickle and return the object stored in the file at path ``f``.

    The file is opened in binary mode inside a ``with`` block so the handle is
    closed as soon as loading finishes (the previous lambda left it open until
    garbage collection).

    NOTE(review): unpickling can execute arbitrary code; only use this on
    files that were produced by a trusted source.
    """
    with open(f, 'rb') as handle:
        return Pickle.load(handle)

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from sklearn.metrics import accuracy_score, classification_report

## Read Reddit data and word vector

In [18]:
# Comment tables (one CSV per year) and the pickled comment vectors for the same years.
com1 = pd.read_csv('comments2018final_all.csv', low_memory=False)
com2 = pd.read_csv('comments2019final_all.csv', low_memory=False)
com2018 = loadPickle('comment2018vec.p')
com2019 = loadPickle('comment2019vec.p')

# Frames and vectors are concatenated in the same 2018 -> 2019 order.
# NOTE(review): assumes each pickle holds one vector per CSV row, in row order - confirm.
com = pd.concat([com1, com2], sort=True)
com_vec = np.hstack([com2018, com2019])

######################################################################
# Post tables and the pickled title vectors, handled the same way as the comments.
post1 = pd.read_csv('posts2018final_all.csv', low_memory=False)
post2018 = loadPickle('movie2018title_vec.p')
post2 = pd.read_csv('posts2019final_all.csv', low_memory=False)
post2019 = loadPickle('movie2019title_vec.p')

# Join data
pos = pd.concat([post1, post2], sort=True)
pos_vec = np.hstack([post2018, post2019])
# Renumber rows 0..n-1 so the index labels collected below can be used
# directly as positions into com_vec / pos_vec.
com = com.reset_index(drop=True)
pos = pos.reset_index(drop=True)

# Clean NA
# Keep only rows whose cleaned text is not a float (pandas represents missing
# CSV cells as float NaN). The builtin `float` is compared against here:
# `np.float` was merely an alias for it and no longer exists in NumPy >= 1.24.
com_id = com[com.body_clean.apply(type) != float].index
pos_id = pos[pos.title_clean.apply(type) != float].index
pos.id = pos.id.astype(str)
# Additionally drop posts whose id reads 'inf' once cast to string.
pos_id = np.setdiff1d(pos_id, pos[pos['id'] == 'inf'].index)

# .copy() detaches the filtered frames from their parents, so the columns that
# later cells add to `com` / `pos` do not trigger SettingWithCopyWarning.
com = com[com.index.isin(com_id)].copy()
pos = pos[pos.index.isin(pos_id)].copy()
com_vec = com_vec[com_id]
pos_vec = pos_vec[pos_id]

# ######################################################################
# Each vector count must equal the row count of its frame.
print(f'comment vec shape: {com_vec.shape[0]:>6}\ncomment df shape: {com.shape[0]:>7}\n'+
     f'post vec shape: {pos_vec.shape[0]:>9}\npost df shape: {pos.shape[0]:>10}')

comment vec shape:  21105
comment df shape:   21105
post vec shape:     16490
post df shape:      16490


## Add Sentiment Score

In [19]:
# One shared analyser instance, reused by every call to sent().
analyser = SentimentIntensityAnalyzer()


def sent(sentence):
    """Return the 'compound' entry of the analyser's polarity scores for ``sentence``."""
    scores = analyser.polarity_scores(sentence)
    return scores['compound']

In [20]:
# PRED SENTIMENT
# Score every cleaned comment body and every cleaned post title with sent().
# This cell previously started a 32-process multiprocessing Pool and closed it
# again without ever submitting work to it; the scoring itself always ran
# serially through Series.apply. The idle pool has been dropped, which leaves
# the resulting 'vader' columns unchanged.
com['vader'] = com.body_clean.apply(sent)
pos['vader'] = pos.title_clean.apply(sent)

In [21]:
# LOAD SPOILER PRED MODEL
# Unpickle the trained spoiler-prediction model from 'lgb.p' using loadPickle.
# NOTE(review): unpickling executes code embedded in the file; only load a
# model file that was produced by a trusted source.
# NOTE(review): the file name suggests a LightGBM model - confirm.

model = loadPickle('lgb.p')


In [22]:
# PREPARE DATA FOR MODEL PREDICT
VEC_DIM = 300  # width of a single text vector (the reshape target for both frames)


def _with_sentiment(vectors, sentiment):
    """Stack ``vectors`` into an (n, VEC_DIM) matrix and append ``sentiment`` as a last column.

    Parameters
    ----------
    vectors : iterable of array-likes, or a 2-D array
        One VEC_DIM-wide vector per row. A matrix that is already
        VEC_DIM + 1 wide is accepted as well (see the re-run guard below).
    sentiment : 1-D array-like
        One sentiment score per row.

    Returns
    -------
    numpy.ndarray of shape (n, VEC_DIM + 1)

    Raises
    ------
    ValueError
        If the number of vectors and the number of sentiment scores differ.
    """
    stacked = np.array(list(vectors))
    # Re-run guard: this cell overwrites com_vec / pos_vec, so on a second
    # execution the input already carries the sentiment column. Strip it
    # instead of failing (or silently mis-reshaping) in the reshape below.
    if stacked.ndim == 2 and stacked.shape[1] == VEC_DIM + 1:
        stacked = stacked[:, :VEC_DIM]
    stacked = stacked.reshape(-1, VEC_DIM)

    scores = np.asarray(sentiment).reshape(-1, 1)
    if stacked.shape[0] != scores.shape[0]:
        raise ValueError(
            f'{stacked.shape[0]} vectors but {scores.shape[0]} sentiment scores'
        )
    return np.hstack([stacked, scores])


com_vec = _with_sentiment(com_vec, com.vader.values)
pos_vec = _with_sentiment(pos_vec, pos.vader.values)

In [23]:
# Run the model over each feature matrix and store its predictions, cast to
# int, in a 'pred' column on the matching frame.
for frame, features in ((com, com_vec), (pos, pos_vec)):
    frame['pred'] = model.predict(features).astype(int)

In [24]:
# Show the distinct predicted values for comments and for posts.
# Only the last bare expression of a cell is rendered, so the comment result
# used to be computed and thrown away; display() renders both.
display(com.pred.unique(), pos.pred.unique())

array([0, 1])

In [25]:
# model.best_estimator_.feature_importances_

In [26]:
# Convert each created_utc value with arrow.get and keep the result in a
# 'time' column, for comments and posts alike.
for frame in (com, pos):
    frame['time'] = frame.created_utc.apply(arrow.get)

In [27]:
# Store the comment score column as integers.
com['score'] = com['score'].astype(int)
# pos.score

In [28]:
# Columns (and their order) written to each output table.
comment_columns = ['id', 'link_id', 'author', 'score', 'pred', 'vader', 'body', 'time']
post_columns = ['id', 'author', 'score', 'pred', 'vader', 'num_comments', 'title', 'time']

# Write both tables as CSV without the DataFrame index.
com.loc[:, comment_columns].to_csv('DB_comments.csv', index=False)
pos.loc[:, post_columns].to_csv('DB_posts.csv', index=False)

In [29]:
# pos[pos.index==6829]['id']

In [30]:
# The stop-word list does not include "spoiler", so the word is still present
# in the cleaned titles and can be searched for there.
print('spoiler examples:')
has_spoiler_word = pos.title_clean.str.contains('spoiler')
# First five original titles whose cleaned form contains "spoiler".
pos.loc[has_spoiler_word, 'title'].head(5).tolist()

spoiler examples:


['Batman returns and the destruction of childhood [SPOILERS]',
 "Seeing as there isn't an official /r Movies discussion, *I, Tonya* (Unofficial Spoilers Discussion)",
 'What was the best ending scene of 2017? (SPOILERS)',
 '[Spoilers] Something really interesting in “Lady Bird” that I don’t see anyone talking about',
 'SPLIT parallel scene(SPOILERS)']

# reddit validation

In [31]:
def _mentions(text, phrase):
    """Boolean mask: does each string in the Series ``text`` contain ``phrase``?

    Matching is case-insensitive (the text is lower-cased first) and literal
    (no regex). Missing values count as "does not contain".
    """
    return text.str.lower().str.contains(phrase, regex=False, na=False)


# Positive class: text that says "spoiler" but not "no spoiler".
pos_is_spoiler = _mentions(pos.title, 'spoiler') & ~_mentions(pos.title, 'no spoiler')
com_is_spoiler = _mentions(com.body, 'spoiler') & ~_mentions(com.body, 'no spoiler')

# .assign() builds new frames, so no column is written onto a slice of pos/com.
spos = pos.loc[pos_is_spoiler, ['title', 'vader', 'pred']].assign(is_spoiler=1)
scom = (
    com.loc[com_is_spoiler, ['body', 'vader', 'pred']]
    .rename({'body': 'title'}, axis=1)  # share the 'title' column name with posts
    .assign(is_spoiler=1)
)

# DataFrame.append no longer exists in pandas >= 2.0; pd.concat with the same
# sort flag produces the same stacked frame.
s = pd.concat([spos, scom], sort=True)
# display() is needed for the preview: only the final bare expression of a
# cell is rendered automatically.
display(s.head(2))
s.shape

(390, 4)

In [32]:
# pos[pos.title.apply(lambda x: x.lower()).str.contains('no spoiler')]
# com[com.body.apply(lambda x: x.lower()).str.contains('no spoiler')]

In [33]:
# Earlier alternative (kept for reference): sample negatives from titles
# whose cleaned text does not contain "spoiler".
# ns = pos[~pos.title_clean.str.contains('spoiler')][['title', 'vader', 'pred']].\
#     sample(n=len(s), random_state=12)

# Negative class for posts: titles that contain "no spoiler" (case-insensitive).
no_spoiler_title = pos.title.map(lambda text: text.lower()).str.contains('no spoiler')
ns = pos.loc[no_spoiler_title, ['title', 'vader', 'pred']].assign(is_spoiler=0)

In [34]:
# Earlier alternative (kept for reference): sample negatives from comments
# whose cleaned text does not contain "spoiler".
# cs = com[~com.body_clean.str.contains('spoiler')][['body', 'vader', 'pred']].\
#     sample(n=len(s), random_state=12)

# Negative class for comments: bodies that contain "no spoiler" (case-insensitive).
no_spoiler_body = com.body.map(lambda text: text.lower()).str.contains('no spoiler')
cs = (
    com.loc[no_spoiler_body, ['body', 'vader', 'pred']]
    .assign(is_spoiler=0)
    .rename({'body': 'title'}, axis=1)  # same column name as the post frames
)
cs

Unnamed: 0,title,vader,pred,is_spoiler
1504,"In the last Jedi (no spoilers), there is a shi...",-0.1531,1,0
3682,Most critics said that his career was over whe...,0.296,0,0
14693,"Hm. Haven't seen Incendies, but I saw Heridita...",0.4287,0,0


In [35]:
# Validation set: positives (s) followed by the post and comment negatives.
# A single pd.concat replaces the chained DataFrame.append calls, which are
# unavailable in pandas >= 2.0; row order and the sorted column order are the same.
reddit_test = pd.concat([s, ns, cs], sort=True)
display(reddit_test.head(10))

Unnamed: 0,is_spoiler,pred,title,vader
15,1,1,Batman returns and the destruction of childhoo...,-0.5719
35,1,0,Seeing as there isn't an official /r Movies di...,0.0
42,1,0,What was the best ending scene of 2017? (SPOIL...,0.6369
50,1,0,[Spoilers] Something really interesting in “La...,0.4576
101,1,0,SPLIT parallel scene(SPOILERS),0.0
168,1,0,Did Joi really love K in Blade Runner 2049? Wh...,0.6697
187,1,1,Can we all shut up about bombs in space now?! ...,-0.4939
209,1,1,[Potential Spoilers] In regards to Star Wars V...,-0.6249
261,1,1,I have a question about the ending of Dunkirk ...,0.0
314,1,1,Bryan Young explains Canto Bight's importance ...,0.3612


In [36]:
# Full, untruncated text of the first ten titles in the validation set.
reddit_test.head(10)['title'].tolist()

['Batman returns and the destruction of childhood [SPOILERS]',
 "Seeing as there isn't an official /r Movies discussion, *I, Tonya* (Unofficial Spoilers Discussion)",
 'What was the best ending scene of 2017? (SPOILERS)',
 '[Spoilers] Something really interesting in “Lady Bird” that I don’t see anyone talking about',
 'SPLIT parallel scene(SPOILERS)',
 'Did Joi really love K in Blade Runner 2049? What’s your take on it? spoilers*',
 'Can we all shut up about bombs in space now?! (Last Jedi Spoilers) from the official visual guide by Pablo Hidalgo',
 '[Potential Spoilers] In regards to Star Wars VIII happening so closely to VII, would adding a fourth movie to the storyline create enough wiggle room to further develop the conflict and answer questions that were previously left open-ended?',
 'I have a question about the ending of Dunkirk [Spoilers]',
 "Bryan Young explains Canto Bight's importance to the story of The Last Jedi [all spoilers]"]

## Summary

In [37]:
# Per-class precision / recall / F1 of the predictions against the
# keyword-derived is_spoiler labels.
print(classification_report(
    y_true=reddit_test['is_spoiler'],
    y_pred=reddit_test['pred'],
))

              precision    recall  f1-score   support

           0       0.08      0.75      0.14        28
           1       0.95      0.37      0.53       390

    accuracy                           0.39       418
   macro avg       0.52      0.56      0.34       418
weighted avg       0.90      0.39      0.51       418



- stop words should have included "no" so n-grams will catch it?
- observation: not all spoilers are marked as such