In [2]:
import pandas as pd

In [33]:
# Setup

from google.cloud import bigquery
from google.oauth2 import service_account

%load_ext google.cloud.bigquery
key_path = "../reddit-sentiment-analysis-0e7efc226db6.json"

credentials = service_account.Credentials.from_service_account_file(
    key_path,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

bqclient = bigquery.Client(
    credentials=credentials,
    project=credentials.project_id,
)

import pandas as pd
import re
from unidecode import unidecode
from html.parser import HTMLParser
import sparknlp

spark = sparknlp.start()
from sparknlp.pretrained import PretrainedPipeline 

import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def clean_markdown(sentence):
    text = strip_tags(sentence)
    text = re.sub(r'\[.*\]\(https?:\/\/.*\)|https?:\/\/.*', '', text, flags=re.MULTILINE)
    text = unidecode(str(text))
    text = re.sub(r'\.+', ".", text)
    return text

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


In [34]:
"""

NOTE:

1. Assuming that the post is really discussing the entity if the entity name is mentioned 
2. Ignoring the post information with this query entirely - using it only to fetch comments that mention the entity 
   even once in the post text - this can be checked with spacy as Tim had pointed out
3. Grabbing only the comments which are direct replies or the ones which mention the entity name

ISSUES:

1. No way to detect if a comment without explicit mention of entity is highly relevant. For example, 
   if discusses the CEO
2. 

"""
def query_on_reddit_bigquery(company):
    query_comments = (
            "SELECT body, parent_id, link_id, ups, downs, score FROM `homework2-255022.redditbigdata.comments` "
            "WHERE link_id in ("
                "SELECT CONCAT('t3_', id) FROM `homework2-255022.redditbigdata.posts` "
                "WHERE num_comments > 2 AND REGEXP_CONTAINS(title, r'(?i)" + company + "')"
            ") AND (parent_id=link_id OR REGEXP_CONTAINS(body, r'(?i)" + company + "'))"
        )
    job_comments = bqclient.query(query_comments, location="US",)  # API request - starts the query
    df_comments = (job_comments.result().to_dataframe())
    return df_comments

In [52]:
tesla_comments_df = query_on_reddit_bigquery("tesla")

In [54]:
len(tesla_comments_df)

1757

In [48]:
all_comments_raw = tesla_comments_df.body.tolist()

In [58]:
tesla_comments_df[['link_id', 'body']].to_csv("comments.csv", index=False)

In [49]:
all_comments_raw

["but Trump's stupid adult children and son in law (who couldn't pass a  legitimate clearance) is fine LOL",
 'Of course, that 920 million is just what is coming due on March 1.  Their total debt is well over 10 times that much, almost 10 billion last I looked.  As far as comparing debt to market cap, market cap is a pretty volatile number.  Hell, Facebook lost over $100 billion in market cap in a day.  The whole problem for Tesla is their stock price is low enough where the lenders are going to want cold, hard, cash instead of their stock.',
 'But some would have us believe that we should not be in a trade war with China right now.....',
 'No, no Elon Musk did not open source their patents he offered to release their patents to anybody who would give up theirs which no other auto company did because that would be a terrible trade. Tesla wanted the manufacturing technology and processes because they have been making quite alot of mistakes in manufacturing that the rest of the auto indu

In [3]:
labelled_comments = pd.read_csv('/Users/candide/Downloads/labelled_comments.csv', encoding = "ISO-8859-1")
labelled_comments.dropna(axis=0, inplace=True)

In [10]:
len(labelled_comments[labelled_comments["positive?"] == 0.5])

9

In [5]:
for i in labelled_comments.body:
    print(i)
#     print(vader_score(i))
    print("============")

I wouldn't even buy shoes online without trying them on. 
A billionaire squeezing worker for production rather than worrying about their well-being?I am shocked.
They'd better take it seriously, these are starting to occur frequently enough to impact peoples' interest in their cars.
They're acting like an American car manufacturer
For a smart guy he is a huge dumbass.
I was offered a job with Tesla and declined. The whole vibe of the culture was creepy and cult like. They were bragging about how their top sales guy was still handling customers emails while his wife was in labor. I donÛªt even know what the salary was, I just knew it wouldnÛªt be enough.
This is the guy who tried to grandstand during the Thailand cave incident with his SpaceX submarine and when he got told it was not needed, called a rescue diver a pedo on Twitter. He may be a successful businessman and I personally like certain Tesla models but he's an asshole.
After reading this, quite frankly i think a job as a g

In [190]:
vader_score("Just look at the top comment, Musk is already guilty in the court of Reddit opinion.")

0.375

In [6]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# include this line to filter out deleted/removed comments all retrieved
filtered_comments = [sentence for sentence in all_comments if sentence and not sentence.startswith("[deleted]") \
                     and not sentence.startswith("[removed]")]

In [23]:
def clean_sentence(sentence):
    sentence = clean_markdown(sentence)
    return sentence

In [7]:
analyzer = SentimentIntensityAnalyzer()

In [35]:
# vader sentiment score without splitting sentence
def vader_score(comment, reduce=False, binary=False):
    comment = clean_sentence(comment)
    if reduce:
        sentences = tokenizer.tokenize(comment)
        reduced_comment = []
        for sentence in sentences:
            if 'Tesla'.lower() in sentence.lower():
                reduced_comment.append(sentence)
        reduced_comment = ' '.join(reduced_comment)
        score = analyzer.polarity_scores(reduced_comment)["compound"]
    else:
        score = analyzer.polarity_scores(comment)['compound']
        
    score = (score+1)/2
    if binary:
        if score > 0.05:
            return 1
        else:
            return 0    
    else:
        return score

In [25]:
# vader sentiment score with splitting the sentence
def vader_split_score(comment):
    comment = clean_sentence(comment)
    sentences = tokenizer.tokenize(comment)
    score = 0
    for sentence in sentences:
        score += analyzer.polarity_scores(sentence)["compound"]
    score = score/len(sentences)
    score = (score+1)/2
    return score
#     if score > 0.05:
#         return 1
#     else:
#         return 0

In [49]:
# try sparknlp
pipeline = PretrainedPipeline('analyze_sentiment', 'en')

def spark_nlp_score(comment):
    sentence = clean_sentence(comment)
    result = pipeline.annotate(sentence)
    scores = [1 if x == 'positive' else -1 for x in result['sentiment']]
    score = sum(scores)
    if score >= 0:
        return 1
    else:
        return 0

analyze_sentiment download started this may take some time.
Approx size to download 4.9 MB
[OK!]


In [48]:
from textblob import TextBlob
def textblob_score(comment, binary=False):
    sentence = TextBlob(comment)
    score = sentence.sentiment.polarity
    score = (score+1)/2
    
    if binary:
        if score > 0.5:
            return 1
        else:
            return 0
        
    else:
        return score

In [27]:
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

# Instantiates a client
client = language.LanguageServiceClient()
#     print('Text: {}'.format(text))
#     print('Sentiment: {}, {}'.format(sentiment.score, sentiment.magnitude))

def google_cloud_score(text, binary=False):

    document = types.Document(
        content=text,
        type=enums.Document.Type.PLAIN_TEXT)

    # Detects the sentiment of the text
    sentiment = client.analyze_sentiment(document=document).document_sentiment
    
    score = sentiment.score
    
    if binary:
        if score > 0.6:
            return 1
        elif score <= 0.6 and score >= 0.5:
            return 0.5
        else:
            return 0
        
    else:
        return score

In [29]:
# num_correct_sparknlp = 0
# for i, row in labelled_comments.iterrows():
#     num_correct_sparknlp += 1-abs(row['Positive?'] - spark_nlp_score(row['body']))             
# print("Accuracy with SparkNLP = ", num_correct_sparknlp/51)

"""
Validate three functions now with the updated dataset Tim created:

vader without reducing sentence to entity
vader with reducing sentence to entity
Google cloud API
SparkNLP
TextBlob
Imports the Google Cloud client library
"""

mae_vader = 0
mae_vader_reduce = 0
mae_google_api = 0
mae_text_blob = 0

for i, row in labelled_comments.iterrows():
    mae_vader += abs(row['positive?'] - vader_score(row['body']))
    mae_vader_reduce += abs(row['positive?'] - vader_score(row['body'], reduce=True))
    mae_google_api += abs(row['positive?'] - google_cloud_score(row['body']))
    mae_text_blob += abs(row['positive?'] - textblob_score(row['body']))
    
print("MAE Vader = ", mae_vader/51)
print("MAE Reduce + Vader = ", mae_vader_reduce/51)
print("MAE Google API = ", mae_google_api/51)
print("MAE text_blob = ", mae_text_blob/51)

MAE Vader =  0.4234264705882353
MAE Reduce + Vader =  0.3964470588235294
MAE Google API =  0.6588235303175216
MAE text_blob =  0.41433281014425966


In [50]:
num_correct = 0
for i, row in labelled_comments.iterrows():
    num_correct += 1-abs(row['positive?'] - spark_nlp_score(row['body']))
print("Accuracy with SparkNLP = ", num_correct/len(labelled_comments))

Accuracy with SparkNLP =  0.5392156862745098


In [174]:
analyzer.polarity_scores(sentence)

{'neg': 0.0, 'neu': 0.872, 'pos': 0.128, 'compound': 0.5511}

In [36]:
import spacy

In [178]:
fb_comments_df = query_on_reddit_bigquery("facebook")

In [41]:
def is_about_entity(sentence):
    nlp = spacy.load("./spacy_company/")
    doc = nlp(sentence)

    for ent in doc.ents:
        if ent.label_ == 'COMPANY':
            return True
    
    return False

In [42]:
is_about_entity("I came across this great article about green tea on facebook")

False