In [34]:
import pandas as pd
import os
import json
from nltk import tokenize

import numpy as np

import sparknlp
from pyspark.sql import SQLContext

import sys
# dirty hack
sys.path.insert(0, '../ml/')
from LogisticRegressionCached import readFromCache




In [4]:
# Start a Spark session through Spark NLP; `spark` is what the scoring cell uses
# to build DataFrames.
spark = sparknlp.start()
# SparkContext underlying that session.
sc = spark.sparkContext
# SQLContext bound to the same SparkContext.
# NOTE(review): `sqlCtx` is not referenced in the visible cells — confirm it is still needed.
sqlCtx = SQLContext(sc)

In [27]:
def scorer_wrapper(f):
    """Lift a per-sentence scorer to a whole-text scorer.

    The returned function splits its input into sentences with
    ``tokenize.sent_tokenize``, applies ``f`` to every sentence and returns
    the arithmetic mean of the per-sentence scores.

    Parameters
    ----------
    f : callable
        Takes one sentence (``str``) and returns a numeric score.

    Returns
    -------
    callable
        ``score_by_sentence(text)``, carrying the name and docstring of ``f``.
        Text that yields no sentences (e.g. an empty string) scores ``nan``.
    """
    # Local import: functools is not among the notebook's top-level imports.
    from functools import wraps

    @wraps(f)  # keep f's __name__ / __doc__ on the decorated function
    def score_by_sentence(text):
        sentences = tokenize.sent_tokenize(text)
        if not sentences:
            # np.mean([]) would also produce nan, but only via a RuntimeWarning;
            # return the "no score" value explicitly instead.
            return np.nan
        return np.mean([f(s) for s in sentences])
    return score_by_sentence

In [28]:
# Load the cached logistic-regression model and its preprocessing pipeline.
lrModel, pipelineModel = readFromCache('../ml/train/')


@scorer_wrapper
def score_text(text):
    """Score one piece of text as ``2 * p - 1``.

    ``p`` is element 1 of the model's ``probability`` output for ``text``
    (presumably the positive-class probability — verify against the model's
    label order). Because of ``@scorer_wrapper`` the public ``score_text``
    receives a whole document, and this body is applied to each sentence.
    """
    # Single-row Spark DataFrame; the 'target' value 2 is a placeholder
    # (presumably the pipeline requires the column — TODO confirm).
    single_row = spark.createDataFrame([(text, 2)], ['text', 'target'])
    features = pipelineModel.transform(single_row)  # To fix
    scored = lrModel.transform(features).select(['text', 'probability', 'prediction'])
    first_row = scored.toPandas().iloc[0]
    p_positive = first_row['probability'][1]
    # Rescale a probability in [0, 1] to a score in [-1, 1].
    return 2 * p_positive - 1

models loaded


## 1. Vocabulary-based sentiment scoring is evaluated for the query "Amazon Company"
## Positive and negative paragraphs are defined for this query

In [29]:
# Query whose name is substituted into both opinionated paragraphs.
query = "Amazon Company"

# Strongly negative paragraph about the query. The space before the first
# line break and the final newline are part of the text, so they are written
# out explicitly.
negative_paragraph = (
    "{0} is very bad. And author doesn't provide any justification. People don't like {0}. \n"
    "Some even hate {0}, because {0} is evil. Some groups believe {0} is their main enemy.\n"
).format(query)

# Strongly positive counterpart (no trailing newline).
positive_paragraph = (
    "{0} is very good. And author doesn't provide any justification. People like {0}. \n"
    "Some even love {0}, because {0} is honest. Some groups believe {0} is their best friend."
).format(query)

negative_paragraph

"Amazon Company is very bad. And author doesn't provide any justification. People don't like Amazon Company. \nSome even hate Amazon Company, because Amazon Company is evil. Some groups believe Amazon Company is their main enemy.\n"

## 2. For the sake of simplicity, the top 5 relevant articles were retrieved.
## These articles are read from disk.

In [30]:
# Directory whose entries are loaded as JSON articles by the loading loop.
folder = "./amazon_company/"
# os.listdir returns entries in arbitrary, platform-dependent order; sort them
# so the row order of the article table is the same on every run.
files = sorted(os.listdir(folder))

In [31]:
# Empty article table with one row per entry in `files`; the rows are filled
# in by the loading loop.
df = pd.DataFrame(
    columns=['title', 'link', 'text'],
    index=list(range(len(files))),
)

In [32]:
# Fill row i of `df` with the title, link and text stored in the i-th file.
for i, filename in enumerate(files):
    # Read as UTF-8 explicitly: the platform default encoding may differ and
    # the article text contains non-ASCII characters.
    with open(os.path.join(folder, filename), 'r', encoding='utf-8') as f:
        data = json.load(f)
    df.loc[i] = data['title'], data['link'], data['text']

## 3. Then these articles are scored and a ranking is built

In [35]:
# Score every article text and order the table from highest to lowest score.
# `assign` works on a copy, so `df` itself is left untouched.
ranking = (
    df.assign(score=df['text'].apply(score_text))
      .sort_values(by='score', ascending=False)
)
ranking

Unnamed: 0,title,link,text,score
1,Amazon (company),https://en.wikipedia.org/wiki/Amazon_(company),"Amazon (company)\nAmazon.com, Inc.[6] (/ˈæməzɒ...",0.428397
2,Prime Video,https://en.wikipedia.org/wiki/Prime_Video,"Prime Video, also marketed as Amazon Prime Vid...",0.355471
0,History of Amazon,https://en.wikipedia.org/wiki/History_of_Amazon,Founding\nThe company was founded as a result ...,0.321414
3,Amazon Web Services,https://en.wikipedia.org/wiki/Amazon_Web_Services,Amazon Web Services (AWS) is a subsidiary of A...,0.193755
4,Amazon Prime,https://en.wikipedia.org/wiki/Amazon_Prime,Amazon Prime is a paid subscription service of...,0.177118


## 4. The article with the median score is chosen as the reference

In [36]:
# Middle position of the score-sorted ranking (the upper middle row when the
# number of files is even).
median_position = len(files) // 2
reference = ranking.iloc[median_position]
reference

title                                    History of Amazon
link       https://en.wikipedia.org/wiki/History_of_Amazon
text     Founding\nThe company was founded as a result ...
score                                             0.321414
Name: 0, dtype: object

## 5. Two copies of the reference article are added, with the positive/negative paragraph appended to the article text

In [37]:
# Independent (deep) copy of the article table that will receive the edited articles.
df_extended = df.copy(deep=True)
df_extended

Unnamed: 0,title,link,text
0,History of Amazon,https://en.wikipedia.org/wiki/History_of_Amazon,Founding\nThe company was founded as a result ...
1,Amazon (company),https://en.wikipedia.org/wiki/Amazon_(company),"Amazon (company)\nAmazon.com, Inc.[6] (/ˈæməzɒ..."
2,Prime Video,https://en.wikipedia.org/wiki/Prime_Video,"Prime Video, also marketed as Amazon Prime Vid..."
3,Amazon Web Services,https://en.wikipedia.org/wiki/Amazon_Web_Services,Amazon Web Services (AWS) is a subsidiary of A...
4,Amazon Prime,https://en.wikipedia.org/wiki/Amazon_Prime,Amazon Prime is a paid subscription service of...


In [38]:
# Two synthetic articles: the reference text with the negative / positive
# paragraph appended directly after it.
edited_articles = pd.DataFrame(data={
    'title': ['neg_edit', 'pos_edit'],
    'link': ['', ''],
    'text': [reference['text'] + negative_paragraph,
             reference['text'] + positive_paragraph],
})
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent. The table is rebuilt from the original `df`
# (not from `df_extended` itself) so re-running this cell cannot append the
# edits twice. Index labels are kept as-is: the edits carry labels 0 and 1.
df_extended = pd.concat([df, edited_articles])
df_extended

Unnamed: 0,title,link,text
0,History of Amazon,https://en.wikipedia.org/wiki/History_of_Amazon,Founding\nThe company was founded as a result ...
1,Amazon (company),https://en.wikipedia.org/wiki/Amazon_(company),"Amazon (company)\nAmazon.com, Inc.[6] (/ˈæməzɒ..."
2,Prime Video,https://en.wikipedia.org/wiki/Prime_Video,"Prime Video, also marketed as Amazon Prime Vid..."
3,Amazon Web Services,https://en.wikipedia.org/wiki/Amazon_Web_Services,Amazon Web Services (AWS) is a subsidiary of A...
4,Amazon Prime,https://en.wikipedia.org/wiki/Amazon_Prime,Amazon Prime is a paid subscription service of...
0,neg_edit,,Founding\nThe company was founded as a result ...
1,pos_edit,,Founding\nThe company was founded as a result ...


## 6. A new ranking is built

In [39]:
# Re-score the extended table (original articles plus the two edits) and
# order it from highest to lowest score. `assign` works on a copy, so
# `df_extended` itself is left untouched.
ranking = (
    df_extended.assign(score=df_extended['text'].apply(score_text))
               .sort_values(by='score', ascending=False)
)
ranking

Unnamed: 0,title,link,text,score
1,Amazon (company),https://en.wikipedia.org/wiki/Amazon_(company),"Amazon (company)\nAmazon.com, Inc.[6] (/ˈæməzɒ...",0.428397
2,Prime Video,https://en.wikipedia.org/wiki/Prime_Video,"Prime Video, also marketed as Amazon Prime Vid...",0.355471
1,pos_edit,,Founding\nThe company was founded as a result ...,0.330284
0,History of Amazon,https://en.wikipedia.org/wiki/History_of_Amazon,Founding\nThe company was founded as a result ...,0.321414
0,neg_edit,,Founding\nThe company was founded as a result ...,0.300001
3,Amazon Web Services,https://en.wikipedia.org/wiki/Amazon_Web_Services,Amazon Web Services (AWS) is a subsidiary of A...,0.193755
4,Amazon Prime,https://en.wikipedia.org/wiki/Amazon_Prime,Amazon Prime is a paid subscription service of...,0.177118


## 7. We observe that the edited articles are ranked as expected: the negative edit ranks lower and the positive edit ranks higher than the original reference article.