In [2]:
import os

# Export PYWIKIBOT_DIR for this process.
# NOTE(review): presumably this has to happen before wiki_reader (and through
# it pywikibot) is imported further down — confirm.
os.environ.update({'PYWIKIBOT_DIR': './wiki_reader/'})

In [None]:
import sparknlp
from pyspark.sql import SQLContext

# Notebook-level Spark handles: the session returned by Spark NLP, its
# SparkContext, and an SQLContext built on that same context.
spark = sparknlp.start()
sc, sqlCtx = spark.sparkContext, SQLContext(spark.sparkContext)

In [19]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql.functions import udf
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, StructType, ArrayType, LongType, DoubleType

from spark_app.scorers import score_text
from spark_app.spark_tools import SparkSentimentStreamer
from pathlib import Path

## Input

In [21]:
# Wiki request to process in this run.
request = 'peano'

# Reader options; the launch cell hands them to reader.query through the
# keyword arguments of the same name.
batch_size, limit = 100, None
preload_content, is_category = True, False

## Processing

In [None]:
from ml.LogisticRegressionCached import readFromCache

# Restore the (model, pipeline) pair that readFromCache returns for ./ml/train/.
lrModel, pipelineModel = readFromCache('./ml/train/')

def score_text_ml(text):
    """Score one text with the cached pipeline and model.

    A one-row DataFrame is built from ``text``, transformed by the
    notebook-level ``pipelineModel`` and ``lrModel``, and the entry at index 1
    of the resulting ``probability`` vector is rescaled from [0, 1] to [-1, 1].

    Relies on the notebook globals ``spark``, ``pipelineModel`` and ``lrModel``.

    Args:
        text: The text to score.

    Returns:
        float: ``2 * p - 1`` where ``p`` is ``probability[1]``.
        NOTE(review): index 1 is treated as the positive class — confirm
        against the label encoding used at training time.
    """
    # The frame carries a `target` column next to the text; 2 is a placeholder.
    # NOTE(review): presumably the fitted pipeline requires that column — confirm.
    df = spark.createDataFrame([(text, 2)], ['text', 'target'])
    df_transformed = pipelineModel.transform(df)  # TODO: flagged "To fix" by the original author
    predictions = lrModel.transform(df_transformed)

    # Exactly one row goes in and only one column is read, so fetch that single
    # value directly instead of converting the whole frame to pandas.
    first_row = predictions.select('probability').first()
    positive_probability = first_row['probability'][1]

    # float() turns the numpy scalar into a builtin float (same value).
    return float(2 * positive_probability - 1)

In [5]:
def spark_process(request, score_func):
    """Run the sentiment streamer for one request.

    Input is read from ``requests/<request>`` and results are written to
    ``responses/<request>``; the output directory is created if it is missing.

    Args:
        request: Request name; used as the sub-directory under ``requests/``
            and ``responses/``.
        score_func: Scoring callable handed to ``SparkSentimentStreamer``.

    Raises:
        Whatever ``SparkSentimentStreamer.run`` raises; ``stop`` is still
        called in that case.
    """
    # Local import keeps this cell self-contained; pyspark is already a
    # dependency of the notebook.
    from pyspark import SparkConf

    # getOrCreate reuses a SparkContext that is already running (for instance
    # the one created by sparknlp.start() earlier in the notebook). Calling the
    # SparkContext constructor a second time raises ValueError.
    conf = SparkConf().setMaster("local[*]").setAppName("NetworkWordCount")
    sc = SparkContext.getOrCreate(conf)
    ssc = StreamingContext(sc, 1)  # batch interval of 1 second
    spark = SparkSession \
        .builder \
        .appName("SentimentWikiProcessor") \
        .getOrCreate()

    dataInp = "requests/" + request
    dataOut = "responses/" + request
    # exist_ok=True lets the same request be processed again without
    # FileExistsError aborting the run before the streamer starts.
    Path(dataOut).mkdir(parents=True, exist_ok=True)

    streamer = SparkSentimentStreamer(sc, ssc, spark, score_func, dataInp, dataOut)
    try:
        streamer.run()
    finally:
        # Always release the streamer, even when run() fails.
        streamer.stop()

In [4]:
from concurrent.futures import ThreadPoolExecutor
import wiki_reader.reader as reader

def wrapper(r, b, l, cat, cont):
    """Positional adapter for ``reader.query`` so it can be passed to ``submit``.

    Args:
        r: Request string (first positional argument of ``reader.query``).
        b: Forwarded as ``batch_size``.
        l: Forwarded as ``limit``.
        cat: Forwarded as ``is_category``.
        cont: Forwarded as ``preload_content``.

    Returns:
        Whatever ``reader.query`` returns.
    """
    return reader.query(r, batch_size=b, limit=l, is_category=cat, preload_content=cont)


# Run the wiki reader and the Spark job concurrently. Both receive the
# `request` configured in the Input section instead of a repeated literal.
with ThreadPoolExecutor(max_workers=2) as e:
    futures = [
        e.submit(wrapper, request, batch_size, limit, is_category, preload_content),
        e.submit(spark_process, request, score_text),
    ]

# submit() stores a worker's exception on its future; result() re-raises it
# here so a failed reader or Spark job is not silently ignored.
for future in futures:
    future.result()

Cleaning old data
Dumped 100 pages | peano
Dumped 200 pages | peano
Dumped 300 pages | peano
Dumped 400 pages | peano
Dumped 491 pages | peano
