In [1]:
import os

# Set PYWIKIBOT_DIR to the local ./wiki_reader/ directory before
# wiki_reader.reader is imported in a later cell.
# NOTE(review): presumably pywikibot reads its user-config.py from this
# directory — confirm against the pywikibot configuration docs.
os.environ['PYWIKIBOT_DIR'] = './wiki_reader/'

In [2]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql.functions import udf
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, StructType, ArrayType, LongType, DoubleType

from spark_app.scorers import score_text, vader_scorer
from spark_app.spark_tools import SparkSentimentStreamer
from pathlib import Path

from ml.OutputProcessing import process_output_files

## Input

In [3]:
# Search term (or category name when is_category is True) to query Wikipedia for.
request = 'peano'

is_category = False # Set to True to switch the search mode from "pages containing the word" to "all pages in the category"
batch_size = 50 # Number of pages in one batch
limit = 1000 # Maximum number of pages to process; set to None for no limit
debug_info = False # Forwarded as debug_info to both the wiki reader and the Spark streamer; presumably enables extra diagnostic output — TODO confirm

# Directories used by the pipeline; the request string is appended directly to
# each of them, so keep the trailing slash.
wiki_page_dir = 'requests/'
spark_results_dir = 'responses/'

## Processing

In [4]:
def cleanup(path):
    """Ensure ``path`` exists as a directory and remove everything inside it.

    The directory itself (and any missing parents) is created when absent, so
    after the call ``path`` is always an existing, empty directory.

    Only real sub-directories are descended into. Symbolic links — including
    links that point at directories or at nothing — and any other
    non-directory entries are unlinked in place, so data that merely happens
    to be linked from inside ``path`` is never deleted.

    Args:
        path: Directory to empty, as a ``str`` or ``pathlib.Path``.
    """
    path = Path(path)
    path.mkdir(parents=True, exist_ok=True)
    for entry in path.iterdir():
        # is_dir() follows symlinks, so the is_symlink() guard is what keeps
        # the recursion from escaping the tree through a linked directory.
        if entry.is_dir() and not entry.is_symlink():
            cleanup(entry)
            entry.rmdir()
        else:
            entry.unlink()

In [5]:
def spark_process(request, score_func, size):
    """Run a SparkSentimentStreamer over the pages stored for ``request``.

    The input location is ``wiki_page_dir + request`` and the output location
    is ``spark_results_dir + request``; the output location is emptied (and
    created if needed) via ``cleanup`` before Spark is started.

    Args:
        request: Request string appended to the input/output base directories.
        score_func: Scoring callable handed to the streamer.
        size: Value handed to the streamer as its size argument
            (the notebook passes the query size here).
    """
    input_dir = wiki_page_dir + request
    output_dir = spark_results_dir + request
    cleanup(output_dir)

    # Local-mode session using all available cores; reused if one already exists.
    session = (
        SparkSession.builder
        .master("local[*]")
        .appName("NetworkWordCount")
        .getOrCreate()
    )
    context = session.sparkContext
    # Second argument is the streaming batch duration.
    streaming_context = StreamingContext(context, 1)

    SparkSentimentStreamer(
        context,
        streaming_context,
        session,
        score_func,
        input_dir,
        output_dir,
        size,
        debug_info=debug_info,
    ).run()

In [6]:
from concurrent.futures import ThreadPoolExecutor
import wiki_reader.reader as reader

query_size = reader.query_size(request, limit)
# Preload page content only for queries of up to ~1500 pages, where it speeds up processing.
preload_content = query_size <= 1500


def wiki_wrapper(r, b, l, cat, cont):
    """Call ``reader.query`` with the notebook-level output dir and debug flag.

    Positional adapter so the call can be handed to an executor's ``submit``.

    Args:
        r: Request string to query.
        b: Batch size (``batch_size``).
        l: Page limit (``limit``).
        cat: Whether the request is a category (``is_category``).
        cont: Whether to preload page content (``preload_content``).

    Returns:
        Whatever ``reader.query`` returns.
    """
    return reader.query(r, out_dir=wiki_page_dir, batch_size=b, debug_info=debug_info,
                        limit=l, is_category=cat, preload_content=cont)

In [7]:
# Run the three pipeline stages concurrently: the wiki reader, the Spark
# scoring job, and the output post-processing.
with ThreadPoolExecutor(max_workers=4) as executor:
    pipeline_futures = [
        executor.submit(wiki_wrapper, request, batch_size, limit, is_category, preload_content),
        executor.submit(spark_process, request, vader_scorer, query_size),
        executor.submit(process_output_files, spark_results_dir, query_size),
    ]

# submit() stores any exception on the returned Future instead of raising it,
# so a failed stage would otherwise go unnoticed. All stages have finished by
# the time the executor context exits; result() re-raises the first failure.
for future in pipeline_futures:
    future.result()

IntProgress(value=491, description='Iter 491/491', max=491)

'Mean: 0.692'

Unnamed: 0,title,url,sentiment
0,First-order logic,https://en.wikipedia.org/wiki/First-order_logic,1.0000
1,List of people from Central Italy,https://en.wikipedia.org/wiki/List_of_people_f...,1.0000
2,List of mathematical shapes,https://en.wikipedia.org/wiki/List_of_mathemat...,1.0000
3,Meanings of minor planet names: 9001–10000,https://en.wikipedia.org/wiki/Meanings_of_mino...,1.0000
4,Culture of Italy,https://en.wikipedia.org/wiki/Culture_of_Italy,1.0000
5,Romance languages,https://en.wikipedia.org/wiki/Romance_languages,1.0000
6,List of people from Italy,https://en.wikipedia.org/wiki/List_of_people_f...,1.0000
7,List of Occitans,https://en.wikipedia.org/wiki/List_of_Occitans,1.0000
8,Charles Sanders Peirce,https://en.wikipedia.org/wiki/Charles_Sanders_...,0.9999
9,Implementation of mathematics in set theory,https://en.wikipedia.org/wiki/Implementation_o...,0.9999
