# Imports

In [None]:
# List the Dataproc clusters in region us-central1. The output is only
# displayed; no later cell reads it.
!gcloud dataproc clusters list --region us-central1

In [None]:
# Install the Python dependencies quietly (-q).
# google-cloud-storage is pinned to 1.43.0; graphframes is installed unpinned.
!pip install -q google-cloud-storage==1.43.0
!pip install -q graphframes

In [None]:
# Change the working directory to /home/dataproc (quietly), list
# inverted_index_gcp.py there (ls reports an error if the file is missing),
# and then import InvertedIndex from that module.
%cd -q /home/dataproc
!ls inverted_index_gcp.py
from inverted_index_gcp import InvertedIndex

In [None]:
# These will already be installed in the testing environment, so disregard the
# amount of time (~1 minute) it takes to install.
# Installs pyspark, PyDrive (upgraded with -U), a headless OpenJDK 8 and
# graphframes.
# NOTE(review): graphframes is also installed by an earlier cell, so the last
# line looks redundant — confirm before removing.
!pip install -q pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
!pip install -q graphframes



import pyspark
import sys
from collections import Counter, OrderedDict, defaultdict
import itertools
from itertools import islice, count, groupby
import pandas as pd
import os
import re
from operator import itemgetter
import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords
from time import time
from pathlib import Path
import pickle
import numpy as np
import pandas as pd
import math
from functools import reduce
from google.cloud import storage
from inverted_index_gcp import *


import hashlib
def _hash(s):
    """Return a short hexadecimal BLAKE2b digest of the string ``s``.

    The string is encoded as UTF-8 and hashed with a 5-byte digest, so the
    returned value is always 10 hexadecimal characters long.
    """
    payload = bytes(s, encoding='utf8')
    hasher = hashlib.blake2b(digest_size=5)
    hasher.update(payload)
    return hasher.hexdigest()

# ''' code addition'''
# from flask import Flask, request, jsonify
# ''' code addition'''

# Download the NLTK 'stopwords' corpus, which stopwords.words('english')
# reads in the stopwords cell.
nltk.download('stopwords')

In [None]:
# Stopwords
# Extra words treated as stopwords on top of NLTK's English list.
corpus_stopwords = [
    "category", "references", "also", "external", "links", "may", "first",
    "see", "history", "people", "one", "two", "part", "thumb", "including",
    "second", "following", "many", "however", "would", "became",
]
english_stopwords = frozenset(stopwords.words('english'))
all_stopwords = english_stopwords | frozenset(corpus_stopwords)

# Token pattern: one leading '#', '@' or word character, followed by 2 to 24
# word characters, each optionally preceded by an apostrophe or a hyphen.
RE_WORD = re.compile(r"""[\#\@\w](['\-]?\w){2,24}""", re.UNICODE)

In [None]:
# Collect the gs:// URI of every parquet object stored in the bucket.
# The storage client is created explicitly here: its definition used to be
# commented out, which left `client` undefined when running on a fresh kernel.
client = storage.Client()
bucket_name = "sean_bucket_anchor"
full_path = f"gs://{bucket_name}/"

blobs = client.list_blobs(bucket_name)

# Keep only the objects whose name ends with "parquet", as full GCS URIs.
paths = [full_path + b.name for b in blobs if b.name.endswith("parquet")]

# Page rank upload

In [None]:
def generate_graph(pages):
    """Build the edge and vertex RDDs of the link graph.

    Args:
        pages: RDD of rows whose first element is the source id and whose
            second element is an iterable of items; the first element of
            each item is used as the destination id.

    Returns:
        A pair ``(edges, vertices)``: ``edges`` holds distinct
        ``(source, destination)`` tuples, ``vertices`` holds one 1-tuple
        ``(id,)`` for every distinct id that appears in an edge.
    """
    def outgoing_edges(row):
        # One (source, destination) pair per item listed for this row.
        source_id = row[0]
        return [(source_id, link[0]) for link in row[1]]

    edges = pages.flatMap(outgoing_edges).distinct()
    node_ids = edges.flatMap(lambda pair: pair).distinct()
    vertices = node_ids.map(lambda node_id: (node_id,))
    return edges, vertices

In [None]:
# Imported in this cell because the notebook's import cell brings in neither
# name, so GraphFrame and col would otherwise be undefined on a fresh kernel.
from graphframes import GraphFrame
from pyspark.sql.functions import col

# Tunables for the PageRank job.
NUM_PARTITIONS = 124       # partitions used for both the edge and vertex frames
RESET_PROBABILITY = 0.15   # passed to pageRank as resetProbability
MAX_ITERATIONS = 6         # passed to pageRank as maxIter

# NOTE(review): `spark` is not created anywhere in this notebook — presumably
# the kernel provides the SparkSession; confirm.
parquetFile = spark.read.parquet(*paths)
# pages_links = spark.read.parquet("gs://wikidata20210801_preprocessed/*").select("id", "anchor_text").rdd
pages_links = parquetFile.select("id", "anchor_text").rdd
# construct the graph
edges, vertices = generate_graph(pages_links)
# compute PageRank
edgesDF = edges.toDF(['src', 'dst']).repartition(NUM_PARTITIONS, 'src')
verticesDF = vertices.toDF(['id']).repartition(NUM_PARTITIONS, 'id')
g = GraphFrame(verticesDF, edgesDF)
pr_results = g.pageRank(resetProbability=RESET_PROBABILITY, maxIter=MAX_ITERATIONS)
# Keep only the id and its score, highest PageRank first.
pr = pr_results.vertices.select("id", "pagerank")
pr = pr.sort(col('pagerank').desc())

In [None]:
# Store the page rank in bucket
pandas_df = pr.toPandas()
storage_client = storage.Client()
bucket_name = 'sean_bucket_title'
file_name = 'page_rank.pkl'
bucket = storage_client.get_bucket(bucket_name)
blob = bucket.blob(file_name)

# Save the Pandas DataFrame to a local pickle file
with open(file_name, 'wb') as f:
    pickle.dump(pandas_df, f)

# Upload the pickle file to GCS; the context manager closes the file handle
# once the upload finishes instead of leaving it open.
with open(file_name, 'rb') as f:
    blob.upload_from_file(f)

In [None]:
# load the pickle file from the bucket
# NOTE(review): this reads 'pageranks_dict.pkl', while the previous cell
# uploads 'page_rank.pkl' — confirm which object is intended.
pr_blob_name = 'pageranks_dict.pkl'
pr_blob = bucket.get_blob(pr_blob_name)
if pr_blob is None:
    # get_blob returns None for a missing object; fail with a clear message
    # instead of an AttributeError on None.
    raise FileNotFoundError(f"{pr_blob_name} not found in bucket {bucket.name}")
# Unpickling can execute arbitrary code: only load objects this project wrote.
# download_as_bytes replaces the deprecated download_as_string alias.
pr_dct = pickle.loads(pr_blob.download_as_bytes())

# Page view upload

In [None]:
# Source of the page-view dump and the file names derived from it:
#   pv_name  - last path component of the URL (the .bz2 file name)
#   pv_temp  - '<stem>-4dedup.txt', the filtered text file written below
#   pv_clean - '<stem>.pkl', the pickle file name used by the code that follows
pv_path = 'https://dumps.wikimedia.org/other/pageview_complete/monthly/2021/2021-08/pageviews-202108-user.bz2'
p = Path(pv_path) 
pv_name = p.name
pv_temp = f'{p.stem}-4dedup.txt'
pv_clean = f'{p.stem}.pkl'
# Download the file (2.3GB). -N turns on wget's timestamping mode.
!wget -N $pv_path
# Filter for English pages, and keep just two fields: article ID (3) and monthly 
# total number of page views (5). Then, remove lines with article id or page 
# view values that are not a sequence of digits.
!bzcat $pv_name | grep "^en\.wikipedia" | cut -d' ' -f3,5 | grep -P "^\d+\s\d+$" > $pv_temp
# Sum the page views per article id into a Counter, giving a mapping from
# article id to its total number of page views.
wid2pv = Counter()
with open(pv_temp, 'rt') as pv_file:
    for row in pv_file:
        fields = row.split(' ')
        wid2pv[int(fields[0])] += int(fields[1])
# Persist the counter to disk as a pickle file.
with open(pv_clean, 'wb') as out_file:
    pickle.dump(wid2pv, out_file)
# Load the counter back from the pickle file.
with open(pv_clean, 'rb') as in_file:
    wid2pv = pickle.load(in_file)

In [None]:
# Download the page-view pickle ('page_view.pkl') from the title bucket.
# NOTE(review): no cell in this notebook uploads an object under this name —
# confirm where 'page_view.pkl' is produced.
storage_client = storage.Client()
bucket_name = 'sean_bucket_title'
bucket = storage_client.get_bucket(bucket_name)
pv_blob = bucket.get_blob('page_view.pkl')
if pv_blob is None:
    # get_blob returns None for a missing object; fail with a clear message
    # instead of an AttributeError on None.
    raise FileNotFoundError(f"page_view.pkl not found in bucket {bucket_name}")
# Unpickling can execute arbitrary code: only load objects this project wrote.
# download_as_bytes replaces the deprecated download_as_string alias.
pv = pickle.loads(pv_blob.download_as_bytes())

## Max pv = 181126232 
## Max pr = 9913.728782160779