# Setup spark

In [1]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] ="--conf spark.driver.memory=2g  pyspark-shell"

from pyspark.sql import SparkSession
from pyspark.sql.functions import flatten
from pyspark.sql.types import (StructType, StructField, StringType, 
                                FloatType, DateType, IntegerType, ArrayType)
spark = SparkSession \
        .builder \
        .master("local[*]") \
        .appName("BDA assignment") \
        .getOrCreate()

# Imports

In [2]:
from typing import NamedTuple, Final, List
#from lxml import etree
import xml.etree.ElementTree as ET
from itertools import islice, chain, combinations
import argparse
import traceback
import bleach
import html
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import pr
import string
import random
import hashlib
import pandas as pd
import numpy as np
import hashlib
import json
import sys

# Constants

In [3]:
SHINGLE_SIZE: Final = 5
SAMPLES: Final = 1000

In [4]:
class comment_tuple(NamedTuple):
    id: int
    #owner_id: int
    post_type: int
    score: int
    text: str

class shingle_set(NamedTuple):
    id: int
    shingles: frozenset[tuple]

class similarity(NamedTuple):
    id_set1: int
    id_set2: int
    similarity: float

# Read and clean XML

In [5]:
def set_schema():
    """
    Define the schema for the DataFrame
    """
    schema_list = []
    schema_list.append(StructField("Id", IntegerType(), True))
    #schema_list.append(StructField("PostTypeId", IntegerType(), True))
    #schema_list.append(StructField("Score", IntegerType(), True))
    schema_list.append(StructField("Body", StringType(), True))
    
    return StructType(schema_list)

def parse_post(rdd):
    results = []
    root = ET.fromstring(rdd[0])

    for elem in root.findall('row'):
        rec = []
        #print("Found row")
        assert elem.text is None, "The row wasn't empty"
        rec.append(int(elem.attrib["Id"]))
        #int(elem.attrib["OwnerUserId"]),
        #rec.append(int(elem.attrib["PostTypeId"])),
        #rec.append(int(elem.attrib["Score"])),
        rec.append(bleach.clean(elem.attrib["Body"], strip=True))
        #rec.append(elem.attrib["Body"])

        #elem.clear()
        #while elem.getprevious() is not None:
        #    del elem.getparent()[0]
        results.append(rec)

    return results

In [6]:
filename = "cstheory_posts.bakxml"
chunksize = 1024

file_rdd = spark.read.text(filename, wholetext=True).rdd
dataset = file_rdd.flatMap(parse_post)

In [7]:
dataset.persist()
schema = set_schema()
df_ds = dataset.toDF(schema)
df_ds.show()

#df_posts = records_rdd.toDF(schema)
#coll = records_rdd.collect()

+---+--------------------+
| Id|                Body|
+---+--------------------+
|  2|I have a dataset ...|
|  3|A particular prog...|
|  4|What is the follo...|
|  5|Can the divide an...|
|  6|Is anyone aware o...|
|  7|In general, the q...|
|  8|If I understand t...|
|  9|<a href="http://w...|
| 10|There was recentl...|
| 11|Functional progra...|
| 12|I took a class on...|
| 13|It's possible tha...|
| 14|Other than going ...|
| 15|[This question ha...|
| 16|See the <a href="...|
| 17|it is often said ...|
| 18|In one word: No.
...|
| 19|In short, I would...|
| 20|It's easy to prov...|
| 21|Any &quot;cutting...|
+---+--------------------+
only showing top 20 rows



# Shingling

In [8]:
class shingler:
    """
    Class that contain a tokenizer and stopwords to make shingling easier.
    """
    def __init__(self):
        self.stopwords = set(stopwords.words('english'))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        return True

    def tokenize(self, text: str) -> List[str]:
        text = text.translate(str.maketrans('', '', string.punctuation))
        # text_nop = text.split()
        text_nop = word_tokenize(text)
        filtered_words = []

        for word in text_nop:
            if word not in self.stopwords:
                filtered_words.append(word.lower())
        
        return filtered_words

    #def create_shingle(self, input_comment: comment_tuple, shingle_size: int) -> frozenset[tuple]:
    #    tokens = self.tokenize(input_comment.text)
    #    comment_length = len(tokens)
    #    shingles =  frozenset(tuple(tokens[i:(i + shingle_size)]) for i in range(comment_length - shingle_size + 1))
    
    def create_shingle(self, post: str, shingle_size: int) -> list[list]:
        tokens = self.tokenize(post)
        comment_length = len(tokens)
        shingle_set = frozenset(tuple(tokens[i:(i + shingle_size)]) for i in range(comment_length - shingle_size + 1))
        shingle_list = list(shingle_set)
        #for elem in shingle_list:
        #    elem.sort()
        shingle_list.sort()
        return list(shingle_set)
        

def shingle_map(row):
    ds_shingler = shingler()
    return (row[0], ds_shingler.create_shingle(row[1], SHINGLE_SIZE)
)

def set_shingle_schema():
    """
    Define the schema for the DataFrame
    """
    schema_list = []
    schema_list.append(StructField("Id", IntegerType(), True))
    #schema_list.append(StructField("PostTypeId", IntegerType(), True))
    #schema_list.append(StructField("Score", IntegerType(), True))
    schema_list.append(StructField("Shingles", ArrayType(ArrayType(StringType()), True)))
    return StructType(schema_list)

In [10]:
schema = set_shingle_schema()
shingle_rdd = dataset.map(shingle_map)
#df_shingle = shingle_rdd.toDF(schema)
#df_shingle.show()

# MinHashing

In [26]:
# Transform posts to characteristic matrix
# Make feature set matrix
# Minhash
# Make Minhash Matrix
# LSH
import random
SIGNATURE_SIZE: Final = 50
HASH_PRIME: Final = (1 << 31) - 1
MAX_HASH: Final = (1 << 32)
    #sys.maxsize
HASH_RANGE: Final = (1<< 32)
    #sys.maxsize
SEED: Final = 193120

generator = np.random.RandomState(SEED)
salts = [generator.randint(1, MAX_HASH) for _ in range(SIGNATURE_SIZE)]
#permutations = [generator.randint(1, HASH_PRIME) for _ in range(SIGNATURE_SIZE)]

permutations = [[generator.randint(1, MAX_HASH) for _ in range(SIGNATURE_SIZE)],
                [generator.randint(0, MAX_HASH) for _ in range(SIGNATURE_SIZE)]]

print(salts)
print(permutations)

def hash_func(data, p1, p2):
    
    return int.from_bytes(hashlib.md5(int.to_bytes(salt, 8, byteorder="big") + json.dumps(data).encode()).digest()[:8], byteorder="big")
    
def min_hasher(row):
    sig = np.full((SIGNATURE_SIZE), MAX_HASH)
    for shingle in row[1]:
        for i in range(SIGNATURE_SIZE):
            a = permutations[0][i]
            b = permutations[1][i]
            hash_val = (a * hash(shingle) + b) % HASH_PRIME
            #hash_func(tuple(shingle), salts[i]) % HASH_PRIME
            sig[i] = min(hash_val, sig[i])
    return (row[0], sig)

[2098660572, 74013022, 2675907171, 2481944613, 2289028529, 846582864, 1024003631, 3457496236, 958218556, 2872479854, 1385329197, 2720560315, 2596604670, 4118717270, 3831528778, 4184700433, 130382780, 4221132295, 2492231677, 1349853675, 2674011478, 4230155413, 522369175, 1349933754, 2597169981, 1045439184, 3199517916, 3020468163, 618450142, 3282454786, 4061764145, 3477766529, 3055070885, 204747729, 834378094, 4046990333, 2141660100, 2118590568, 3537961224, 168082228, 652073352, 1284401985, 2826423363, 1325707607, 3731067674, 723716237, 3333080471, 2787841228, 2113795590, 3230829213]
[[1843969928, 746019385, 2309594694, 2229839123, 489336389, 779150137, 1446103079, 889348879, 3745936412, 4281307413, 2173665908, 1717438567, 23839969, 2145022294, 1052302893, 1118678081, 1295195691, 3851408124, 3928788718, 2370966349, 561561790, 1051546482, 4224882136, 4206633849, 1329396735, 1801554849, 1138195103, 3123692201, 1122989246, 1270364612, 1593839649, 1463959047, 4229718824, 1709163888, 33163415

In [27]:
a = permutations[0][1]
b = permutations[1][1]

hash_val = (a * hash(tuple(["text", "word", "shingle", "advanced"]))+ b) % HASH_PRIME
print(hash_val)

253594894


In [28]:
hash_rdd = shingle_rdd.map(min_hasher)

for elem in hash_rdd.take(3):
    print(elem)

(2, array([  2642774,  34835332,  73782674,  21266037,  43363373,  63375525,
        64780483, 221547471,  36363255,  51638969,  76963240,  51866191,
        67658569,  22051358,  10384782,  11787093,  11974757,   4601448,
        48843986,  91175009,  39248163,  13055715, 182395884,  96883907,
        28649451,  65482425,  26380695,  40817098,  58945925,  24172262,
        41336337,   2597183,  86945717,  20692157, 100898042,  60416914,
        50895349,  95380440,  17940440,  40390223,  96414874,  21914682,
         8816484,     42951,  39669483,  70060900, 111568449,  38701334,
        21925107, 111343460]))
(3, array([   864466,  52734412,  39592123,  28720135,  53387950,  50144848,
        23262955,  76419385,  81890689,  29974913,  31177450,  89574785,
        91512184,   5498854,  10541739,   5555579,   4679775,  18672544,
       124799783,  15602970,  41537678,  97334216,   2753034,  29277312,
        23256660,  43841607,   3479836,  41146054,  41769611,  38661388,
        6126

# LSH

In [32]:
BANDS: Final = 10
ROWS: Final = 5
THRESHOLD: Final = (1/BANDS) ** (1/ROWS)
print(f"Bands: {BANDS}, rows {ROWS}, threshold {THRESHOLD}")

# returns (doc, band, hash)
hash_band_rdd = hash_rdd.flatMap(lambda x: [[(x[0], i % BANDS), hash] for i, hash in enumerate(x[1])])

for elem in hash_band_rdd.take(51):
    print(elem)

Bands: 10, rows 5, threshold 0.6309573444801932
[(2, 0), 2642774]
[(2, 1), 34835332]
[(2, 2), 73782674]
[(2, 3), 21266037]
[(2, 4), 43363373]
[(2, 5), 63375525]
[(2, 6), 64780483]
[(2, 7), 221547471]
[(2, 8), 36363255]
[(2, 9), 51638969]
[(2, 0), 76963240]
[(2, 1), 51866191]
[(2, 2), 67658569]
[(2, 3), 22051358]
[(2, 4), 10384782]
[(2, 5), 11787093]
[(2, 6), 11974757]
[(2, 7), 4601448]
[(2, 8), 48843986]
[(2, 9), 91175009]
[(2, 0), 39248163]
[(2, 1), 13055715]
[(2, 2), 182395884]
[(2, 3), 96883907]
[(2, 4), 28649451]
[(2, 5), 65482425]
[(2, 6), 26380695]
[(2, 7), 40817098]
[(2, 8), 58945925]
[(2, 9), 24172262]
[(2, 0), 41336337]
[(2, 1), 2597183]
[(2, 2), 86945717]
[(2, 3), 20692157]
[(2, 4), 100898042]
[(2, 5), 60416914]
[(2, 6), 50895349]
[(2, 7), 95380440]
[(2, 8), 17940440]
[(2, 9), 40390223]
[(2, 0), 96414874]
[(2, 1), 21914682]
[(2, 2), 8816484]
[(2, 3), 42951]
[(2, 4), 39669483]
[(2, 5), 70060900]
[(2, 6), 111568449]
[(2, 7), 38701334]
[(2, 8), 21925107]
[(2, 9), 111343460]
[(3,

In [None]:
bands_rdd = hash_band_rdd.map(lambda x: [x[0][1], (x[0][0], x[1])]).groupByKey()

for elem in bands_rdd.take(1):
    print(elem)

# Exit Spark

In [None]:
#spark.stop()