# Setup spark

In [1]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] ="pyspark-shell"

from pyspark.sql import SparkSession
from pyspark.sql.functions import flatten
from pyspark.sql.types import (StructType, StructField, StringType, 
                                FloatType, DateType, IntegerType, ArrayType)
spark = SparkSession \
        .builder \
        .master("local[*]") \
        .config("spark.driver.memory", "15g") \
        .appName("BDA assignment") \
        .getOrCreate()

# Imports

In [2]:
from typing import NamedTuple, Final, List
#from lxml import etree
import xml.etree.ElementTree as ET
from itertools import islice, chain, combinations
import argparse
import traceback
import bleach
import html
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import pr
#from pyspark.ml.feature import StopWordsRemover
import string
import random
import hashlib
import pandas as pd
import numpy as np
import hashlib
import json
import sys
from itertools import islice

# Constants

In [3]:
SHINGLE_SIZE: Final = 5
SAMPLES: Final = 1000

In [4]:
class comment_tuple(NamedTuple):
    id: int
    #owner_id: int
    post_type: int
    score: int
    text: str

class shingle_set(NamedTuple):
    id: int
    shingles: frozenset[tuple]

class similarity(NamedTuple):
    id_set1: int
    id_set2: int
    similarity: float

# Read and clean XML

In [5]:
def set_schema():
    """
    Define the schema for the DataFrame
    """
    schema_list = []
    schema_list.append(StructField("Id", IntegerType(), True))
    #schema_list.append(StructField("PostTypeId", IntegerType(), True))
    #schema_list.append(StructField("Score", IntegerType(), True))
    schema_list.append(StructField("Body", StringType(), True))
    
    return StructType(schema_list)

def parse_post(rdd):
    results = []
    root = ET.fromstring(rdd[0])

    for elem in root.findall('row'):
        rec = []
        #print("Found row")
        assert elem.text is None, "The row wasn't empty"
        rec.append(int(elem.attrib["Id"]))
        #int(elem.attrib["OwnerUserId"]),
        #rec.append(int(elem.attrib["PostTypeId"])),
        #rec.append(int(elem.attrib["Score"])),
        rec.append(bleach.clean(elem.attrib["Body"], strip=True))
        #rec.append(elem.attrib["Body"])

        #elem.clear()
        #while elem.getprevious() is not None:
        #    del elem.getparent()[0]
        results.append(rec)

    return results

In [6]:
filename = "sofen_posts.xml"
chunksize = 1024

file_rdd = spark.read.text(filename, wholetext=True).rdd
dataset = file_rdd.flatMap(parse_post)

# Shingling

In [7]:
STOPWORDS: Final = stopwords.words('english')

def tokenize(text: str) -> List[str]:
    text = text.translate(str.maketrans('', '', string.punctuation))
    # text_nop = text.split()
    text_nop = word_tokenize(text)
    filtered_words = []

    for word in text_nop:
        if word not in STOPWORDS:
            filtered_words.append(word.lower())
        
    return filtered_words

    #def create_shingle(self, input_comment: comment_tuple, shingle_size: int) -> frozenset[tuple]:
    #    tokens = self.tokenize(input_comment.text)
    #    comment_length = len(tokens)
    #    shingles =  frozenset(tuple(tokens[i:(i + shingle_size)]) for i in range(comment_length - shingle_size + 1))
    
def create_shingle(post: str, shingle_size: int) -> list[list]:
    tokens = tokenize(post)
    comment_length = len(tokens)
    shingle_set = frozenset(hash(tuple(tokens[i:(i + shingle_size - 1)])) for i in range(comment_length - shingle_size))
    shingle_list = list(shingle_set)
    shingle_list.sort()
    return shingle_list
        

def shingle_map(row):
    return (row[0], create_shingle(row[1], SHINGLE_SIZE)
)

def set_shingle_schema():
    """
    Define the schema for the DataFrame
    """
    schema_list = []
    schema_list.append(StructField("Id", IntegerType(), True))
    #schema_list.append(StructField("PostTypeId", IntegerType(), True))
    #schema_list.append(StructField("Score", IntegerType(), True))
    schema_list.append(StructField("Shingles", ArrayType(ArrayType(StringType()), True)))
    return StructType(schema_list)

In [8]:
schema = set_shingle_schema()
shingle_rdd = dataset.map(shingle_map)

for elem in shingle_rdd.take(1):
    print(elem)

(1, [-7908626704179410037, -4684397193795377420, -3850838529138350569, -1419329986015534448, -473890435429154670, 1666811628384754881, 1736049749176541686, 2043274720988062366, 3773704842148420615, 6410835021690098780, 7162646048271668292, 7303886912402816729, 8156242454416184963])


# MinHashing

In [9]:
# Transform posts to characteristic matrix
# Make feature set matrix
# Minhash
# Make Minhash Matrix
# LSH
import random
SIGNATURE_SIZE: Final = 108
HASH_PRIME: Final = (1 << 31) - 1
MAX_HASH: Final = (1 << 32)
    #sys.maxsize
HASH_RANGE: Final = (1<< 32)
    #sys.maxsize
SEED: Final = 193120

generator = np.random.RandomState(SEED)
salts = [generator.randint(1, MAX_HASH) for _ in range(SIGNATURE_SIZE)]
#permutations = [generator.randint(1, HASH_PRIME) for _ in range(SIGNATURE_SIZE)]

permutations = [[generator.randint(1, MAX_HASH) for _ in range(SIGNATURE_SIZE)],
                [generator.randint(0, MAX_HASH) for _ in range(SIGNATURE_SIZE)]]

print(salts)
print(permutations)
    
def min_hasher(row):
    sig = np.full((SIGNATURE_SIZE), MAX_HASH)
    for shingle in row[1]:
        for i in range(SIGNATURE_SIZE):
            a = permutations[0][i]
            b = permutations[1][i]
            hash_val = (a * shingle + b) % HASH_PRIME
            sig[i] = min(hash_val, sig[i])
    return (row[0], sig)

[2098660572, 74013022, 2675907171, 2481944613, 2289028529, 846582864, 1024003631, 3457496236, 958218556, 2872479854, 1385329197, 2720560315, 2596604670, 4118717270, 3831528778, 4184700433, 130382780, 4221132295, 2492231677, 1349853675, 2674011478, 4230155413, 522369175, 1349933754, 2597169981, 1045439184, 3199517916, 3020468163, 618450142, 3282454786, 4061764145, 3477766529, 3055070885, 204747729, 834378094, 4046990333, 2141660100, 2118590568, 3537961224, 168082228, 652073352, 1284401985, 2826423363, 1325707607, 3731067674, 723716237, 3333080471, 2787841228, 2113795590, 3230829213, 1843969928, 746019385, 2309594694, 2229839123, 489336389, 779150137, 1446103079, 889348879, 3745936412, 4281307413, 2173665908, 1717438567, 23839969, 2145022294, 1052302893, 1118678081, 1295195691, 3851408124, 3928788718, 2370966349, 561561790, 1051546482, 4224882136, 4206633849, 1329396735, 1801554849, 1138195103, 3123692201, 1122989246, 1270364612, 1593839649, 1463959047, 4229718824, 1709163888, 33163415, 

In [10]:
a = permutations[0][1]
b = permutations[1][1]

hash_val = (a * hash(tuple(["text", "word", "shingle", "advanced"]))+ b) % HASH_PRIME
print(hash_val)

hash_val = (a * hash(tuple(["text", "word", "shingle", "advanced"]))+ b) % HASH_PRIME
print(hash_val)

912218934
912218934


In [11]:
hash_rdd = shingle_rdd.map(min_hasher)

for elem in hash_rdd.take(3):
    print(elem)

(1, array([ 380705819,  206310711,  169397232,  242805879,  196719985,
        120780423,    5267621,  122779738,  213707207,  117816972,
         10403537,  140839399,   66796164,    2484015,  215671674,
         99910673,   72982951,    7756024,  191319146,   14268963,
        342049474,  144153609,  154163992,   94071305,    6198873,
         34839679,    1968447,   86450318,  228552477,   57559885,
          4338103,   11708959,  445576657,  137142375,   40891429,
        329415997,  120576054,  105621777,   22562388,   11522930,
         12103337,   95570238,  302486835,   79590142,   64544848,
         49277333,  171433795,  304821042,  127147998,  476383995,
        356733444,  336888445,    7561901,    3583569,   33775729,
        120790216,  520321546,  128893385,  398105571,  121249961,
        114673920,    9917760,  206987876,   26849313,  126021886,
        129517806,  129614437,   31160851,  150483834, 1155766112,
         23797721,   76232838,  231080316,  199437210,  40

# LSH

In [12]:
BANDS: Final = 18
ROWS: Final = 6
THRESHOLD: Final = (1/BANDS) ** (1/ROWS)
print(f"Bands: {BANDS}, rows {ROWS}, threshold {THRESHOLD}")

def hash_func(row):
    sum = 0
    for e in row[1][0]:
        sum += e
    return (row[0], (int.from_bytes(hashlib.md5(str(sum).encode()).digest()[:4], byteorder="big"), row[1][1]))

Bands: 18, rows 6, threshold 0.6177146705271326


In [13]:
# returns (doc, band, hash)
hash_band_rdd = hash_rdd.flatMap(lambda x: [[(x[0], i % BANDS), hash] for i, hash in enumerate(x[1])]).groupByKey().cache()

for elem in hash_band_rdd.take(5):
    print(elem)

((50, 1), <pyspark.resultiterable.ResultIterable object at 0x7f4373329e50>)
((90, 13), <pyspark.resultiterable.ResultIterable object at 0x7f4373329c40>)
((146, 11), <pyspark.resultiterable.ResultIterable object at 0x7f4373329cd0>)
((163, 4), <pyspark.resultiterable.ResultIterable object at 0x7f4373329520>)
((192, 11), <pyspark.resultiterable.ResultIterable object at 0x7f4373329f10>)


In [14]:
hash_bands_grouped_rdd = hash_band_rdd.map(lambda x: [x[0][1], (x[1], x[0][0])])

In [15]:
band_hashed = hash_bands_grouped_rdd.map(hash_func).map(lambda x: [(x[0], x[1][0]), x[1][1]]).groupByKey().filter(lambda x: (len(x[1]) > 1 and len(x[1]) < 50))

for elem in band_hashed.take(10):
    print(elem[0])
    for b in elem[1]:
        print(b)

(5, 3650494298)
162493
162492
(4, 1055400709)
124032
358470
(4, 270235210)
47888
47887
(8, 1080253118)
164255
246984
(14, 3254858744)
254009
278392
(13, 685892717)
179332
179331
(8, 416895200)
205273
205274
(8, 1240301411)
8129
81110
(5, 264407448)
275252
275253
(2, 2422876973)
155250
155251


In [16]:
candidates = band_hashed.map(lambda x: (tuple(x[1]), 1)).reduceByKey(lambda a, b: ((float(a + b) / 10.0))).cache()

for elem in candidates.take(100):
    print(elem)

((205545, 205544), 0.11111111199999998)
((118903, 118902), 0.11111111111112)
((188788, 170671), 1)
((256241, 257123), 0.11112)
((47888, 47887), 0.11112)
((143967, 143966), 0.11111112000000001)
((275359, 275360), 0.11111111119999999)
((325492, 28258), 1)
((412954, 381202), 1)
((297047, 293341), 1)
((43826, 51122), 1)
((344566, 344522), 0.11112)
((310799, 176446), 1)
((404520, 35026), 1)
((132476, 132475), 0.11111112000000001)
((237342, 341375), 1)
((158885, 28193), 1)
((145707, 145708), 0.11200000000000002)
((110311, 297781), 1)
((376718, 48351), 1)
((368967, 251839), 1)
((350691, 350690), 0.11111111111112)
((161912, 167713), 0.11200000000000002)
((172863, 289540), 0.111111111112)
((129296, 405476), 1)
((250583, 386233), 1)
((179342, 179341), 0.11111111199999998)
((179345, 179346), 0.11111111119999999)
((46347, 39415), 0.11200000000000002)
((340783, 229454), 1)
((277966, 209484), 1)
((155231, 146906), 0.1111111111112)
((283944, 283955, 283942, 283943, 283946, 283954, 283947, 283949, 283

In [17]:
cand = candidates.collect()
shingle_dict = shingle_rdd.collectAsMap()

In [18]:
def calc_jaccard(list1, list2):
    return len(set(list1).intersection(list2)) / len(set(list1).union(list2))

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

In [19]:
import itertools

for elem in cand:
    #print(f"Candidates: {elem[0][0]}, {elem[0][1]}; similarity: ", end="")
    #print(f"{calc_jaccard(shingle_dict.get(elem[0][0]), shingle_dict.get(elem[0][1]))}")
    for comb in itertools.combinations(elem[0], 2):
        print(f"Candidates: {comb[0]}, {comb[1]}; similarity: ", end="")
        print(f"{calc_jaccard(shingle_dict.get(comb[0]), shingle_dict.get(comb[1]))}")

Candidates: 205545, 205544; similarity: 1.0
Candidates: 118903, 118902; similarity: 1.0
Candidates: 188788, 170671; similarity: 0.0
Candidates: 256241, 257123; similarity: 0.9572649572649573
Candidates: 47888, 47887; similarity: 0.9318181818181818
Candidates: 143967, 143966; similarity: 1.0
Candidates: 275359, 275360; similarity: 1.0
Candidates: 325492, 28258; similarity: 0.0
Candidates: 412954, 381202; similarity: 0.0
Candidates: 297047, 293341; similarity: 0.0
Candidates: 43826, 51122; similarity: 0.673469387755102
Candidates: 344566, 344522; similarity: 0.810126582278481
Candidates: 310799, 176446; similarity: 0.0
Candidates: 404520, 35026; similarity: 0.0
Candidates: 132476, 132475; similarity: 1.0
Candidates: 237342, 341375; similarity: 0.0
Candidates: 158885, 28193; similarity: 0.0
Candidates: 145707, 145708; similarity: 0.8695652173913043
Candidates: 110311, 297781; similarity: 0.0
Candidates: 376718, 48351; similarity: 0.0
Candidates: 368967, 251839; similarity: 0.0
Candidates:

# Exit Spark

In [20]:
spark.stop()