# Set up Spark

In [1]:
import os
# Set before the SparkSession below is created so that the driver JVM it
# launches is started with the 2g driver-memory setting.
os.environ['PYSPARK_SUBMIT_ARGS'] ="--conf spark.driver.memory=2g  pyspark-shell"

from pyspark.sql import SparkSession
from pyspark.sql.types import (StructType, StructField, StringType, 
                                FloatType, DateType, IntegerType)
# Create (or reuse) a session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("BDA assignment")
    .getOrCreate()
)

# Imports

In [2]:
from typing import NamedTuple, Final, List
#from lxml import etree
import xml.etree.ElementTree as ET
from itertools import islice, chain, combinations
import argparse
import traceback
import bleach
import html
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import pr
import string
import random
import hashlib
import pandas as pd

# Constants

In [3]:
# Presumably the number of consecutive tokens per shingle (see shingle_set);
# no visible cell reads it yet — TODO confirm against the shingling code.
SHINGLE_SIZE: Final = 5
# NOTE(review): not referenced by any visible cell; presumably the number of
# posts to sample — confirm.
SAMPLES: Final = 1000

In [4]:
class comment_tuple(NamedTuple):
    """
    Immutable record describing a single post.

    The fields line up with the Id / PostTypeId / Score / Body values that
    parse_post extracts from each XML row; presumably this is the typed form
    of such a record — TODO confirm, as no visible cell constructs it.
    """
    # Numeric identifier of the post.
    id: int
    #owner_id: int
    # Numeric post type code.
    post_type: int
    # Integer score of the post.
    score: int
    # Text content of the post.
    text: str

class shingle_set(NamedTuple):
    """
    Immutable pairing of an identifier with its set of shingles.

    Each shingle is stored as a tuple; presumably a tuple of SHINGLE_SIZE
    tokens taken from the text with the same id — TODO confirm.
    """
    # Identifier of the item the shingles belong to.
    id: int
    # Hashable, de-duplicated collection of shingles.
    shingles: frozenset[tuple]

class similarity(NamedTuple):
    """
    Immutable record holding the similarity score of a pair of sets.

    The measure that produces the score is not defined in this part of the
    notebook.
    """
    # Identifier of the first set in the pair.
    id_set1: int
    # Identifier of the second set in the pair.
    id_set2: int
    # Similarity score for the pair.
    similarity: float

# Read and clean XML

In [5]:
def set_schema():
    """
    Build the Spark schema for the posts DataFrame.

    Returns a StructType with four nullable columns, in this order:
    Id, PostTypeId and Score (integers) followed by Body (string).
    """
    integer_columns = ("Id", "PostTypeId", "Score")
    fields = [StructField(column, IntegerType(), True)
              for column in integer_columns]
    fields.append(StructField("Body", StringType(), True))

    return StructType(fields)

def _optional_int(attrs, key):
    """Return attrs[key] converted to int, or None when the attribute is absent."""
    raw = attrs.get(key)
    return None if raw is None else int(raw)


def parse_post(rdd):
    """
    Parse one whole-file XML document into a list of post records.

    Parameters
    ----------
    rdd : sequence
        A single input record whose first element (``rdd[0]``) is the complete
        XML document as a string. Despite the name, this is one record and
        not an RDD.

    Returns
    -------
    list of list
        One ``[Id, PostTypeId, Score, Body]`` list per ``<row>`` child of the
        root element, in document order. ``PostTypeId``, ``Score`` and
        ``Body`` are ``None`` when the attribute is missing, matching the
        nullable columns declared by set_schema.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the document is not well-formed XML.
    KeyError
        If a row has no ``Id`` attribute.
    ValueError
        If a numeric attribute is not a valid integer.
    AssertionError
        If a ``<row>`` element carries text content.
    """
    results = []
    root = ET.fromstring(rdd[0])

    for elem in root.findall('row'):
        # All data is expected in attributes; text inside a row means the
        # input does not have the expected shape.
        assert elem.text is None, "The row wasn't empty"

        attrs = elem.attrib
        body = attrs.get("Body")
        # NOTE(review): with only strip=True, bleach keeps the tags on its
        # default allow-list; the sample output in this notebook still shows
        # <ul>, <li> and <code>. Pass an explicit `tags` argument if fully
        # plain text is wanted.
        clean_body = None if body is None else bleach.clean(body, strip=True)

        results.append([
            int(attrs["Id"]),
            _optional_int(attrs, "PostTypeId"),
            _optional_int(attrs, "Score"),
            clean_body,
        ])

    return results

In [6]:
# Input XML file, resolved relative to the working directory.
filename = "cstheory_posts.xml"
# NOTE(review): chunksize is not read by any visible cell — confirm whether it
# is still needed.
chunksize = 1024

schema = set_schema()

# wholetext=True loads the file as a single row, so parse_post receives the
# complete XML document in rdd[0] and can parse it in one pass.
file_rdd = spark.read.text(filename, wholetext=True).rdd
# flatMap flattens the list returned by parse_post into one record per <row>.
records_rdd = file_rdd.flatMap(parse_post)
print(schema)

StructType(List(StructField(Id,IntegerType,true),StructField(PostTypeId,IntegerType,true),StructField(Score,IntegerType,true),StructField(Body,StringType,true)))


In [7]:
# Attach the explicit schema to the parsed records.
df_posts = records_rdd.toDF(schema)
# NOTE(review): collect() materialises every parsed record in driver memory;
# the only visible consumer prints the first 20 entries, so take(20) may be
# enough — confirm no other cell needs the full list.
coll = records_rdd.collect()


In [8]:
# Show the DataFrame's column summary, then preview the first parsed records.
print(df_posts)
# Slicing (rather than indexing over range(20)) avoids an IndexError when
# fewer than 20 records were parsed.
for record in coll[:20]:
    print(record)

DataFrame[Id: int, PostTypeId: int, Score: int, Body: string]
[2, 1, 13, 'I have a dataset which is a number of objects arranged in a 2-D grid.  I know I have a strict ordering, increasing as you go left-to-right within each row, and increasing as top-to-bottom within each column.  For example, \n\n<ul>\n<li>1 2 3</li>\n<li>4 6 7</li>\n<li>5 8 9</li>\n</ul>\n\nCan I improve on naive sorting to sort the entire dataset linearly (as measured in comparisons)?\n\nWhat about for n-d datasets?  Arbitrary finite datasets with a subset of comparisons known?\n']
[3, 1, 8, 'A particular programming problem I came across recently reduces to finding hamiltonian paths in a rectangular grid that would look something like,\n\n<code>A  0  0  0\n\n0  0  0  0\n\n0  0  C  D\n</code>\n\nWhat are some effective heuristics that could be applied to find them - and particularly, techniques to trim/discard paths along the way?\n\nEdit: Just to clarify, the edges are formed when elements are connected horizontal

In [26]:
# Save the posts DataFrame as CSV under ./output, replacing any earlier run.
(
    df_posts.write
    .format("csv")
    .mode("overwrite")
    .save("./output")
)

KeyboardInterrupt: 

# Exit Spark

In [27]:
# Stop the SparkSession created in the setup cell.
spark.stop()