# Posts.xml: 72 GB of Stack Overflow posts
    
#### Attributes read from each `row` element
  - Id
  - Body
  - Tags

# Text Processing
For each post:
  - Get the post Id
  - Get the Tags
  - Extract the code snippets (`<code>` elements) from the post Body
  - Write the post Id, code snippets and tags as one row of a CSV file
  
 # References 
 [Reading Wikipedia XML Dumps with Python](https://www.heatonresearch.com/2017/03/03/python-basic-wikipedia-parsing.html)

In [3]:
#Import libraries 
import xml.etree.ElementTree as ET
import codecs
import csv
import time
import os
from bs4 import BeautifulSoup
import re


In [4]:
# Input/output locations and run-wide counters.
# Raw string: the Windows path contains a backslash, and '\s' is not a valid
# escape sequence (newer Python versions emit a warning for it).
PATH_POSTS_XML = r'E:\stackoverflow.com-Posts'
FILENAME_POSTS = 'Posts.xml'
FILENAME_CODE_TAGS = 'PostsCode_Tags.csv'
ENCODING = "utf-8"
# Full paths of the XML dump to read and of the CSV file to write.
pathPostsXML = os.path.join(PATH_POSTS_XML, FILENAME_POSTS)
pathPosts_CODE_TAGS = os.path.join(PATH_POSTS_XML, FILENAME_CODE_TAGS)
# Running totals, incremented while the posts are parsed.
totalPostCount = 0
CodeCount = 0
TagsCount = 0


In [5]:
def hms_string(sec_elapsed):
    """Format a duration given in seconds as ``H:MM:SS.ss``.

    Hours are not padded; minutes are zero-padded to two digits and seconds
    are shown with two decimals, zero-padded to a width of five characters.
    """
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    # Minutes are taken from what is left after removing the whole hours.
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, seconds)

In [None]:
# Stream the posts XML and write one CSV row per post: id, code snippets, tags.
start_time = time.time()
# Compiled once; the Tags attribute of every post is scanned with this pattern.
tagPattern = re.compile(r"<(.*?)>")
# newline="" leaves line endings to the csv module (avoids doubled line breaks).
with open(pathPosts_CODE_TAGS, "w", encoding=ENCODING, newline="") as PostsFH:
    postsWriter = csv.writer(PostsFH, quoting=csv.QUOTE_MINIMAL)
    postsWriter.writerow(['id', 'Codes', 'Tags'])
    # Read from the path assembled from PATH_POSTS_XML / FILENAME_POSTS instead
    # of a second hard-coded literal containing invalid '\s' and '\p' escapes.
    parseEvents = ET.iterparse(pathPostsXML, events=("start", "end"))
    # The first event is the start of the root element. Keep a handle on it so
    # finished rows can be detached; elem.clear() alone would leave every
    # emptied <row> attached to the root and memory would grow with the file.
    _, rootElem = next(parseEvents)
    for event, elem in parseEvents:
        if event != "end" or elem.tag != "row":
            continue
        totalPostCount += 1
        # Get post Id
        postId = elem.attrib['Id']
        # Parse Body with an explicitly named parser so the result does not
        # depend on which optional parsers happen to be installed.
        codeSoup = BeautifulSoup(elem.attrib.get('Body', ''), "html.parser")
        # Get Code Elements (kept as markup, including the <code> tags)
        codeElements = codeSoup.find_all("code")
        CodeCount += len(codeElements)
        codeString = " ".join(str(x) for x in codeElements)
        # Get Tags; the attribute is absent on some rows, leaving an empty cell.
        tagString = ""
        if 'Tags' in elem.attrib:
            tagsList = tagPattern.findall(elem.attrib['Tags'])
            TagsCount += len(tagsList)
            tagString = ", ".join(tagsList)
        postsWriter.writerow([postId, codeString, tagString])
        # Progress indicator every 100,000 posts.
        if totalPostCount % 100000 == 0:
            print("{:,}".format(totalPostCount))
        # Release the processed row and drop it from the root.
        # NOTE(review): assumes <row> elements are direct children of the root
        # element - confirm against the dump layout.
        elem.clear()
        rootElem.clear()

In [7]:
# Summarise the run: totals gathered while parsing and the wall-clock duration.
elapsed_time = time.time() - start_time

print("Total Posts: {:,}".format(totalPostCount))
print("Code Snippets: {:,}".format(CodeCount))
print("Tags: {:,}".format(TagsCount))
# elapsed_time was computed but never shown; report it with hms_string.
print("Elapsed time: {}".format(hms_string(elapsed_time)))

Total Posts: 45,919,817
Code Snippets: 109,734,471
Tags: 109,734,471


In [11]:
    """
    Splits a CSV file into multiple pieces.
    
    A quick adaptation built on the Python csv library.
    Arguments:
        `filehandler`: An open file (or other iterable of lines) holding the CSV to split.
        `delimiter`: The field delimiter used for reading and writing. ',' by default.
        `row_limit`: The number of rows you want in each output file. 10,000 by default.
        `output_name_template`: A %s-style template for the numbered output files.
        `output_path`: Where to put the output files.
        `keep_headers`: Whether or not to print the headers in each output file.
    Example usage:
    
        >>> from toolbox import csv_splitter
        >>> csv_splitter.split(open('/home/ben/input.csv', 'r'))
    
    """
def split(filehandler, delimiter=',', row_limit=10000,
          output_name_template='output_%s.csv', output_path='.', keep_headers=True,
          encoding=None):
    """Split the CSV read from *filehandler* into numbered output files.

    Output files are named ``output_name_template % piece_number`` with piece
    numbers starting at 1. The first file is always created, even when the
    input holds no data rows.

    Args:
        filehandler: Open text file (or other iterable of lines) with the CSV.
        delimiter: Field delimiter used for both reading and writing.
        row_limit: Maximum number of data rows written to each output file.
        output_name_template: %-style template that receives the piece number.
        output_path: Directory in which the output files are created.
        keep_headers: When true, the first input row is treated as the header
            and repeated at the top of every output file.
        encoding: Text encoding of the output files; ``None`` keeps the
            default used by ``open()``.
    """
    reader = csv.reader(filehandler, delimiter=delimiter)
    # next(reader, None) replaces the Python 2 only reader.next(); an empty
    # input simply yields no header instead of raising.
    headers = next(reader, None) if keep_headers else None

    def open_piece(piece_number):
        # Create one output file and write the header row into it if requested.
        piece_path = os.path.join(output_path, output_name_template % piece_number)
        # newline='' leaves line endings to the csv module.
        handle = open(piece_path, 'w', newline='', encoding=encoding)
        piece_writer = csv.writer(handle, delimiter=delimiter)
        if headers is not None:
            piece_writer.writerow(headers)
        return handle, piece_writer

    current_piece = 1
    current_limit = row_limit
    current_out_file, current_out_writer = open_piece(current_piece)
    try:
        for i, row in enumerate(reader):
            if i + 1 > current_limit:
                # The current piece is full: close it and start the next one.
                current_out_file.close()
                current_piece += 1
                current_limit = row_limit * current_piece
                current_out_file, current_out_writer = open_piece(current_piece)
            # Every data row is written, to whichever piece is current.
            current_out_writer.writerow(row)
    finally:
        # Close the last piece even if reading or writing failed part-way.
        current_out_file.close()