In [1]:
import dask
import dask.bag as db
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

import pandas as pd
from pandas import DataFrame as df
import re
from datetime import datetime
import numpy as np
import pickle

import seaborn as sns
import matplotlib.pyplot as plt

dask.config.set(scheduler='multiprocessing')
dask.config.set({"array.slicing.split_large_chunks": True})
    
%matplotlib inline

### Combine 3 Datasets

In [None]:
'''
TODO:
    Bring datasets in with dask
    Convert to pandas
    Combine datasets
    Parse Code and Context
'''

# Regex to extract code blocks
* Find max length block and transfer to new column
* Check for "ERROR" and drop columns

## Parse main code block

In [2]:
def extract_code_block(body):
    pattern = r'<code>(.*?)</code>'
    match = re.findall(pattern, body)
    return "Error" if not match else str(max(match, key=len))

### Testing

In [3]:
string1 = "Testing <code>long</code> something about this block <code>longer</code> xyz <code>longest</code>"
string2 = "Test without code"

In [4]:
df = pd.DataFrame()
df['body'] = [string1, string2]

In [5]:
df['block'] = df['body'].map(lambda x : extract_code_block(x))

In [6]:
df

Unnamed: 0,body,block
0,Testing <code>long</code> something about this...,longest
1,Test without code,Error


## Parse Body

In [3]:
def extract_body_block(body):
    pattern = r'<p>(.*?)</p>'
    code = r'(<.*?>)'
    match = re.findall(pattern, body)
    if not match:
        return "Error"
    else:
        match = [re.sub(code, "", m) for m in match]
        strings = [str(m).lower() for m in match]
        return '\n'.join(strings)

## Structure of x, y pre-clean

* Parse body and code for all rows
* drop all colums apart from id, accepted id, score, text and code
* pair code block of accepted with body of original into new frame

## Cleaning

* Compile code blocks?
* Clean y data, maybe remove contractions?
* Maybe generate new pairs to enrich language?



In [14]:
def parse_blocks(path):
    tmp = dd.read_parquet(path, engine='fastparquet')  

    print("Converting to Pandas Frame...")
    with ProgressBar():
        df = tmp.compute()
        df.reset_index()
        print("Done")

    print("Extracting body text...")
    with ProgressBar():
        df['text'] = df['body'].map(lambda x : extract_body_block(x))
        print("Done")
    
    print("Extracting body code...")
    with ProgressBar():
        df['code'] = df['body'].map(lambda x : extract_code_block(x))
        print("Done")
    
    df.drop(['score', 'title', 'tags', 'body', 'post_type_id'], axis='columns', inplace=True)

    return df

In [5]:
def remove_errors(frame):
    print("Removing empty code blocks...")
    frame = frame[frame['code'] != "Error"]
    print("Done")
    print("Removing empty text blocks...")
    frame = frame[frame['text'] != "Error"]
    print("Done")
    return frame

## NEED TO FIND OUT FORMAT OF ACCEPTED ANSWER 'answer_id'

In [None]:
def pairs(frame):
    pass

In [None]:
def extract_pipe(path):
    tmp = parse_blocks(path)
    tmp = remove_errors(tmp)
    frame = pairs(tmp)
    return frame

## Preparation

In [6]:
python = parse_blocks('D:\PROJECT\dataset_select\python.parq')
python = remove_errors(python)

Converting to Pandas Frame...
[########################################] | 100% Completed | 13.1s
Done
Extracting body text...
Done
Extracting body code...
Done
Removing empty code blocks...
Done
Removing empty text blocks...
Done


In [7]:
java = parse_blocks('D:\PROJECT\dataset_select\java.parq')
java = remove_errors(java)

Converting to Pandas Frame...
[########################################] | 100% Completed | 22.0s
Done
Extracting body text...
Done
Extracting body code...
Done
Removing empty code blocks...
Done
Removing empty text blocks...
Done


In [8]:
c = parse_blocks('D:\PROJECT\dataset_select\c.parq')
c = remove_errors(c)

Converting to Pandas Frame...
[########################################] | 100% Completed |  8.6s
Done
Extracting body text...
Done
Extracting body code...
Done
Removing empty code blocks...
Done
Removing empty text blocks...
Done


Unnamed: 0,id,post_type_id,accepted_answer_id,text,code
0,4,1,7,i want to use a track-bar to change a form's o...,Track-Bar
2,9,1,1404,given a datetime representing a person's birth...,DateTime
3,11,1,1248,"given a specific datetime value, how do i disp...",DateTime
16,59,1,43110,i have a datatable with a name column. i want ...,DataTable
38,174,1,0,i want to print html from a c# web service. t...,dialogArguments.___IE_PrintType
...,...,...,...,...,...
9243,69058919,1,0,admit i have 2 entities:\ncategory\nsubcategor...,OnModelCreating
9414,69059282,1,0,i am making a little project with react.js and...,Migrate entirely to HTTPS to have cookies sent...
9475,69059400,1,0,i'm trying to report mysqlexceptions to my exc...,Console.ReadKey()
10002,69060492,1,0,using .net core 3.1 in a non-production enviro...,UseDeveloperExceptionPage()
