In [2]:
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

import pandas as pd
from pandas import DataFrame as df

import numpy as np
import re
import csv

# Select dask's multiprocessing scheduler as the default for the dask computations in this notebook.
dask.config.set(scheduler='multiprocessing')

<dask.config.set at 0x2197e1766a0>

### Combine 3 Datasets

In [None]:
'''
TODO:
    Function to efficiently search for answer code block values
    Execute code blocks (ANTLR)
'''

# Regex to extract code blocks
* Find max length block and transfer to new column
* Check for the "Error" sentinel and drop those rows

## Parse main code block

In [71]:
def extract_code_block(body) -> str:
    """Return the longest ``<code>...</code>`` snippet contained in ``body``.

    Args:
        body: HTML text of a post. Anything that is not a string (for
            example ``None`` or NaN from a missing cell) is treated as a
            post without code.

    Returns:
        The text of the longest code element, or the sentinel string
        ``"Error"`` when no code element is found.
    """
    if not isinstance(body, str):
        return "Error"
    # DOTALL lets '.' cross line breaks; without it every multi-line
    # snippet is silently skipped.
    snippets = re.findall(r'<code>(.*?)</code>', body, flags=re.DOTALL)
    if not snippets:
        return "Error"
    return max(snippets, key=len)

### Testing

In [72]:
# Fixtures: one body with three <code> snippets of increasing length, one body without any.
string1 = "Testing <code>long</code> something about this block <code>longer</code> xyz <code>longest</code>"
string2 = "Test without code"

In [73]:
# Two-row frame holding the fixture bodies in a single 'body' column.
df = pd.DataFrame({'body': [string1, string2]})

In [74]:
# Store the longest snippet of every body in a new 'block' column.
df['block'] = df['body'].map(extract_code_block)

In [75]:
# Display the frame to check the extracted 'block' values.
df

Unnamed: 0,body,block
0,Testing <code>long</code> something about this...,longest
1,Test without code,Error


## Parse Body

In [4]:
def extract_body_block(body) -> str:
    """Return the lower-cased paragraph text of a post with inline tags removed.

    Args:
        body: HTML text of a post. Anything that is not a string (for
            example ``None`` or NaN from a missing cell) is treated as a
            post without paragraphs.

    Returns:
        The contents of every ``<p>...</p>`` element, stripped of nested
        tags, lower-cased and joined with newlines; or the sentinel string
        ``"Error"`` when the body has no paragraph.
    """
    if not isinstance(body, str):
        return "Error"
    # DOTALL lets '.' cross line breaks, so paragraphs (and tags) that span
    # several lines are handled instead of being dropped.
    paragraphs = re.findall(r'<p>(.*?)</p>', body, flags=re.DOTALL)
    if not paragraphs:
        return "Error"
    tag = re.compile(r'<.*?>', flags=re.DOTALL)
    return '\n'.join(tag.sub("", paragraph).lower() for paragraph in paragraphs)

## Structure of x, y pre-clean

* Parse body and code for all rows
* drop all columns apart from id, accepted id and text
* pair the code block of the accepted answer with the body of the original question in a new frame

## Cleaning

* Compile code blocks?
* Clean y data, maybe remove contractions?
* Maybe generate new pairs to enrich language?



In [5]:
def to_pandas(path) -> pd.DataFrame:
    """Load the parquet dataset at ``path`` through dask and return it as a pandas frame.

    Progress of the dask computation is reported with a ``ProgressBar``.
    """
    lazy_frame = dd.read_parquet(path, engine='fastparquet')
    print("Converting to Pandas Frame...")
    with ProgressBar():
        materialised = lazy_frame.compute()
        print("Done")
    return materialised

In [6]:
def parse_body_blocks(path) -> pd.DataFrame:
    """Load a parquet post dump and reduce it to the cleaned paragraph text.

    Args:
        path: Location of the parquet dataset, read with dask/fastparquet.

    Returns:
        A pandas frame in which the ``id``, ``score``, ``title``, ``tags``,
        ``body`` and ``post_type_id`` columns have been replaced by a
        ``text`` column (see ``extract_body_block``). Rows whose text could
        not be extracted are removed. The index produced by the parquet
        read is kept unchanged.
    """
    lazy_posts = dd.read_parquet(path, engine='fastparquet')

    print("Converting to Pandas Frame...")
    with ProgressBar():
        posts = lazy_posts.compute()
        print("Done")

    # Plain pandas work from here on: a dask ProgressBar would report nothing.
    print("Extracting body text...")
    posts['text'] = posts['body'].map(extract_body_block)
    print("Done")

    posts = posts.drop(columns=['id', 'score', 'title', 'tags', 'body', 'post_type_id'])
    return remove_errors(posts, 'text')

In [7]:
def remove_errors(frame, type) -> pd.DataFrame:
    """Return ``frame`` without the rows whose ``type`` column equals "Error".

    Args:
        frame: Frame to filter; it is not modified.
        type: Name of the column that carries the "Error" sentinel.
    """
    print("Removing empty {} blocks...".format(type))
    is_valid = frame[type].ne("Error")
    cleaned = frame.loc[is_valid]
    print("Done")
    return cleaned

In [61]:
def pairs(frame, y_path) -> pd.DataFrame:
    """Join parsed questions with their accepted answers.

    Args:
        frame: Answer frame whose ``id`` column holds the answer id as a
            string without a decimal part (e.g. ``"123"``).
        y_path: CSV of parsed questions containing an
            ``accepted_answer_id`` column.

    Returns:
        The inner merge of the questions with ``frame`` on
        ``accepted_answer_id == id``. ``accepted_answer_id`` is returned as
        a string. Questions whose accepted id is missing or not numeric
        are left out (they cannot match any answer).
    """
    questions = pd.read_csv(y_path, engine='python', encoding='utf-8')

    # A single missing value makes pandas read the column as float, and
    # str(123.0) == "123.0" would never match the answer id "123". Go
    # through an integer first so the key is always the bare digits.
    answer_ids = pd.to_numeric(questions['accepted_answer_id'], errors='coerce')
    has_answer = answer_ids.notna()
    questions = questions.loc[has_answer].copy()
    questions['accepted_answer_id'] = answer_ids[has_answer].astype('int64').astype(str)

    return pd.merge(questions, frame, how='inner', left_on='accepted_answer_id', right_on='id')

In [8]:
def prep_y(path, out) -> None:
    """Parse the posts stored at ``path`` and write those with a non-zero accepted answer id to ``out``.

    Args:
        path: Parquet dataset handed to ``parse_body_blocks``.
        out: Destination CSV file.
    """
    parsed = parse_body_blocks(path)
    has_accepted = parsed['accepted_answer_id'] != 0
    parsed[has_accepted].to_csv(out)

In [2]:
def y_indices(paths) -> list:
    """Collect the ``accepted_answer_id`` values of several question CSVs.

    Args:
        paths: Iterable of CSV paths, each with an ``accepted_answer_id``
            column.

    Returns:
        All accepted answer ids in file order, then row order. An empty
        ``paths`` yields an empty list.
    """
    columns = [
        pd.read_csv(path, engine='python', encoding='utf-8')['accepted_answer_id']
        for path in paths
    ]
    # pd.concat raises on an empty sequence, so handle "no files" explicitly.
    if not columns:
        return []
    return pd.concat(columns).to_list()

## Preparation

### Each language set is parsed and stored in a new csv

In [21]:
# Raw strings keep the Windows backslashes literal; in a normal string
# sequences such as '\P', '\d' or '\y' are invalid escapes that Python
# only tolerates with a warning.
for language in ('python', 'java', 'c'):
    prep_y(
        r'D:\PROJECT\dataset_select\{}.parq'.format(language),
        r'D:\PROJECT\dataset_select\y\{}.csv'.format(language),
    )

Converting to Pandas Frame...
[########################################] | 100% Completed | 14.3s
Done
Extracting body text...
Done
Removing empty text blocks...
Done
Converting to Pandas Frame...
[########################################] | 100% Completed | 28.2s
Done
Extracting body text...
Done
Removing empty text blocks...
Done
Converting to Pandas Frame...
[########################################] | 100% Completed |  8.7s
Done
Extracting body text...
Done
Removing empty text blocks...
Done


## Creating new parquet with only answers

* Post type 2 corresponds to answers
* Other columns are dropped to save space during mapping

In [3]:
# Lazily open the full post dump; the raw string avoids the invalid '\P' / '\d' / '\p' escapes.
posts = dd.read_parquet(r'D:\PROJECT\dataset_parq\posts.parq', engine='fastparquet')

In [4]:
# Keep only the answers (post type 2) and drop the columns not needed for the mapping.
answers = posts.query("post_type_id == 2")
answers = answers.drop(['accepted_answer_id', 'score', 'title', 'tags', 'post_type_id'], axis='columns')
# repartition returns a new collection rather than changing `answers` in
# place, so the result has to be bound for the repartitioning to take effect.
answers = answers.repartition(npartitions=100)
answers

Unnamed: 0_level_0,id,body
npartitions=100,Unnamed: 1_level_1,Unnamed: 2_level_1
,int64,object
,...,...
...,...,...
,...,...
,...,...


In [5]:
# Write all answers to one CSV file; the raw string avoids the invalid '\P' / '\d' / '\c' escapes.
with ProgressBar():
    answers.to_csv(r'D:\PROJECT\dataset_select\clean_answers.csv', single_file=True)

[########################################] | 100% Completed | 10min 36.8s


## Retrieve all post indices and reduce answer set

In [6]:
# Accepted answer ids of all three language sets (raw strings avoid invalid escape sequences).
indices = y_indices([
    r'D:\PROJECT\dataset_select\y\python.csv',
    r'D:\PROJECT\dataset_select\y\java.csv',
    r'D:\PROJECT\dataset_select\y\c.csv',
])

In [7]:
# Sort the collected ids in place.
# NOTE(review): the later `isin` lookup does not depend on ordering — confirm whether this sort is still needed.
indices.sort()

In [14]:
# Stream the answer dump in chunks and keep only the accepted answers.
wanted_ids = set(indices)
reduced_path = r'D:\PROJECT\dataset_select\clean_reduced.csv'
answer_chunks = pd.read_csv(r'D:\PROJECT\dataset_select\clean_answers.csv', chunksize=100000)

for chunk_number, chunk in enumerate(answer_chunks):
    # Boolean filtering keeps the integer dtype of `id`; `where(...)` would
    # blank the rejected rows with NaN and turn every id into a float
    # ("123.0"). dropna() still discards rows with any missing value.
    kept = chunk[chunk['id'].isin(wanted_ids)].dropna()
    # Start a fresh file on the first chunk and write the header only once,
    # so re-running the cell does not append duplicates and header lines do
    # not end up between the data rows.
    is_first = chunk_number == 0
    kept.to_csv(reduced_path, mode='w' if is_first else 'a', header=is_first)

In [29]:
# Load the reduced answers with both columns as strings (raw string avoids invalid escape sequences).
answers = pd.read_csv(
    r'D:\PROJECT\dataset_select\clean_reduced.csv',
    engine='python',
    encoding='utf-8',
    dtype={'id': 'str', 'body': 'str'},
)

In [42]:
# Remove the index columns that the CSV round trips left behind ("Unnamed: ...").
unnamed_columns = answers.columns[answers.columns.str.contains('unnamed', case=False)]
answers.drop(unnamed_columns, axis=1, inplace=True)

In [43]:
# Remove only a trailing ".0". str.strip('.0') treats its argument as a set
# of characters and would also eat real digits ("100.0" -> "1").
answers['id'] = answers['id'].str.replace(r'\.0$', '', regex=True)

In [40]:
# Let pandas infer its preferred dtypes for every column and keep the converted frame.
answers = answers.convert_dtypes() 

In [50]:
# Cast the ids to str so they can be merged against the string `accepted_answer_id` key built in `pairs`.
answers['id'] = answers['id'].astype(str)

## Merge posts and answers

In [62]:
# Raw string avoids the invalid '\P' / '\d' / '\y' / '\p' escape sequences.
python = pairs(answers, r'D:\PROJECT\dataset_select\y\python.csv')

In [64]:
# Raw string avoids the invalid '\P' / '\d' / '\y' / '\c' escape sequences.
c = pairs(answers, r'D:\PROJECT\dataset_select\y\c.csv')

In [65]:
# Raw string avoids the invalid '\P' / '\d' / '\y' / '\j' escape sequences.
java = pairs(answers, r'D:\PROJECT\dataset_select\y\java.csv')