In [1]:
import html
import os
import re
import time
import xml.etree.ElementTree as et
from datetime import timedelta

import numpy as np
import pandas as pd

import dask
import dask.dataframe as dd
import dask.bag as bd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

In [2]:
################################################################
# Step 1. Format 'pmd_rules_results.xml' from a Linux terminal #
################################################################
# - 'pmd_rules_results.xml' is not properly formatted.
# - The xmllint format command was used to write a formatted copy to
#   'pmd_rules_results_fmt.xml':
#   $  xmllint --format pmd_rules_results.xml > pmd_rules_results_fmt.xml
##############################################################################
# Step 2. Remove some lines in 'pmd_rules_results_fmt.xml' from a terminal   #
##############################################################################
# Remove lines 1 to 2 and the last line of 'pmd_rules_results_fmt.xml':
#   MacOS $  sed -i '' '1,2d;$d' pmd_rules_results_fmt.xml
#   Linux $  sed -i '1,2d;$d' pmd_rules_results_fmt.xml
#
# Step 3. Load the file into a Dask bag.
# - blocksize breaks the file into partitions.
# - '</error>\n' is used as the line delimiter instead of the default
#   (the default-delimiter variant was tried first and then replaced).
pmd_rules_result_bd = bd.read_text(
    'pmd_rules_results_fmt.xml',
    blocksize='10MB',
    linedelimiter='</error>\n',
)

In [3]:
# Check the number of partitions the bag was split into.
# Either read the attribute directly:
#pmd_rules_result_bd.npartitions # or
# display the bag itself; its repr includes npartitions.
pmd_rules_result_bd

dask.bag<bag-from-delayed, npartitions=558>

In [4]:
# Look at the first row -- too large to view even one partition, because the delimiter breaks at almost the tail of the file.
# However, I have tried it with a smaller example on my local desktop
# pmd_rules_result_bd.take(1)

In [5]:
# Look at the first 50 rows -- too large to view
#pmd_rules_result_bd.take(50)

### We are interested in getting the 'errors'
- so we get rid of everything within the file tags
- including the violation tags

In [6]:
# Get rid of everything within the <file ...> ... </file> tags, including
# the <violation ...> ... </violation> tags embedded within them, so that
# only the <error ...> ... </error> part of each record is left.
#
# The element body is matched with [\s\S]*? rather than (.|\n)*?. Both match
# any character lazily (newlines included), but the character class needs no
# capture group and no alternation per consumed character, which keeps the
# substitution cheap on very large records.
errors_rows_bd = pmd_rules_result_bd.map(
    lambda line: re.sub(r'\s*<\s*file.*>[\s\S]*?</file>', '', line)
)

In [7]:
# Look at the first row: the raw text that is left after the
# <file> ... </file> elements have been removed.
errors_rows_bd.take(1)

('\n  <error filename="/scale_wlg_nobackup/filesets/nobackup/uoo03396/SfTI_project_nobackup/SfTI_Projects/StackOverflow_project/my_codesnippet_analysis/codesnippets_java/Code_10000052_9999270_706_1.java" msg="PMDException: Error while parsing /scale_wlg_nobackup/filesets/nobackup/uoo03396/SfTI_project_nobackup/SfTI_Projects/StackOverflow_project/my_codesnippet_analysis/codesnippets_java/Code_10000052_9999270_706_1.java"><![CDATA[net.sourceforge.pmd.PMDException: Error while parsing /scale_wlg_nobackup/filesets/nobackup/uoo03396/SfTI_project_nobackup/SfTI_Projects/StackOverflow_project/my_codesnippet_analysis/codesnippets_java/Code_10000052_9999270_706_1.java\n\tat net.sourceforge.pmd.SourceCodeProcessor.processSourceCodeWithoutCache(SourceCodeProcessor.java:124)\n\tat net.sourceforge.pmd.SourceCodeProcessor.processSourceCode(SourceCodeProcessor.java:100)\n\tat net.sourceforge.pmd.SourceCodeProcessor.processSourceCode(SourceCodeProcessor.java:62)\n\tat net.sourceforge.pmd.processor.PmdR

In [8]:
# Look at the first 50 rows
#errors_rows_bd.take(50)

### Use ElementTree to get all the attributes and text in XML or HTML tags

In [9]:
# This will give us just the attributes in the error tag: <error ...>
#errors_rows_bd = violations_rows_bd.map(lambda row: et.fromstring(row).attrib)

In [10]:
# This will give us just the text enclosed in the error tag: <error ...> ... </error>
#errors_rows_bd = violations_rows_bd.map(lambda row: {'text':et.fromstring(row).text})

- To get both the attributes and the text in the error tag
    - We use add_text_to_dict_attrb() to achieve that

In [11]:
def add_text_to_dict_attrb(dictionary, txt):
    """Store an element's text in its attribute dict under 'error_text'.

    Parameters
    ----------
    dictionary : dict
        Attribute mapping to extend. It is modified in place.
    txt : str or None
        Text to store. Leading and trailing newline characters are removed.
        ``None`` (which ElementTree reports for an element that has no text)
        is stored as an empty string instead of raising AttributeError.

    Returns
    -------
    dict
        The same ``dictionary`` object, now containing the 'error_text' key.
    """
    dictionary['error_text'] = '' if txt is None else txt.strip('\n')
    return dictionary

In [12]:
# Turn each raw <error ...> ... </error> string into a dict holding both the
# attributes of the error tag and the text enclosed in it ('error_text').
# Each row is parsed once and both pieces are read from the same element.
#
# NOTE: this rebinds errors_rows_bd from a bag of strings to a bag of dicts,
# so re-running this cell on its own would try to parse dicts as XML. Re-run
# the cell that builds errors_rows_bd from pmd_rules_result_bd first.
def _error_row_to_dict(row):
    """Parse one <error> XML string into its attribute dict plus 'error_text'."""
    element = et.fromstring(row)
    return add_text_to_dict_attrb(element.attrib, element.text)


errors_rows_bd = errors_rows_bd.map(_error_row_to_dict)


In [13]:
# Look at the first row: a dict with the error tag's attributes
# plus the enclosed text under 'error_text'.
errors_rows_bd.take(1)

({'filename': '/scale_wlg_nobackup/filesets/nobackup/uoo03396/SfTI_project_nobackup/SfTI_Projects/StackOverflow_project/my_codesnippet_analysis/codesnippets_java/Code_10000052_9999270_706_1.java',
  'msg': 'PMDException: Error while parsing /scale_wlg_nobackup/filesets/nobackup/uoo03396/SfTI_project_nobackup/SfTI_Projects/StackOverflow_project/my_codesnippet_analysis/codesnippets_java/Code_10000052_9999270_706_1.java',
  'error_text': 'net.sourceforge.pmd.PMDException: Error while parsing /scale_wlg_nobackup/filesets/nobackup/uoo03396/SfTI_project_nobackup/SfTI_Projects/StackOverflow_project/my_codesnippet_analysis/codesnippets_java/Code_10000052_9999270_706_1.java\n\tat net.sourceforge.pmd.SourceCodeProcessor.processSourceCodeWithoutCache(SourceCodeProcessor.java:124)\n\tat net.sourceforge.pmd.SourceCodeProcessor.processSourceCode(SourceCodeProcessor.java:100)\n\tat net.sourceforge.pmd.SourceCodeProcessor.processSourceCode(SourceCodeProcessor.java:62)\n\tat net.sourceforge.pmd.process

In [14]:
# Look at the first 50 rows of errors_rows_bd
#violations_rows_bd.take(50)

In [15]:
# Convert the bag of dicts into a Dask DataFrame.
df = errors_rows_bd.to_dataframe()

In [16]:
# View the structure: column names, dtypes and the number of partitions.
df

Unnamed: 0_level_0,filename,msg,error_text
npartitions=558,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,object,object,object
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


In [17]:
# Materialize the partition at index 1 to inspect its rows.
df.partitions[1].compute()

Unnamed: 0,filename,msg,error_text


In [19]:
# Column names of the DataFrame.
df.columns

Index(['filename', 'msg', 'error_text'], dtype='object')

In [20]:
# The index will not be particularly meaningful. Use reindex afterwards if necessary.
df.index

Dask Index Structure:
npartitions=558
    int64
      ...
    ...  
      ...
      ...
dtype: int64
Dask Name: to_dataframe, 2232 tasks

In [21]:
# Number of partitions in the DataFrame.
df.npartitions

558

In [22]:
# Half of the partition count, rounded down to a whole number.
df.npartitions // 2

279

In [23]:
# Write every non-empty partition to its own CSV file and record its row
# count in len_dict, keyed by the partition index as a string.
# Partitions are computed one at a time inside the loop.
csv_dir = 'pmdcodesnippetserrors_csv'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(csv_dir, exist_ok=True)

len_dict = {}
for i in range(df.npartitions):
    df1 = df.partitions[i].compute()
    value = len(df1)
    if value == 0:
        # Nothing to save for an empty partition; only report it.
        print('Empty Task {} completed!'.format(i))
        continue
    key = str(i)
    len_dict[key] = value
    df1.to_csv('{}/PMDJavaCodeSnippetsErrors{}.csv'.format(csv_dir, i), sep=',', index=False)
    print('Task {} completed!'.format(i))

Task 0 completed!
Empty Task 1 completed!
Empty Task 2 completed!
Empty Task 3 completed!
Empty Task 4 completed!
Empty Task 5 completed!
Empty Task 6 completed!
Empty Task 7 completed!
Empty Task 8 completed!
Empty Task 9 completed!
Empty Task 10 completed!
Empty Task 11 completed!
Empty Task 12 completed!
Empty Task 13 completed!
Empty Task 14 completed!
Empty Task 15 completed!
Empty Task 16 completed!
Empty Task 17 completed!
Empty Task 18 completed!
Empty Task 19 completed!
Empty Task 20 completed!
Empty Task 21 completed!
Empty Task 22 completed!
Empty Task 23 completed!
Empty Task 24 completed!
Empty Task 25 completed!
Empty Task 26 completed!
Empty Task 27 completed!
Empty Task 28 completed!
Empty Task 29 completed!
Empty Task 30 completed!
Empty Task 31 completed!
Empty Task 32 completed!
Empty Task 33 completed!
Empty Task 34 completed!
Empty Task 35 completed!
Empty Task 36 completed!
Empty Task 37 completed!
Empty Task 38 completed!
Empty Task 39 completed!
Empty Task 40 co

In [24]:
# Row counts recorded for the non-empty partitions (a view over len_dict).
values = len_dict.values()

In [25]:
# Display the per-partition row counts.
values

dict_values([1, 2260, 3451, 3444, 3436, 3428, 3436, 3432, 3435, 3437, 3436, 3449, 3445, 3440, 3447, 3428, 3461, 3436, 3437, 3451, 3453, 3434, 3445, 3448, 3433, 3440, 3441, 3441, 3459, 3449, 3451, 3455, 3463, 3442, 3440, 3455, 3441, 3462, 3446, 3453, 3445, 3438, 3442, 3456, 3451, 3422, 3445, 3441, 3439, 3451, 3444, 3445, 3440, 3442, 3432, 3448, 3429, 3453, 3453, 3464, 3448, 3459, 3444, 3463, 3472, 3463, 3464, 3437, 3460, 3431, 3468, 3467, 3438, 3452, 3452, 3455, 3433, 3436, 3461, 3445, 3461, 3442, 3446, 3452, 3462, 3440, 3444, 3439, 3456, 3452, 3443, 3434, 3442, 3423, 3442, 3431, 3422, 3433, 3443, 3434, 3443, 3450, 3426, 3451, 3446, 3456, 3452, 3440, 3454, 3437, 3455, 3455, 3451, 3469, 3443, 3444, 3452, 3449, 3442, 3434, 3444, 3444, 3456, 3444, 3453, 3442, 3460, 3442, 3442, 3462, 3451, 3456, 3445, 3445, 3445, 3440, 3453, 3436, 3426, 3442, 3421, 3443, 3437, 3437, 3430, 3456, 3414, 3438, 3413, 3421, 3438, 3436, 3436, 3454, 3447, 3457, 3457, 3455, 3460, 3452, 3453, 3460, 3441, 3453, 3435, 

In [26]:
# Total number of rows across all non-empty partitions.
total_len = sum(int(row_count) for row_count in values)

In [27]:
# Report the total number of rows written across all partitions.
print(total_len)

1217373
