In [1]:
import html
import os
import re
import time
import xml.etree.ElementTree as et
from datetime import timedelta

import dask
import dask.bag as bd
import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

In [2]:
################################################################
# Step 1. Format 'pmd_rules_results.xml' from a Linux terminal #
################################################################
# - 'pmd_rules_results.xml' is not properly formatted
# - We used the xmllint format command to reformat it and saved the result in 'pmd_rules_results_fmt.xml'
# $  xmllint --format pmd_rules_results.xml > pmd_rules_results_fmt.xml
####################################################################################
# Step 2. Remove some lines in 'pmd_rules_results_fmt.xml' from a Linux terminal   #
####################################################################################
# Remove lines 1 to 2 and the last line of 'pmd_rules_results_fmt.xml'
# MacOS $  sed -i '' '1,2d;$d' pmd_rules_results_fmt.xml
# Linux  $  sed -i '1,2d;$d' pmd_rules_results_fmt.xml
####################################################################################
# Step 3. Read the formatted file into a Dask bag                                  #
####################################################################################
# - blocksize breaks the file into partitions
# - linedelimiter makes each bag element end at a closing </violation> tag
#   instead of at every newline
# Other variants that were tried:
#pmd_rules_result_bd = bd.read_text('pmd_rules_results_fmt.xml', blocksize='10MB')
#pmd_rules_result_bd = bd.read_text('pmd_rules_results_fmt.xml', blocksize=1e8, linedelimiter='</violation>\n')
#pmd_rules_result_bd = bd.read_text('pmd_rules_results_fmt.xml', blocksize=None, linedelimiter='</violation>\n')
#pmd_rules_result_bd = bd.read_text('pmd_rules_results.xml', linedelimiter='</violation>\n')
pmd_rules_result_bd = bd.read_text(
    'pmd_rules_results_fmt.xml',
    linedelimiter='</violation>\n',
    blocksize='10MB',
)

In [3]:
# Display the bag's repr to check how many partitions it was split into.
# The partition count alone is available with:
#pmd_rules_result_bd.npartitions
pmd_rules_result_bd

dask.bag<bag-from-delayed, npartitions=558>

In [4]:
# Look at the first element of the bag as it was read from the file
pmd_rules_result_bd.take(1)



In [5]:
# look at the first 50 row
#pmd_rules_result_bd.take(50)

### We are interested in getting the 'violation' elements
- so we can get rid of the `file` tags
- then filter

In [6]:
def strip_file_tags(record):
    """Return ``record`` with the text of any ``<file ...>`` / ``</file>`` tags removed.

    Two kinds of match are deleted (replaced by the empty string):
    - optional whitespace, ``<``, optional whitespace, ``file`` and everything
      up to the last ``>`` on that line;
    - everything on a line up to and including ``/file>``.
    """
    return re.sub(r'\s*<\s*file.*>|.*/file>', '', record)


# Remove the <file ...> and </file> tags so that what is left of each element
# can be accessed using xml.etree.ElementTree
violations_rows_bd = pmd_rules_result_bd.map(strip_file_tags)

In [7]:
# Look at the first element after the file tags have been removed
violations_rows_bd.take(1)



In [8]:
# look at the first 50 row
#violations_rows_bd.take(50)

In [9]:
# Keep only the elements that contain a <violation ...> tag.
# Only '<violation' has to be searched for: XML does not allow whitespace between
# '<' and the tag name, and an expression such as ('<violation' or '< violation')
# evaluates to just its first operand anyway.
# A regex can also be applied here:
#violations_rows_top_bd = violations_rows_bd.filter(lambda line: re.findall(r'\s*<violation', line))
violations_rows_bd = violations_rows_bd.filter(lambda line: '<violation' in line)

In [10]:
# Look at the first element that passed the '<violation' filter
violations_rows_bd.take(1)



In [11]:
# look at the first 50 row
#violations_rows_bd.take(50)

### Use ElementTree to get all the attributes and text in XML or HTML tags

In [12]:
# This will give us just the attributes in the violation tag: <violation ...>
#violations_rows_bd = violations_rows_bd.map(lambda row: et.fromstring(row).attrib)

In [13]:
# This will give us just the text enclosed in the violation tag: <violation ...>
#violations_rows_bd = violations_rows_bd.map(lambda row: {'text':et.fromstring(row).text})

- To get both the attributes and the text in the violation tag
    - We use add_text_to_dict_attrb() to achieve that

In [14]:
def add_text_to_dict_attrb(dictionary, txt):
    """Store ``txt`` under the ``'text'`` key of ``dictionary`` and return it.

    Parameters
    ----------
    dictionary : dict
        Dictionary to extend; it is modified in place.
    txt : str or None
        Text to store. Leading and trailing newline characters are stripped.
        ``None`` (what ElementTree reports for an element without text) is
        stored as an empty string instead of raising ``AttributeError``.

    Returns
    -------
    dict
        The same ``dictionary`` object, now containing the ``'text'`` key.
    """
    dictionary['text'] = '' if txt is None else txt.strip('\n')
    return dictionary

In [15]:
def _violation_to_dict(row):
    """Parse one ``<violation ...> ... </violation>`` string into a dict.

    The XML is parsed a single time; the element's attributes and its enclosed
    text are then combined by add_text_to_dict_attrb().
    """
    element = et.fromstring(row)
    return add_text_to_dict_attrb(element.attrib, element.text)


# This will give us both the attributes in the violation tag and the text enclosed in the violation tag:
# <violation ...> ... </violation>
violations_rows_bd = violations_rows_bd.map(_violation_to_dict)


In [16]:
# Look at the first parsed record (a dict of the violation's attributes plus its 'text')
violations_rows_bd.take(1)

({'beginline': '1',
  'endline': '11',
  'begincolumn': '43',
  'endcolumn': '1',
  'rule': 'UseUtilityClass',
  'ruleset': 'Design',
  'class': 'Code_10000096_9914015_1959_0',
  'externalInfoUrl': 'https://pmd.github.io/pmd-6.39.0/pmd_rules_java_design.html#useutilityclass',
  'priority': '3',

In [17]:
# look at the first 50 row
#violations_rows_bd.take(50)

In [18]:
# Convert the bag of dicts to a Dask DataFrame (one column per dict key)
df = violations_rows_bd.to_dataframe()

In [19]:
# View structure (column names and dtypes; the rows themselves are not computed here)
df

Unnamed: 0_level_0,beginline,endline,begincolumn,endcolumn,rule,ruleset,class,externalInfoUrl,priority,text
npartitions=558,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
,object,object,object,object,object,object,object,object,object,object
,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...


In [20]:
# Compute a single partition (index 1) and display its rows
df.partitions[1].compute()

Unnamed: 0,beginline,endline,begincolumn,endcolumn,rule,ruleset,class,externalInfoUrl,priority,text
0,12,12,11,27,LocalVariableCouldBeFinal,Code Style,Code_10334164_10333780_1673_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,Local variable 't' could be declared final
1,17,17,5,22,SystemPrintln,Best Practices,Code_10334164_10333780_1673_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,2,System.out.println is used
2,11,37,43,1,UseUtilityClass,Design,Code_10334290_10333934_802_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,All methods are static. Consider using a util...
3,11,37,8,1,ClassNamingConventions,Code Style,Code_10334290_10333934_802_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,1,The class name 'Code_10334290_10333934_802_0' ...
4,13,13,29,42,MethodArgumentCouldBeFinal,Code Style,Code_10334290_10333934_802_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,Parameter 'args' is not assigned and could be ...
...,...,...,...,...,...,...,...,...,...,...
24052,3,10,42,1,UseUtilityClass,Design,Code_1066606_1066557_1858_2,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,All methods are static. Consider using a util...
24053,3,10,8,1,ClassNamingConventions,Code Style,Code_1066606_1066557_1858_2,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,1,The class name 'Code_1066606_1066557_1858_2' d...
24054,5,5,25,37,MethodArgumentCouldBeFinal,Code Style,Code_1066606_1066557_1858_2,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,Parameter 'args' is not assigned and could be ...
24055,1,4,8,1,ClassNamingConventions,Code Style,Code_10666159_10666139_1999_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,1,The class name 'Code_10666159_10666139_1999_0'...


In [21]:
# Column names of the DataFrame
df.columns

Index(['beginline', 'endline', 'begincolumn', 'endcolumn', 'rule', 'ruleset',
       'class', 'externalInfoUrl', 'priority', 'text'],
      dtype='object')

In [22]:
# The index will not be particularly meaningful. Use reindex afterwards if necessary.
df.index

Dask Index Structure:
npartitions=558
    int64
      ...
    ...  
      ...
      ...
dtype: int64
Dask Name: to_dataframe, 2232 tasks

In [23]:
# Number of partitions in the Dask DataFrame
df.npartitions

558

In [24]:
# Half the number of partitions, rounded down
df.npartitions // 2

279

In [25]:
# Compute every partition in turn, record its row count and write each
# non-empty partition to its own CSV file.
output_dir = 'pmdcodesnippetsviolations_csv'
# DataFrame.to_csv does not create missing directories, so make sure the
# target directory exists before the first write.
os.makedirs(output_dir, exist_ok=True)

len_dict = {}  # partition number (as a string) -> number of rows in that partition
for i in range(df.npartitions):
    df1 = df.partitions[i].compute()
    value = len(df1)
    if value == 0:
        # Nothing to write for an empty partition; it is also left out of len_dict.
        print('Empty Task {} completed!'.format(i))
        continue
    len_dict[str(i)] = value
    csv_path = os.path.join(output_dir, 'PMDJavaCodeSnippetsViolations{}.csv'.format(i))
    df1.to_csv(csv_path, sep=',', index=False)
    print('Task {} completed!'.format(i))

Task 0 completed!
Task 1 completed!
Task 2 completed!
Task 3 completed!
Task 4 completed!
Task 5 completed!
Task 6 completed!
Task 7 completed!
Task 8 completed!
Task 9 completed!
Task 10 completed!
Task 11 completed!
Task 12 completed!
Task 13 completed!
Task 14 completed!
Task 15 completed!
Task 16 completed!
Task 17 completed!
Task 18 completed!
Task 19 completed!
Task 20 completed!
Task 21 completed!
Task 22 completed!
Task 23 completed!
Task 24 completed!
Task 25 completed!
Task 26 completed!
Task 27 completed!
Task 28 completed!
Task 29 completed!
Task 30 completed!
Task 31 completed!
Task 32 completed!
Task 33 completed!
Task 34 completed!
Task 35 completed!
Task 36 completed!
Task 37 completed!
Task 38 completed!
Task 39 completed!
Task 40 completed!
Task 41 completed!
Task 42 completed!
Task 43 completed!
Task 44 completed!
Task 45 completed!
Task 46 completed!
Task 47 completed!
Task 48 completed!
Task 49 completed!
Task 50 completed!
Task 51 completed!
Task 52 completed!
Tas

In [26]:
# Row counts recorded for the non-empty partitions (a view of len_dict's values)
values = len_dict.values()

In [27]:
# Show the recorded per-partition row counts
values

dict_values([24105, 24057, 23964, 24134, 24016, 24099, 24072, 24160, 24104, 24071, 24107, 23910, 24071, 23877, 24064, 24286, 24207, 24139, 24038, 24060, 23969, 24087, 23942, 23864, 23966, 24035, 23933, 23991, 24030, 23979, 23991, 24074, 24016, 23921, 24075, 24059, 24118, 24143, 23991, 24038, 24057, 24010, 24228, 24171, 24013, 24036, 24012, 24244, 24096, 24078, 24024, 24048, 24060, 24043, 24059, 24109, 24179, 24092, 24188, 24109, 24074, 23948, 23926, 24087, 24113, 24103, 24109, 24076, 24183, 24080, 24019, 24063, 24146, 24145, 24158, 24093, 24216, 24064, 24064, 24204, 24127, 24154, 24072, 24143, 24136, 24137, 24136, 24155, 24120, 24117, 24111, 24130, 24192, 24063, 24124, 24143, 24211, 24054, 24041, 24094, 24043, 24003, 24097, 24185, 24055, 24014, 24139, 24240, 24156, 24143, 24159, 24163, 24225, 24166, 24148, 24114, 24207, 24102, 24130, 24071, 24156, 24076, 24078, 24084, 24069, 24122, 23979, 24079, 24212, 24117, 24185, 24111, 24213, 23987, 24050, 23972, 24060, 24129, 24208, 24083, 24031, 

In [28]:
# Add up the per-partition row counts
total_len = sum(int(row_count) for row_count in values)

In [29]:
# Total number of rows across all partitions recorded in len_dict
print(total_len)

4896537


In [30]:
# Count the entries of len_dict whose recorded row count is zero or negative
counter = sum(1 for row_count in len_dict.values() if row_count <= 0)
print(counter)

0
