In [1]:
import time
from datetime import timedelta

import html
import re

import os
import ntpath

import subprocess as sp

import numpy as np
import pandas as pd
import xml.etree.ElementTree as et

import dask
import dask.dataframe as dd
import dask.bag as bd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

In [2]:
# Enter Path to Code Violations CSV File
code_violation_path = 'pmdcodesnippetsviolations_csv/PMDJavaCodeSnippetsViolations*.csv'
#code_violation_path = 'pmdcodesnippetsviolation_csv/pmdCodeSnippetsViolation*.csv'

# Enter Path to Real Java Code Files before performing the PMD code analisis
java_code_snippets_b4_pmd_ana = 'codesnippets_java'

## Read csv files Java Answers

In [3]:
df = dd.read_csv(code_violation_path, engine='python', error_bad_lines=False, warn_bad_lines=False, dtype=object)

## Start a Dask cluster using SLURM jobs as workers

In [4]:
#http://jobqueue.dask.org/en/latest/generated/dask_jobqueue.SLURMCluster.html
dask.config.set(
    {
        "distributed.worker.memory.target": False,  # avoid spilling to disk
        "distributed.worker.memory.spill": False,  # avoid spilling to disk
    }
)
cluster = SLURMCluster(
    cores=10, #cores=24, # we set each job to have 1 Worker, each using 10 cores (threads) and 8 GB of memory
    processes=2,
    memory="8GiB",
    walltime="0-00:30",# walltime="0-00:50",
    log_directory="../dask/logs",  # folder for SLURM logs for each worker
    local_directory="../dask",  # folder for workers data
)

Spawn between 20 to 100 workers and connect a client to be able use them.

In [5]:
#cluster.scale(n=20) # ask for 20 jobs or workers
# This also works with adaptive clusters. This automatically launches and kill workers based on load.
# we tell our cluster to autoscale between 10 and 20 workers depending on the load
cluster.adapt(minimum_jobs=10, maximum_jobs=200)
#cluster.adapt(maximum_jobs=20)
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://192.168.94.140:44847  Dashboard: http://192.168.94.140:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [6]:
df

Unnamed: 0_level_0,beginline,endline,begincolumn,endcolumn,rule,ruleset,class,externalInfoUrl,priority,text
npartitions=204,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
,object,object,object,object,object,object,object,object,object,object
,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...


In [7]:
pmd_codeviolations_init_len = df.shape[0].compute()
print('Initial number of volations {}'.format(pmd_codeviolations_init_len))

Initial number of volations 4896537


In [8]:
# drop empty columns
df = df[~df['class'].isna()] # Drop rows that have NaN in the Code column

In [9]:
pmd_codeviolations_nonan_len =df.shape[0].compute()
print('Number of violations after removing the nans in the class column {}'.format(pmd_codeviolations_nonan_len))

Number of violations after removing the nans in the class column 4887262


In [10]:
print('Difference {}'.format(pmd_codeviolations_init_len - pmd_codeviolations_nonan_len))

Difference 9275


In [11]:
# drop classes that do not have this pattern e.g., Code_10000096_9914015_1959_0
#df = df[df['class'].str.contains('^Code_\d+_\d+_\d+_\d+', regex=True)]
df = df[df['class'].str.contains('^Code_\d+_\d+_\d+_\d+$', regex=True)]

In [12]:
pmd_codeviolations_unkownclass_len = df.shape[0].compute()
print('Number of violations after removing the onces caused by unknown class e.g. inner classes {}'.format(pmd_codeviolations_nonan_len))

Number of violations after removing the onces caused by unknown class e.g. inner classes 4887262


In [13]:
print('Difference {}'.format(pmd_codeviolations_nonan_len - pmd_codeviolations_unkownclass_len))

Difference 499061


#### Count the unique rulesets

In [14]:
#count the unique rulesets
#df.groupby('ruleset').count().shape[0].compute()
#df.groupby('rule').count().shape[0].compute()

#### Get the unique classes dataframe

In [15]:
# get the unique classes
# not the values in the 1st column are where it found the 1st occurances of each unique item
#df['ruleset'].drop_duplicates().compute()

In [16]:
#rule_df = df[['ruleset']]
rule_df = df[['rule']]
#ruleset_df.head()

In [17]:
# This also counts the rows
rule_df.index.shape[0].compute()

4388201

In [18]:
#  This also counts the unique rows
#df.groupby('ruleset').count().shape[0].compute()
df.groupby('rule').count().shape[0].compute()

219

### Count the number of times each item occured

In [19]:
#https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/
rule_count_df = pd.DataFrame(rule_df.groupby('rule')['rule'].count().compute())
# rename the column to total
rule_count_df.columns=['total']
# drop the index level
rule_count_df = rule_count_df.reset_index()
rule_count_df

Unnamed: 0,rule,total
0,AbstractClassWithoutAbstractMethod,1704
1,AbstractClassWithoutAnyMethod,744
2,AddEmptyString,5681
3,AppendCharacterWithChar,3710
4,ArrayIsStoredDirectly,1745
...,...,...
214,FinalizeOnlyCallsSuperFinalize,7
215,CheckSkipResult,6
216,LocalHomeNamingConvention,1
217,StringBufferInstantiationWithChar,4


### Calculate Percentage

In [20]:
rule_count_df['percent'] = (rule_count_df['total']/ rule_count_df['total'].sum())*100

In [21]:
rule_count_df

Unnamed: 0,rule,total,percent
0,AbstractClassWithoutAbstractMethod,1704,0.038831
1,AbstractClassWithoutAnyMethod,744,0.016955
2,AddEmptyString,5681,0.129461
3,AppendCharacterWithChar,3710,0.084545
4,ArrayIsStoredDirectly,1745,0.039766
...,...,...,...
214,FinalizeOnlyCallsSuperFinalize,7,0.000160
215,CheckSkipResult,6,0.000137
216,LocalHomeNamingConvention,1,0.000023
217,StringBufferInstantiationWithChar,4,0.000091


### Calculate number to be sampled per ruleset

In [22]:
# https://www.surveysystem.com/sscalc.htm
# put 4887262 into the sample size calculator at 5% confidence interval
#ruleset_count_df['perc384'] = round((ruleset_count_df['percent']/ 100)*384, 0)
#ruleset_count_df

In [23]:
#### Better computation
import math

def comp_smpl_no(x):
    x = round((x/100)*384)
    if x < 1:
        x = 1
    return x
rule_count_df['sample_size_384'] = rule_count_df['percent'].apply(comp_smpl_no)
rule_count_df

Unnamed: 0,rule,total,percent,sample_size_384
0,AbstractClassWithoutAbstractMethod,1704,0.038831,1
1,AbstractClassWithoutAnyMethod,744,0.016955,1
2,AddEmptyString,5681,0.129461,1
3,AppendCharacterWithChar,3710,0.084545,1
4,ArrayIsStoredDirectly,1745,0.039766,1
...,...,...,...,...
214,FinalizeOnlyCallsSuperFinalize,7,0.000160,1
215,CheckSkipResult,6,0.000137,1
216,LocalHomeNamingConvention,1,0.000023,1
217,StringBufferInstantiationWithChar,4,0.000091,1


### Final Sample Used

In [24]:
rule_count_df['sample_size_384'].sum()

545

### Group by Ruleset

In [25]:
grp_df = df.groupby('rule')

In [26]:
grp_df.first().head()

Unnamed: 0_level_0,beginline,endline,begincolumn,endcolumn,ruleset,class,externalInfoUrl,priority,text
rule,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AbstractClassWithoutAbstractMethod,3,19,17,1,Best Practices,Code_10119187_10117546_2825_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,This abstract class does not have any abstract...
AbstractClassWithoutAnyMethod,1,5,17,1,Design,Code_10242838_10242755_218_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,1,No abstract method which means that the keywor...
AddEmptyString,71,71,13,14,Performance,Code_10019347_9982894_2634_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,Do not add empty strings
AppendCharacterWithChar,9,9,23,25,Performance,Code_10049223_10049147_1819_1,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,Avoid appending characters as strings in Strin...
ArrayIsStoredDirectly,8,8,30,47,Best Practices,Code_10009891_10008708_1498_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,The user-supplied array 'itemnames' is stored ...


In [27]:
grp_dfs = []

In [28]:
for vio, sample_size in zip(rule_count_df['rule'], rule_count_df['sample_size_384']): # e.g., Best Practices, Coding Style, Design
    
    subfolder = vio.lower()
    
    # Get all the values belonging to, say Best Practices
    vio_grp = grp_df.get_group(vio)
    vio_grp = vio_grp.reset_index(drop=True).compute()
    #vio_grp.head()
    
    # Randomly select that number
    #https://datatofish.com/random-rows-pandas-dataframe/
    vio_grp_ran = vio_grp.sample(n=sample_size, random_state=10)
    vio_grp_ran['sample_subfolder'] = subfolder
    # append the data frame to a list
    grp_dfs.append(vio_grp_ran)
    
    ## Make a folder in that directory
    folder = 'sample_pmd'
    mkdir_cmd = 'mkdir {}'.format(folder)
    cmd = sp.run(
        mkdir_cmd, # command
        capture_output=True,
        text=True,
        shell=True
    )
    
    ## Make a folder in that directory
    folder = 'sample_pmd/{}'.format(subfolder)
    # output: path/to/Post.csv => path/to
    mkdir_cmd = 'mkdir {}'.format(folder)
    cmd = sp.run(
        mkdir_cmd, # command
        capture_output=True,
        text=True,
        shell=True
    )
    
    # copy list of sample java files into the destination 
    for filename_to_copy in vio_grp_ran['class']:
        file_to_copy = '{}.java'.format(filename_to_copy)
        #copy_file_cmd = 'cp codesnippets_java/{} {}'.format(file_to_copy, folder)#
        copy_file_cmd = 'cp {}/{} {}'.format(java_code_snippets_b4_pmd_ana, file_to_copy, folder)#
        
        cmd = sp.run(
            copy_file_cmd, # command
            capture_output=True,
            text=True,
            shell=True
        )

### Observed Test

In [29]:
# Combine all the group dataframe
com_grp_dfs = pd.concat(grp_dfs)
com_grp_dfs.head()

Unnamed: 0,beginline,endline,begincolumn,endcolumn,rule,ruleset,class,externalInfoUrl,priority,text,sample_subfolder
1,1,24,17,1,AbstractClassWithoutAbstractMethod,Best Practices,Code_19285266_19280680_341_1,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,This abstract class does not have any abstract...,abstractclasswithoutabstractmethod
1,1,3,13,4,AbstractClassWithoutAnyMethod,Design,Code_11109133_11108982_3813_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,1,No abstract method which means that the keywor...,abstractclasswithoutanymethod
19,33,33,45,46,AddEmptyString,Performance,Code_59116818_59116662_2175_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,Do not add empty strings,addemptystring
5,7,7,25,27,AppendCharacterWithChar,Performance,Code_9467897_9467854_2670_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,Avoid appending characters as strings in Strin...,appendcharacterwithchar
7,4,4,29,47,ArrayIsStoredDirectly,Best Practices,Code_22177928_22177569_1464_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,The user-supplied array 'commands' is stored d...,arrayisstoreddirectly


### Sort them in order

In [30]:
com_grp_dfs.sort_values('class', ascending=True, inplace=True)
#com_grp_dfs.sort_values('class', ascending=True, inplace=True)
com_grp_dfs.sort_values(['sample_subfolder','class'], ascending=[True, True], inplace=True)

In [31]:
com_grp_dfs.head()

Unnamed: 0,beginline,endline,begincolumn,endcolumn,rule,ruleset,class,externalInfoUrl,priority,text,sample_subfolder
1,1,24,17,1,AbstractClassWithoutAbstractMethod,Best Practices,Code_19285266_19280680_341_1,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,This abstract class does not have any abstract...,abstractclasswithoutabstractmethod
1,1,3,13,4,AbstractClassWithoutAnyMethod,Design,Code_11109133_11108982_3813_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,1,No abstract method which means that the keywor...,abstractclasswithoutanymethod
19,33,33,45,46,AddEmptyString,Performance,Code_59116818_59116662_2175_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,Do not add empty strings,addemptystring
5,7,7,25,27,AppendCharacterWithChar,Performance,Code_9467897_9467854_2670_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,Avoid appending characters as strings in Strin...,appendcharacterwithchar
7,4,4,29,47,ArrayIsStoredDirectly,Best Practices,Code_22177928_22177569_1464_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,The user-supplied array 'commands' is stored d...,arrayisstoreddirectly


In [32]:
for rls, subfol, cls, bln, eln, msg in zip(com_grp_dfs['ruleset'], com_grp_dfs['sample_subfolder'], com_grp_dfs['class'], com_grp_dfs['beginline'], com_grp_dfs['endline'], com_grp_dfs['text']):
    print('sample_pmd/{}/{}.java : lines {} to {} ==> {} ({})'.format(subfol, cls, bln, eln, msg, rls))

sample_pmd/abstractclasswithoutabstractmethod/Code_19285266_19280680_341_1.java : lines 1 to 24 ==> This abstract class does not have any abstract methods (Best Practices)
sample_pmd/abstractclasswithoutanymethod/Code_11109133_11108982_3813_0.java : lines 1 to 3 ==> No abstract method which means that the keyword is most likely used to prevent instantiation. Use a private or protected constructor instead. (Design)
sample_pmd/addemptystring/Code_59116818_59116662_2175_0.java : lines 33 to 33 ==> Do not add empty strings (Performance)
sample_pmd/appendcharacterwithchar/Code_9467897_9467854_2670_0.java : lines 7 to 7 ==> Avoid appending characters as strings in StringBuffer.append. (Performance)
sample_pmd/arrayisstoreddirectly/Code_22177928_22177569_1464_0.java : lines 4 to 4 ==> The user-supplied array 'commands' is stored directly. (Best Practices)
sample_pmd/assignmenttononfinalstatic/Code_51099286_51097885_2851_2.java : lines 6 to 6 ==> Possible unsafe assignment to a non-final stati