In [1]:
import html
import ntpath
import os
import re
import shutil
import subprocess as sp
import time
import xml.etree.ElementTree as et
from datetime import timedelta

import dask
import dask.bag as bd
import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

## Read the Checkstyle report CSV files for the Java answers

In [2]:
# Lazily read every Checkstyle report CSV matching the glob into one Dask DataFrame (all columns read with dtype=object).
# Malformed rows are skipped without any warning (error_bad_lines=False, warn_bad_lines=False), so later counts exclude them.
# NOTE(review): pandas deprecated both flags in 1.3 and removed them in 2.0 - on a newer pandas use on_bad_lines='skip' instead.
df = dd.read_csv('checkstylexmlreports_csv/checkstyleReport*.csv', engine='python', error_bad_lines=False, warn_bad_lines=False, dtype=object)


## Start a Dask cluster using SLURM jobs as workers

In [3]:
# SLURMCluster reference:
# http://jobqueue.dask.org/en/latest/generated/dask_jobqueue.SLURMCluster.html
dask.config.set(
    {
        "distributed.worker.memory.target": False,  # avoid spilling to disk
        "distributed.worker.memory.spill": False,  # avoid spilling to disk
    }
)
cluster = SLURMCluster(
    cores=2,  # cores requested per SLURM job (alternative that was tried: cores=24)
    processes=2,  # per the dask_jobqueue docs, each job is split into this many worker processes
    memory="8GiB",  # memory requested per SLURM job
    walltime="0-00:30",  # per-job time limit (alternative that was tried: "0-00:50")
    log_directory="../dask/logs",  # folder for SLURM logs for each worker
    local_directory="../dask",  # folder for workers data
)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 36291 instead


Autoscale the cluster between 10 and 200 SLURM jobs and connect a client so that the workers can be used.

In [4]:
# Fixed-size alternative (not used): ask for 20 jobs or workers.
#cluster.scale(n=20)
# Adaptive scaling automatically launches and kills workers based on load;
# here the cluster is allowed to range between 10 and 200 SLURM jobs.
cluster.adapt(minimum_jobs=10, maximum_jobs=200)
#cluster.adapt(maximum_jobs=20)
# Connect a client to the cluster; the bare `client` expression renders its summary.
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://192.168.94.140:44032  Dashboard: http://192.168.94.140:36291/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [5]:
# Display the lazy Dask DataFrame: column names, dtypes and partition count.
df

Unnamed: 0_level_0,name,line,severity,message,source
npartitions=101,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,object,object,object,object,object
,...,...,...,...,...
...,...,...,...,...,...
,...,...,...,...,...
,...,...,...,...,...


In [6]:
# Number of violation rows as loaded, before any cleaning.
# .compute() triggers the actual Dask evaluation of the lazy row count.
lazy_initial_rows = df.shape[0]
checkstyle_codeviolations_init_len = lazy_initial_rows.compute()
print('Initial number of volations {}'.format(checkstyle_codeviolations_init_len))

Initial number of volations 8074616


In [7]:
# Keep only rows that carry a file name: rows whose 'name' column is NaN are dropped.
df = df[~df['name'].isna()]

In [8]:
# Number of violation rows left once rows with a NaN 'name' have been removed.
lazy_clean_rows = df.shape[0]
checkstyle_codeviolations_nonan_len = lazy_clean_rows.compute()
print('Number of violations after removing the nans in the class column {}'.format(checkstyle_codeviolations_nonan_len))

Number of violations after removing the nans in the class column 8074616


In [9]:
# Row count of the filtered DataFrame, displayed as the cell output.
# NOTE(review): this repeats the computation already stored in
# checkstyle_codeviolations_nonan_len; df is never persisted, so each
# .compute() presumably re-evaluates the lazy graph - confirm before keeping.
df.shape[0].compute()

8074616

#### Get the unique classes dataframe

In [10]:
# Distinct values of the 'name' column (the .java file names).
# After .compute() this is a pandas Series, despite the _df suffix.
unique_classes_df = df['name'].drop_duplicates().compute()

In [11]:
# Number of distinct file names, i.e. how many unique classes have a violation.
len(unique_classes_df)

731116

### Get the unique Violations

In [12]:
# Lazy group-by on the 'source' column, which holds the fully qualified Checkstyle check class name.
grp_vio = df.groupby('source')

In [13]:
# Preview the first recorded row of each 'source' group, i.e. one example
# violation per check. This does not count anything; head() only limits how
# many groups are displayed.
grp_vio.first().head()

Unnamed: 0_level_0,name,line,severity,message
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
com.puppycrawl.tools.checkstyle.checks.ArrayTypeStyleCheck,Code_30381834_6434168_4179_3.java,2,error,Array brackets at illegal position.
com.puppycrawl.tools.checkstyle.checks.FinalParametersCheck,Code_30381834_6434168_4179_3.java,2,error,Parameter args should be final.
com.puppycrawl.tools.checkstyle.checks.NewlineAtEndOfFileCheck,Code_30381834_6434168_4179_3.java,1,error,File does not end with a newline.
com.puppycrawl.tools.checkstyle.checks.TodoCommentCheck,Code_47202560_47190189_667_0.java,76,error,Comment matches to-do format 'TODO:'.
com.puppycrawl.tools.checkstyle.checks.UpperEllCheck,Code_54470504_54469628_699_1.java,7,error,Should use uppercase 'L'.


### Count the number of times each violation type occurred

In [14]:
# Tally the violations reported by each Checkstyle check ('source'):
# one row per check, with its number of occurrences in a 'total' column.
# Background on pandas grouping and aggregation:
# https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/
per_check_counts = df.groupby('source')['source'].count().compute()
# Name the counts 'total' and move 'source' from the index into a regular column.
viol_count_df = per_check_counts.to_frame(name='total').reset_index()
viol_count_df

Unnamed: 0,source,total
0,com.puppycrawl.tools.checkstyle.checks.ArrayTy...,28859
1,com.puppycrawl.tools.checkstyle.checks.FinalPa...,863887
2,com.puppycrawl.tools.checkstyle.checks.Newline...,484311
3,com.puppycrawl.tools.checkstyle.checks.TodoCom...,1870
4,com.puppycrawl.tools.checkstyle.checks.UpperEl...,807
...,...,...
56,com.puppycrawl.tools.checkstyle.checks.whitesp...,186132
57,com.puppycrawl.tools.checkstyle.checks.whitesp...,988
58,com.puppycrawl.tools.checkstyle.checks.whitesp...,414293
59,com.puppycrawl.tools.checkstyle.checks.whitesp...,1038215


In [15]:
# Share of all violations attributable to each check, expressed in percent.
all_violations = viol_count_df['total'].sum()
viol_count_df['percent'] = (viol_count_df['total'] / all_violations) * 100

In [16]:
# Show the first rows, now including the 'percent' column.
viol_count_df.head()

Unnamed: 0,source,total,percent
0,com.puppycrawl.tools.checkstyle.checks.ArrayTy...,28859,0.357404
1,com.puppycrawl.tools.checkstyle.checks.FinalPa...,863887,10.6988
2,com.puppycrawl.tools.checkstyle.checks.Newline...,484311,5.997945
3,com.puppycrawl.tools.checkstyle.checks.TodoCom...,1870,0.023159
4,com.puppycrawl.tools.checkstyle.checks.UpperEl...,807,0.009994


### Calculate the number to be sampled per ruleset

In [17]:
import math
def comp_smpl_no(x, sample_size=384):
    """Return how many items to sample for a group holding ``x`` percent of the data.

    The group's share is scaled to ``sample_size`` and rounded with the
    built-in ``round``. Every group gets at least one sample, so the sum of
    the per-group sizes can exceed ``sample_size`` when many groups are tiny.

    Args:
        x: The group's share of the whole population, in percent (0-100).
        sample_size: Overall target sample size. Defaults to 384, the value
            that was previously hard-coded.

    Returns:
        The rounded proportional sample size, or 1 if that would be below 1.
    """
    allotted = round((x / 100) * sample_size)
    # Guarantee that even the rarest group is represented in the sample.
    if allotted < 1:
        allotted = 1
    return allotted

# Per-check sample size: each check's percentage share scaled to a target of
# 384 by comp_smpl_no, which allots at least 1 to every check.
# No rows are drawn in this cell. Kept for reference (not run), a single
# unstratified random draw:
# https://datatofish.com/random-rows-pandas-dataframe/
# chk_style_ran = df.sample(frac=384/len(df), replace=None, random_state=10)
viol_count_df['sample_size_384'] = viol_count_df['percent'].apply(comp_smpl_no)
viol_count_df.head()

Unnamed: 0,source,total,percent,sample_size_384
0,com.puppycrawl.tools.checkstyle.checks.ArrayTy...,28859,0.357404,1
1,com.puppycrawl.tools.checkstyle.checks.FinalPa...,863887,10.6988,41
2,com.puppycrawl.tools.checkstyle.checks.Newline...,484311,5.997945,23
3,com.puppycrawl.tools.checkstyle.checks.TodoCom...,1870,0.023159,1
4,com.puppycrawl.tools.checkstyle.checks.UpperEl...,807,0.009994,1


### Final Sample Used

In [18]:
# Total number of rows that will be sampled across all checks.
# It can exceed 384 because every check is given at least one sample.
viol_count_df['sample_size_384'].sum()

409

### Group by Source

In [19]:
# Lazy group-by on 'source' (same grouping as grp_vio); individual groups are pulled from it with get_group.
grp_df = df.groupby('source')

In [20]:
# Accumulator for the per-check sampled DataFrames.
grp_dfs = []

In [21]:
# Draw the stratified sample: for every Checkstyle check, pick `sample_size`
# random violations and copy the affected .java files into
# sample_ckstyl/<check name in lower case>/.

# Re-create the accumulator here so that re-running this cell does not
# append the same samples a second time.
grp_dfs = []

SNIPPET_DIR = 'codesnippets_java'  # where the analysed .java files live
SAMPLE_ROOT = 'sample_ckstyl'      # root folder of the copied sample
failed_copies = []                 # (source path, reason) for every file that could not be copied

for vio, sample_size in zip(viol_count_df['source'], viol_count_df['sample_size_384']):
    # Last dotted component of the check class name, lower-cased,
    # e.g. '...checks.ArrayTypeStyleCheck' -> 'arraytypestylecheck'.
    subfolder = vio.lower().split('.')[-1]

    # Materialise all violations reported by this check as a pandas DataFrame.
    vio_grp = grp_df.get_group(vio).reset_index(drop=True).compute()

    # Fixed random_state keeps the draw reproducible between runs.
    vio_grp_ran = vio_grp.sample(n=sample_size, random_state=10)
    # Remember the destination subfolder so each row can be mapped to its copied file.
    vio_grp_ran = vio_grp_ran.assign(sample_subfolder=subfolder)
    grp_dfs.append(vio_grp_ran)

    # Create sample_ckstyl/<subfolder> (and the root) if missing; existing folders are fine.
    folder = os.path.join(SAMPLE_ROOT, subfolder)
    os.makedirs(folder, exist_ok=True)

    # Copy the sampled .java files without going through a shell, so unusual
    # characters in a file name cannot change the command being run.
    for filename_to_copy in vio_grp_ran['name']:
        src_path = os.path.join(SNIPPET_DIR, str(filename_to_copy))
        try:
            shutil.copy(src_path, folder)
        except OSError as err:
            # Best effort, as before, but keep track of what went wrong.
            failed_copies.append((src_path, str(err)))

# Report copy failures instead of discarding them silently.
if failed_copies:
    print('Could not copy {} sample file(s):'.format(len(failed_copies)))
    for src_path, reason in failed_copies:
        print('  {} ({})'.format(src_path, reason))
    
    
    

### Observed Test

In [22]:
# Stack the per-check samples collected in grp_dfs into a single pandas DataFrame.
# The original row labels are kept (no ignore_index), so index values may repeat.
com_grp_dfs = pd.concat(grp_dfs)
com_grp_dfs.head()

Unnamed: 0,name,line,severity,message,source,sample_subfolder
245,Code_35861549_10647520_592_0.java,6,error,Array brackets at illegal position.,com.puppycrawl.tools.checkstyle.checks.ArrayTy...,arraytypestylecheck
5866,Code_38850603_38849026_632_0.java,35,error,Parameter e should be final.,com.puppycrawl.tools.checkstyle.checks.FinalPa...,finalparameterscheck
5316,Code_30712003_30711571_426_0.java,38,error,Parameter args should be final.,com.puppycrawl.tools.checkstyle.checks.FinalPa...,finalparameterscheck
8192,Code_19237136_19236980_86_0.java,2,error,Parameter a should be final.,com.puppycrawl.tools.checkstyle.checks.FinalPa...,finalparameterscheck
496,Code_11361395_3389264_537_3.java,2,error,Parameter digits should be final.,com.puppycrawl.tools.checkstyle.checks.FinalPa...,finalparameterscheck


### Sort them in order

In [23]:
# Order the sampled rows by check subfolder first, then by file name (both ascending).
sort_keys = ['sample_subfolder', 'name']
com_grp_dfs = com_grp_dfs.sort_values(sort_keys, ascending=[True, True])

In [24]:
# Show the first rows of the sorted sample.
com_grp_dfs.head()

Unnamed: 0,name,line,severity,message,source,sample_subfolder
245,Code_35861549_10647520_592_0.java,6,error,Array brackets at illegal position.,com.puppycrawl.tools.checkstyle.checks.ArrayTy...,arraytypestylecheck
19,Code_35691321_35691064_861_0.java,13,error,Avoid nested blocks.,com.puppycrawl.tools.checkstyle.checks.blocks....,avoidnestedblockscheck
178,Code_15724003_15723834_378_2.java,3,error,Using the '.*' form of import should be avoide...,com.puppycrawl.tools.checkstyle.checks.imports...,avoidstarimportcheck
347,Code_26706024_26705991_1048_1.java,2,error,Using the '.*' form of import should be avoide...,com.puppycrawl.tools.checkstyle.checks.imports...,avoidstarimportcheck
39,Code_50769992_50767941_1873_3.java,7,error,Name 'random' must match pattern '^[A-Z][A-Z0-...,com.puppycrawl.tools.checkstyle.checks.naming....,constantnamecheck


In [25]:
# Print one "path : line ==> message" entry for every sampled violation.
listing_columns = ['sample_subfolder', 'name', 'line', 'message']
for subfol, name, ln, msg in com_grp_dfs[listing_columns].itertuples(index=False, name=None):
    print('sample_ckstyl/{}/{} : line {} ==> {}'.format(subfol, name, ln, msg))

sample_ckstyl/arraytypestylecheck/Code_35861549_10647520_592_0.java : line 6 ==> Array brackets at illegal position.
sample_ckstyl/avoidnestedblockscheck/Code_35691321_35691064_861_0.java : line 13 ==> Avoid nested blocks.
sample_ckstyl/avoidstarimportcheck/Code_15724003_15723834_378_2.java : line 3 ==> Using the '.*' form of import should be avoided - java.awt.*.
sample_ckstyl/avoidstarimportcheck/Code_26706024_26705991_1048_1.java : line 2 ==> Using the '.*' form of import should be avoided - javax.swing.*.
sample_ckstyl/constantnamecheck/Code_50769992_50767941_1873_3.java : line 7 ==> Name 'random' must match pattern '^[A-Z][A-Z0-9]*(_[A-Z0-9]+)*$'.
sample_ckstyl/designforextensioncheck/Code_10852320_10850563_23_0.java : line 5 ==> Class 'Code_10852320_10850563_23_0' looks like designed for extension (can be subclassed), but the method 'm1' does not have javadoc that explains how to do that safely. If class is not designed for extension consider making the class 'Code_10852320_10850