In [1]:
import time
from datetime import timedelta

import html
import re

import os
import ntpath

import subprocess as sp

import numpy as np
import pandas as pd
import xml.etree.ElementTree as et

import dask
import dask.dataframe as dd
import dask.bag as bd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

## Read csv files Java Answers

In [2]:
df = dd.read_csv('pmdcodesnippetsviolations_csv/PMDJavaCodeSnippetsViolations*.csv', engine='python', error_bad_lines=False, warn_bad_lines=False, dtype=object)


## Start a Dask cluster using SLURM jobs as workers

In [3]:
#http://jobqueue.dask.org/en/latest/generated/dask_jobqueue.SLURMCluster.html
dask.config.set(
    {
        "distributed.worker.memory.target": False,  # avoid spilling to disk
        "distributed.worker.memory.spill": False,  # avoid spilling to disk
    }
)
cluster = SLURMCluster(
    cores=10, #cores=24, # we set each job to have 1 Worker, each using 10 cores (threads) and 8 GB of memory
    processes=2,
    memory="8GiB",
    walltime="0-00:30",# walltime="0-00:50",
    log_directory="../dask/logs",  # folder for SLURM logs for each worker
    local_directory="../dask",  # folder for workers data
)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 42475 instead


Spawn between 20 to 100 workers and connect a client to be able use them.

In [4]:
#cluster.scale(n=20) # ask for 20 jobs or workers
# This also works with adaptive clusters. This automatically launches and kill workers based on load.
# we tell our cluster to autoscale between 10 and 20 workers depending on the load
cluster.adapt(minimum_jobs=10, maximum_jobs=200)
#cluster.adapt(maximum_jobs=20)
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://192.168.94.140:46208  Dashboard: http://192.168.94.140:42475/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [5]:
df

Unnamed: 0_level_0,beginline,endline,begincolumn,endcolumn,rule,ruleset,class,externalInfoUrl,priority,text
npartitions=204,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
,object,object,object,object,object,object,object,object,object,object
,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...


In [6]:
pmd_codeviolations_init_len = df.shape[0].compute()
print('Initial number of volations {}'.format(pmd_codeviolations_init_len))

Initial number of volations 4896537


In [7]:
# drop empty columns
df = df[~df['class'].isna()] # Drop rows that have NaN in the Code column

In [8]:
pmd_codeviolations_nonan_len =df.shape[0].compute()
print('Number of violations after removing the nans in the class column {}'.format(pmd_codeviolations_nonan_len))

Number of violations after removing the nans in the class column 4887262


In [9]:
print('Difference {}'.format(pmd_codeviolations_init_len - pmd_codeviolations_nonan_len))

Difference 9275


In [10]:
# drop classes that do not have this pattern e.g., Code_10000096_9914015_1959_0
#df = df[df['class'].str.contains('^Code_\d+_\d+_\d+_\d+', regex=True)]
df = df[df['class'].str.contains('^Code_\d+_\d+_\d+_\d+$', regex=True)]

In [11]:
pmd_codeviolations_unkownclass_len = df.shape[0].compute()
print('Number of violations after removing the onces caused by unknown class e.g. inner classes {}'.format(pmd_codeviolations_nonan_len))

Number of violations after removing the onces caused by unknown class e.g. inner classes 4887262


In [12]:
print('Difference {}'.format(pmd_codeviolations_nonan_len - pmd_codeviolations_unkownclass_len))

Difference 499061


### Group dataframe by ruleset

In [13]:
grp_df = df.groupby('ruleset')

In [14]:
# get just the 1st of each group
grp_df.first().head()

Unnamed: 0_level_0,beginline,endline,begincolumn,endcolumn,rule,class,externalInfoUrl,priority,text
ruleset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Best Practices,8,8,9,26,SystemPrintln,Code_10000096_9914015_1959_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,2,System.out.println is used
Code Style,1,11,8,1,ClassNamingConventions,Code_10000096_9914015_1959_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,1,The class name 'Code_10000096_9914015_1959_0' ...
Design,1,11,43,1,UseUtilityClass,Code_10000096_9914015_1959_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,All methods are static. Consider using a util...
Documentation,20,21,62,9,UncommentedEmptyMethodBody,Code_1000205_995514_363_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,Document empty method body
Error Prone,2,2,16,23,BeanMembersShouldSerialize,Code_10000948_10000578_714_1,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,"Found non-transient, non-static member. Please..."


In [15]:
# Get all the values belonging to, say Best Practices
best_practices = grp_df.get_group('Best Practices').compute()
best_practices.head()

Unnamed: 0,beginline,endline,begincolumn,endcolumn,rule,ruleset,class,externalInfoUrl,priority,text
11,8,8,9,26,SystemPrintln,Best Practices,Code_10000096_9914015_1959_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,2,System.out.println is used
26,18,18,13,30,SystemPrintln,Best Practices,Code_10000439_9999961_2190_4,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,2,System.out.println is used
38,44,44,24,24,UnusedAssignment,Best Practices,Code_10000439_9999961_2190_4,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,The initializer for variable 'read' is never u...
74,3,3,21,24,UnusedPrivateField,Best Practices,Code_10001108_10000983_1098_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,Avoid unused private fields such as 'mBoo'.
78,10,10,31,39,UnusedPrivateField,Best Practices,Code_10001108_10000983_1098_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,Avoid unused private fields such as 'mOnChange'.


In [16]:
# reset index
best_practices.reset_index(drop=True, inplace=True)
best_practices.head()

Unnamed: 0,beginline,endline,begincolumn,endcolumn,rule,ruleset,class,externalInfoUrl,priority,text
0,8,8,9,26,SystemPrintln,Best Practices,Code_10000096_9914015_1959_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,2,System.out.println is used
1,18,18,13,30,SystemPrintln,Best Practices,Code_10000439_9999961_2190_4,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,2,System.out.println is used
2,44,44,24,24,UnusedAssignment,Best Practices,Code_10000439_9999961_2190_4,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,The initializer for variable 'read' is never u...
3,3,3,21,24,UnusedPrivateField,Best Practices,Code_10001108_10000983_1098_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,Avoid unused private fields such as 'mBoo'.
4,10,10,31,39,UnusedPrivateField,Best Practices,Code_10001108_10000983_1098_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,Avoid unused private fields such as 'mOnChange'.


In [17]:
# Randomly select that number
#https://datatofish.com/random-rows-pandas-dataframe/
best_practices_ran = best_practices.sample(n=46)
best_practices_ran.head()

Unnamed: 0,beginline,endline,begincolumn,endcolumn,rule,ruleset,class,externalInfoUrl,priority,text
49708,2,2,15,17,UnusedPrivateMethod,Best Practices,Code_41861554_33558564_1165_1,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,Avoid unused private methods such as 'h()'.
251518,27,27,17,34,SystemPrintln,Best Practices,Code_64915718_64914778_204_14,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,2,System.out.println is used
20201,7,7,5,23,AvoidPrintStackTrace,Best Practices,Code_39106811_39106614_343_0,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,3,Avoid printStackTrace(); use a logger call ins...
377309,5,5,5,22,SystemPrintln,Best Practices,Code_22691362_22690261_129_1,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,2,System.out.println is used
409483,17,17,13,28,SystemPrintln,Best Practices,Code_26009252_20134992_4602_1,https://pmd.github.io/pmd-6.39.0/pmd_rules_jav...,2,System.out.print is used


## Make a folder in that directory

In [18]:
## Make a folder in that directory
#folder = '{}/sample_pmd_csv'.format(common_path)
folder = 'pmd_sample'.format()
# output: path/to/Post.csv => path/to
mkdir_cmd = 'mkdir {}'.format(folder)
cmd = sp.run(
    mkdir_cmd, # command
    capture_output=True,
    text=True,
    shell=True
)

In [19]:
#print(folder)

In [20]:
#create a folder

In [21]:
# Select these files and dump them somewhere
# copy these files to another folder
for filename_to_copy in best_practices_ran['class']:
    #print('{}.java'.format(filename))
    file_to_copy = '{}.java'.format(filename_to_copy)
    copy_file_cmd = 'cp codesnippets_java/{} {}'.format(file_to_copy, folder)
    cmd = sp.run(
        copy_file_cmd, # command
        capture_output=True,
        text=True,
        shell=True
    )