In [1]:
import html
import shutil
import subprocess as sp
import time
from datetime import timedelta
from pathlib import Path

import dask
import dask.dataframe as dd
import pandas as pd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

## Read the SpotBugs report CSV files for the Java answers

In [2]:
# Lazily load every SpotBugs report CSV into a single Dask DataFrame.
# All columns are read as strings (dtype=object); malformed lines are skipped without a warning.
# NOTE(review): error_bad_lines / warn_bad_lines are deprecated since pandas 1.3
# (superseded by on_bad_lines) -- confirm the installed pandas version before upgrading.
df = dd.read_csv(
    'spotbugreports_wt_idx_csv/spotbugReport*.csv',
    engine='python',
    error_bad_lines=False,
    warn_bad_lines=False,
    dtype=object,
)

#### Explore dataframe

In [3]:
## explore one partition
#one_pat_df = df.partitions[1].compute()

In [4]:
# Display the Dask DataFrame: the repr shows column names, dtypes and the number of partitions, not row values.
df

Unnamed: 0_level_0,category,type,sourcefile,shortMsg,bugline
npartitions=577,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,object,object,object,object,object
,...,...,...,...,...
...,...,...,...,...,...
,...,...,...,...,...
,...,...,...,...,...


## Start a Dask cluster using SLURM jobs as workers

In [5]:
# SLURMCluster reference:
# http://jobqueue.dask.org/en/latest/generated/dask_jobqueue.SLURMCluster.html
dask.config.set(
    {
        "distributed.worker.memory.target" : False,  # avoid spilling to disk
        "distributed.worker.memory.spill" : False,  # avoid spilling to disk
    }
)
# Each SLURM job requests 10 cores and 8 GiB and is split into 2 worker processes
# (presumably 5 threads and about 4 GiB per process -- confirm against the dask_jobqueue docs).
cluster = SLURMCluster(
    cores=10,  # cores requested per job (previously tried: cores=24)
    processes=2,
    memory="8GiB",
    walltime="0-00:30",  # presumably SLURM days-hours:minutes, i.e. 30 minutes (previously tried: "0-00:50")
    log_directory="../dask/logs",  # folder for SLURM logs for each worker
    local_directory="../dask",  # folder for workers data
)

Let the cluster scale adaptively between 10 and 200 worker jobs, and connect a client to be able to use them.

In [6]:
# Alternative: a fixed-size cluster, e.g. cluster.scale(n=20) to ask for 20 jobs.
# Adaptive scaling automatically launches and kills workers based on load;
# here the cluster is allowed to scale between 10 and 200 SLURM jobs.
cluster.adapt(minimum_jobs=10, maximum_jobs=200)
# Connect a client to the cluster and display its summary.
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://192.168.94.140:45527  Dashboard: http://192.168.94.140:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [7]:
# Count the rows before any filtering (this triggers a Dask computation).
df_init_len = df.shape[0].compute()
print('Initial number of violations {}'.format(df_init_len))

Initial number of volations 138036


In [8]:
# Preview the first rows of the raw reports.
df.head()

Unnamed: 0,category,type,sourcefile,shortMsg,bugline
0,PERFORMANCE,URF_UNREAD_FIELD,Code_10203353_10202514_4576_2.java,Unread field,lines 1-2
1,PERFORMANCE,URF_UNREAD_FIELD,Code_10203353_10202514_4576_2.java,Unread field,noline
2,PERFORMANCE,URF_UNREAD_FIELD,Code_10203353_10202514_4576_2.java,Unread field,line 2
3,PERFORMANCE,URF_UNREAD_FIELD,Code_10306917_10306647_1215_0.java,Unread field,line 2
4,PERFORMANCE,URF_UNREAD_FIELD,Code_10306917_10306647_1215_0.java,Unread field,lines 1-2


In [9]:
#df.tail()

In [10]:
# Keep only the records that carry the line(s) where the violation occurred,
# i.e. drop every row whose bugline is the placeholder 'noline'.
df = df[df['bugline'] != 'noline']

In [11]:
# Number of records left in df.
# Originally, the distinct number of type is 41545;
# however, SpotBugs reports errors such that some are repeated more than once for one class,
# hence we now have 98121.
df.index.shape[0].compute()

98121

In [12]:
# Preview the first rows of the current df.
df.head()

Unnamed: 0,category,type,sourcefile,shortMsg,bugline
0,PERFORMANCE,URF_UNREAD_FIELD,Code_10203353_10202514_4576_2.java,Unread field,lines 1-2
2,PERFORMANCE,URF_UNREAD_FIELD,Code_10203353_10202514_4576_2.java,Unread field,line 2
3,PERFORMANCE,URF_UNREAD_FIELD,Code_10306917_10306647_1215_0.java,Unread field,line 2
4,PERFORMANCE,URF_UNREAD_FIELD,Code_10306917_10306647_1215_0.java,Unread field,lines 1-2
6,PERFORMANCE,URF_UNREAD_FIELD,Code_10488060_10487618_3943_0.java,Unread field,line 2


### Count the number of times each 'type' occurred

In [13]:
# Reference: https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/
# Count the records per violation type, name the count column 'total' and
# turn the 'type' index into a regular column.
viol_count_df = (
    df.groupby('type')['type']
    .count()
    .compute()
    .rename('total')
    .reset_index()
)
viol_count_df

Unnamed: 0,type,total
0,DM_STRING_CTOR,3263
1,SIC_INNER_SHOULD_BE_STATIC,5567
2,SS_SHOULD_BE_STATIC,2831
3,URF_UNREAD_FIELD,60510
4,UUF_UNUSED_FIELD,16932
5,WMI_WRONG_MAP_ITERATOR,686
6,MS_SHOULD_BE_FINAL,894
7,MS_FINAL_PKGPROTECT,229
8,DM_NEXTINT_VIA_NEXTDOUBLE,1892
9,MS_PKGPROTECT,711


### Calculate the percentage

In [14]:
# Share of each violation type among all counted violations, in percent.
viol_count_df['percent'] = viol_count_df['total'].div(viol_count_df['total'].sum()).mul(100)

In [15]:
# Preview the per-type counts together with the 'percent' column.
viol_count_df.head()

Unnamed: 0,type,total,percent
0,DM_STRING_CTOR,3263,3.325486
1,SIC_INNER_SHOULD_BE_STATIC,5567,5.673607
2,SS_SHOULD_BE_STATIC,2831,2.885213
3,URF_UNREAD_FIELD,60510,61.668756
4,UUF_UNUSED_FIELD,16932,17.256245


### Calculate number to be sampled per ruleset
- to represent the distribution of the types of violations

In [16]:
import math
def comp_smpl_no(x, sample_total=384):
    """Return how many records to sample for a violation type.

    Parameters
    ----------
    x : float
        Share of the violation type among all violations, in percent.
    sample_total : int, optional
        Overall sample size the share is applied to. Defaults to 384,
        the value that used to be hard-coded here.

    Returns
    -------
    int
        ``round(x / 100 * sample_total)``, raised to 1 when the rounded value
        is smaller, so that every violation type contributes at least one record.
    """
    return max(1, round((x / 100) * sample_total))

# Apply comp_smpl_no to each type's percentage to get the planned sample size per type.
# No rows are selected here; this cell only computes the sizes.
viol_count_df['sample_size_384'] = viol_count_df['percent'].apply(comp_smpl_no)
viol_count_df.head()

Unnamed: 0,type,total,percent,sample_size_384
0,DM_STRING_CTOR,3263,3.325486,13
1,SIC_INNER_SHOULD_BE_STATIC,5567,5.673607,22
2,SS_SHOULD_BE_STATIC,2831,2.885213,11
3,URF_UNREAD_FIELD,60510,61.668756,237
4,UUF_UNUSED_FIELD,16932,17.256245,66


### Final Sample Used

In [17]:
# Total planned sample size over all types; it can differ from 384 because each
# per-type size is rounded and raised to at least 1.
viol_count_df['sample_size_384'].sum()

396

### Group by violation type

In [18]:
# Group the Dask DataFrame by violation type.
grp_df = df.groupby('type')

In [19]:
# List that collects one sampled DataFrame per violation type.
grp_dfs = []

In [20]:
# For every violation type: draw a reproducible random sample of its records and
# copy the sampled Java snippets into sample_sptbg/<violation type in lower case>/.
grp_dfs = []  # start from an empty list so re-running this cell does not duplicate samples
snippet_dir = Path('codesnippets_java')
failed_copies = []  # (source path, reason) for every snippet that could not be copied

for vio, sample_size in zip(viol_count_df['type'], viol_count_df['sample_size_384']):
    # The subfolder is named after the violation type in lower case, e.g. 'dm_string_ctor'
    subfolder = vio.lower()

    # Materialise all records of this violation type as a pandas DataFrame
    vio_grp = grp_df.get_group(vio).reset_index(drop=True).compute()

    # Randomly select the planned number of records (fixed seed for reproducibility).
    # Never ask for more rows than the group holds, which would raise a ValueError.
    # https://datatofish.com/random-rows-pandas-dataframe/
    n_rows = min(int(sample_size), len(vio_grp))
    vio_grp_ran = vio_grp.sample(n=n_rows, random_state=10)
    # Record the subfolder so the copied java file can be located later
    vio_grp_ran = vio_grp_ran.assign(sample_subfolder=subfolder)
    grp_dfs.append(vio_grp_ran)

    # Create sample_sptbg/<subfolder>; missing parents are created and an
    # already existing folder is not an error
    folder = 'sample_sptbg/{}'.format(subfolder)
    Path(folder).mkdir(parents=True, exist_ok=True)

    # Copy the sampled java files into the destination without going through a shell,
    # so unusual characters in a file name cannot be interpreted as shell syntax
    for filename_to_copy in vio_grp_ran['sourcefile']:
        file_to_copy = snippet_dir / str(filename_to_copy)
        try:
            shutil.copy(file_to_copy, folder)
        except OSError as err:
            # Keep going (best effort), but remember the failure for the report below
            failed_copies.append((str(file_to_copy), str(err)))

# Copy problems used to be swallowed silently; report them instead
if failed_copies:
    print('WARNING: {} file(s) could not be copied'.format(len(failed_copies)))
    for failed_path, reason in failed_copies:
        print('  {} : {}'.format(failed_path, reason))

### Observed Test

In [21]:
# Combine all the group dataframe
com_grp_dfs = pd.concat(grp_dfs)
com_grp_dfs.head()

Unnamed: 0,category,type,sourcefile,shortMsg,bugline,sample_subfolder
10,PERFORMANCE,DM_STRING_CTOR,Code_7114630_7114546_2987_0.java,Method invokes inefficient new String(String) ...,line 2,dm_string_ctor
4,PERFORMANCE,DM_STRING_CTOR,Code_19938540_19938482_2572_1.java,Method invokes inefficient new String(String) ...,line 2,dm_string_ctor
7,PERFORMANCE,DM_STRING_CTOR,Code_43791827_43791584_1968_0.java,Method invokes inefficient new String(String) ...,line 2,dm_string_ctor
1,PERFORMANCE,DM_STRING_CTOR,Code_20166109_9607903_2381_12.java,Method invokes inefficient new String(String) ...,line 11,dm_string_ctor
7,PERFORMANCE,DM_STRING_CTOR,Code_40367841_4570037_54_8.java,Method invokes inefficient new String(String) ...,line 2,dm_string_ctor


### Sort them in order

In [22]:
# Order the combined sample by violation subfolder, then by source file name (both ascending).
# Rebinding the name instead of using inplace=True keeps the cell chainable.
com_grp_dfs = com_grp_dfs.sort_values(['sample_subfolder', 'sourcefile'])

In [23]:
# Preview the combined sample, which the previous cell sorted by sample_subfolder and sourcefile.
# head() must be the last expression of the cell, otherwise nothing is displayed.
com_grp_dfs.head()

In [25]:
# Print one line per sampled violation: file location, reported line(s), message and category.
for row in com_grp_dfs.itertuples(index=False):
    print('sample_sptbg/{}/{} : {} ==> {} ({})'.format(
        row.sample_subfolder, row.sourcefile, row.bugline, row.shortMsg, row.category
    ))

sample_sptbg/bx_unboxing_immediately_reboxed/Code_33182091_33177256_1548_8.java : line 3 ==> Boxed value is unboxed and then immediately reboxed (PERFORMANCE)
sample_sptbg/dm_boxed_primitive_for_compare/Code_38933380_38933098_721_1.java : line 7 ==> Boxing a primitive to compare (PERFORMANCE)
sample_sptbg/dm_boxed_primitive_for_parsing/Code_13066419_13066225_2049_0.java : lines 1-2 ==> Boxing/unboxing to parse a primitive (PERFORMANCE)
sample_sptbg/dm_boxed_primitive_for_parsing/Code_23017269_23014428_350_0.java : lines 1-8 ==> Boxing/unboxing to parse a primitive (PERFORMANCE)
sample_sptbg/dm_boxed_primitive_for_parsing/Code_58049037_58048233_1212_0.java : line 29 ==> Boxing/unboxing to parse a primitive (PERFORMANCE)
sample_sptbg/dm_boxed_primitive_for_parsing/Code_587564_587280_493_4.java : line 2 ==> Unread field (PERFORMANCE)
sample_sptbg/dm_boxed_primitive_for_parsing/Code_9722839_9722360_2045_0.java : line 6 ==> Unread field (PERFORMANCE)
sample_sptbg/dm_boxed_primitive_tostring