In [1]:
import time
from datetime import timedelta
import html

import pandas as pd
import dask
import dask.dataframe as dd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

## Read csv error files

In [2]:
# Lazily read every PMD error CSV into one dask dataframe, keeping all columns as objects.
# `on_bad_lines='skip'` silently drops malformed rows; it replaces the
# `error_bad_lines=False, warn_bad_lines=False` pair, which pandas deprecated in 1.3
# and removed in 2.0.
pmd_codeerrors_df = dd.read_csv(
    'pmdcodesnippetserrors_csv/PMDJavaCodeSnippetsErrors*.csv',
    engine='python',
    on_bad_lines='skip',
    dtype=object,
)


In [3]:
# Displaying the lazy dask dataframe shows only its columns, dtypes and partition count.
pmd_codeerrors_df

Unnamed: 0_level_0,filename,msg,error_text
npartitions=356,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,object,object,object
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


## Explore one partition

In [4]:
# Materialise the partition at index 1 as an in-memory pandas dataframe for inspection.
one_pat_df = pmd_codeerrors_df.get_partition(1).compute()

In [5]:
# List the column names of the partition as they were read from the CSV files.
one_pat_df.columns

Index(['filename', 'msg', 'error_text'], dtype='object')

In [6]:
# Rename the columns by position:
#   filename -> filepath, msg -> error_msg, error_text -> error_txt
# The assignment is positional, so it requires the frame to have exactly three columns.
one_pat_df.columns = ['filepath', 'error_msg', 'error_txt']

In [7]:
# Confirm that the new column names are in place.
one_pat_df.columns

Index(['filepath', 'error_msg', 'error_txt'], dtype='object')

In [8]:
# Preview the first rows of the partition.
one_pat_df.head()

Unnamed: 0,filepath,error_msg,error_txt
0,/scale_wlg_nobackup/filesets/nobackup/uoo03396...,PMDException: Error while parsing /scale_wlg_n...,net.sourceforge.pmd.PMDException: Error while ...
1,/scale_wlg_nobackup/filesets/nobackup/uoo03396...,PMDException: Error while parsing /scale_wlg_n...,net.sourceforge.pmd.PMDException: Error while ...
2,/scale_wlg_nobackup/filesets/nobackup/uoo03396...,PMDException: Error while parsing /scale_wlg_n...,net.sourceforge.pmd.PMDException: Error while ...
3,/scale_wlg_nobackup/filesets/nobackup/uoo03396...,PMDException: Error while parsing /scale_wlg_n...,net.sourceforge.pmd.PMDException: Error while ...
4,/scale_wlg_nobackup/filesets/nobackup/uoo03396...,PMDException: Error while parsing /scale_wlg_n...,net.sourceforge.pmd.PMDException: Error while ...


In [9]:
# Preview the last rows of the partition.
one_pat_df.tail()

Unnamed: 0,filepath,error_msg,error_txt
2255,/scale_wlg_nobackup/filesets/nobackup/uoo03396...,PMDException: Error while parsing /scale_wlg_n...,net.sourceforge.pmd.PMDException: Error while ...
2256,/scale_wlg_nobackup/filesets/nobackup/uoo03396...,PMDException: Error while parsing /scale_wlg_n...,net.sourceforge.pmd.PMDException: Error while ...
2257,/scale_wlg_nobackup/filesets/nobackup/uoo03396...,PMDException: Error while parsing /scale_wlg_n...,net.sourceforge.pmd.PMDException: Error while ...
2258,/scale_wlg_nobackup/filesets/nobackup/uoo03396...,PMDException: Error while parsing /scale_wlg_n...,net.sourceforge.pmd.PMDException: Error while ...
2259,/scale_wlg_nobackup/filesets/nobackup/uoo03396...,PMDException: Error while parsing /scale_wlg_n...,net.sourceforge.pmd.PMDException: Error while ...


In [12]:
# Show the full, untruncated error message of the row labelled 0.
one_pat_df.loc[0, 'error_msg']

'PMDException: Error while parsing /scale_wlg_nobackup/filesets/nobackup/uoo03396/SfTI_project_nobackup/SfTI_Projects/StackOverflow_project/my_codesnippet_analysis/codesnippets_java/Code_10000056_9999581_707_0.java'

In [11]:
# Show the full error text (Java stack trace) of the row labelled 0.
one_pat_df.loc[0, 'error_txt']

'net.sourceforge.pmd.PMDException: Error while parsing /scale_wlg_nobackup/filesets/nobackup/uoo03396/SfTI_project_nobackup/SfTI_Projects/StackOverflow_project/my_codesnippet_analysis/codesnippets_java/Code_10000056_9999581_707_0.java\n\tat net.sourceforge.pmd.SourceCodeProcessor.processSourceCodeWithoutCache(SourceCodeProcessor.java:124)\n\tat net.sourceforge.pmd.SourceCodeProcessor.processSourceCode(SourceCodeProcessor.java:100)\n\tat net.sourceforge.pmd.SourceCodeProcessor.processSourceCode(SourceCodeProcessor.java:62)\n\tat net.sourceforge.pmd.processor.PmdRunnable.call(PmdRunnable.java:85)\n\tat net.sourceforge.pmd.processor.PmdRunnable.call(PmdRunnable.java:29)\n\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)\n\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadP

In [15]:
# Show the full error text of a row near the end of the partition (label 2257).
one_pat_df.loc[2257, 'error_txt']

'net.sourceforge.pmd.PMDException: Error while parsing /scale_wlg_nobackup/filesets/nobackup/uoo03396/SfTI_project_nobackup/SfTI_Projects/StackOverflow_project/my_codesnippet_analysis/codesnippets_java/Code_10105301_10103998_3090_0.java\n\tat net.sourceforge.pmd.SourceCodeProcessor.processSourceCodeWithoutCache(SourceCodeProcessor.java:124)\n\tat net.sourceforge.pmd.SourceCodeProcessor.processSourceCode(SourceCodeProcessor.java:100)\n\tat net.sourceforge.pmd.SourceCodeProcessor.processSourceCode(SourceCodeProcessor.java:62)\n\tat net.sourceforge.pmd.processor.PmdRunnable.call(PmdRunnable.java:85)\n\tat net.sourceforge.pmd.processor.PmdRunnable.call(PmdRunnable.java:29)\n\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n\tat java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)\n\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.Threa

## Extract just the file names into a new column

In [50]:
# Derive the bare file name (last path component with its extension stripped) from the path.
# Adapted from:
# https://stackoverflow.com/questions/58181729/how-to-extract-the-filename-from-a-string-using-regular-expression
# The pattern is anchored to the end of the string, so a dot inside a directory name
# can never be mistaken for the start of the extension. Paths whose last component
# has no dot yield NaN.
one_pat_df['filename'] = one_pat_df['filepath'].str.extract(r'([^/]+)\.[^/.]*$', expand=False)

In [52]:
# Check that the new `filename` column was added next to the original columns.
one_pat_df.head()

Unnamed: 0,filepath,error_msg,error_txt,filename
0,/scale_wlg_nobackup/filesets/nobackup/uoo03396...,PMDException: Error while parsing /scale_wlg_n...,net.sourceforge.pmd.PMDException: Error while ...,Code_10000056_9999581_707_0
1,/scale_wlg_nobackup/filesets/nobackup/uoo03396...,PMDException: Error while parsing /scale_wlg_n...,net.sourceforge.pmd.PMDException: Error while ...,Code_10000056_9999581_707_1
2,/scale_wlg_nobackup/filesets/nobackup/uoo03396...,PMDException: Error while parsing /scale_wlg_n...,net.sourceforge.pmd.PMDException: Error while ...,Code_10000227_10000008_264_2
3,/scale_wlg_nobackup/filesets/nobackup/uoo03396...,PMDException: Error while parsing /scale_wlg_n...,net.sourceforge.pmd.PMDException: Error while ...,Code_10000284_9996647_708_0
4,/scale_wlg_nobackup/filesets/nobackup/uoo03396...,PMDException: Error while parsing /scale_wlg_n...,net.sourceforge.pmd.PMDException: Error while ...,Code_10000497_10000468_1606_0


In [53]:
# Number of rows held by this single partition.
len(one_pat_df)

2260

#### Get the unique classes dataframe

In [None]:
# Keep one row per distinct value of the 'class' column.
# NOTE(review): `pmd_codeviolations_df` is not defined in any cell visible in this
# notebook, and `one_pat_df` has no 'class' column, so this cell fails on a fresh
# kernel. The original comment said "get the unique error messages" - confirm which
# frame and column were intended.
class_one_pat_df_df = pmd_codeviolations_df['class'].drop_duplicates()

In [None]:
# Stray scratch input (`nn`) removed: it referenced an undefined name and raised NameError.

## Start a Dask cluster using SLURM jobs as workers

In [None]:
# SLURMCluster reference:
# http://jobqueue.dask.org/en/latest/generated/dask_jobqueue.SLURMCluster.html

# Switch off both worker memory thresholds so that data is never spilled to disk.
no_spill_settings = {
    "distributed.worker.memory.target": False,
    "distributed.worker.memory.spill": False,
}
dask.config.set(no_spill_settings)

# Resources requested for every SLURM job. Per the dask-jobqueue docs, `cores` and
# `memory` are per-job totals and `processes` splits the job into that many worker
# processes, so the 10 cores and 8 GiB are shared by two workers.
slurm_job_spec = dict(
    cores=10,
    processes=2,
    memory="8GiB",
    walltime="0-00:30",
    log_directory="../dask/logs",  # SLURM logs, one per job
    local_directory="../dask",  # scratch space for worker data
)
cluster = SLURMCluster(**slurm_job_spec)

Adaptively scale the cluster between 10 and 200 SLURM jobs and connect a client to be able to use them.

In [None]:
# Adaptive scaling: jobs are launched and killed automatically based on load,
# here between a minimum of 10 and a maximum of 200 SLURM jobs.
# (A fixed-size alternative would be `cluster.scale(n=20)`.)
cluster.adapt(minimum_jobs=10, maximum_jobs=200)
# Connect a client to the cluster; it is the last expression so the cell displays it.
client = Client(cluster)
client

In [None]:
# Count the rows of the full error dataframe (this triggers a computation).
# The variable keeps its `pmd_codeviolations_` prefix because later cells refer to it.
(pmd_codeviolations_init_len,) = dask.compute(pmd_codeerrors_df.shape[0])
print('Initial number of errors {}'.format(pmd_codeviolations_init_len))

In [None]:
# Count the rows of the violations dataframe (this triggers a computation).
# NOTE(review): `pmd_codeviolations_df` is not defined in any cell visible here, and no
# NaN-removal step on the 'class' column is visible either - confirm where both come from.
pmd_codeviolations_nonan_len =pmd_codeviolations_df.shape[0].compute()
print('Number of violations after removing the nans in the class column {}'.format(pmd_codeviolations_nonan_len))

In [None]:
# Difference between the initial count and the count after NaN removal.
print('Difference {}'.format(pmd_codeviolations_init_len - pmd_codeviolations_nonan_len))

In [None]:
# Keep only rows whose class name starts with the pattern Code_<n>_<n>_<n>_<n>,
# e.g. Code_10000096_9914015_1959_0.
# The pattern is a raw string so `\d` is not treated as an invalid string escape.
# `na=False` treats a missing class name as "no match", so such rows are dropped
# instead of putting NaN into the boolean mask.
pmd_codeviolations_df = pmd_codeviolations_df[
    pmd_codeviolations_df['class'].str.contains(r'^Code_\d+_\d+_\d+_\d+', regex=True, na=False)
]

In [None]:
# Count the rows that remain after dropping the unknown classes, and report that count.
# (The earlier version printed the pre-filter count `pmd_codeviolations_nonan_len`.)
pmd_codeviolations_unkownclass_len = pmd_codeviolations_df.shape[0].compute()
print('Number of violations after removing the onces caused by unknown class e.g. inner classes {}'.format(pmd_codeviolations_unkownclass_len))

In [None]:
# Number of rows removed by the unknown-class filter.
print('Difference {}'.format(pmd_codeviolations_nonan_len - pmd_codeviolations_unkownclass_len))

In [None]:
# Our generated classes (Code_<n>_<n>_<n>_<n>) do not follow the Java naming conventions,
# so ClassNamingConventions violations that mention them are an artefact of our naming.
# This selects the ClassNamingConventions rows whose text does NOT start with one of our
# class names - the reasoning being that such a row may come from a nested class written
# by the programmer that does not follow the naming convention.
# The pattern is a raw string so `\d` is not treated as an invalid string escape, and the
# negation is parenthesised to make the `~` / `&` precedence explicit.
pmd_codeviolations_pgmr_df = pmd_codeviolations_df[
    (~pmd_codeviolations_df['text'].str.contains(r'^Code_\d+_\d+_\d+_\d+', regex=True)) &
    pmd_codeviolations_df['rule'].str.contains('ClassNamingConventions')
]

In [None]:
# Count the rows selected as programmer-caused naming violations (triggers a computation).
pmd_codeviolations_pgmr_len = pmd_codeviolations_pgmr_df.shape[0].compute()
print('Number of violations caused by programmer {}'.format(pmd_codeviolations_pgmr_len))

In [None]:
# Rows left after the unknown-class filter minus the programmer-caused rows.
print('Difference {}'.format(pmd_codeviolations_unkownclass_len - pmd_codeviolations_pgmr_len))

In [None]:
# Drop the rows that match the programmer-caused naming-convention cases.
# The key columns of the lazy `pmd_codeviolations_pgmr_df` are materialised once
# (de-duplicated) instead of computing the same frame three times; the set of values
# per column, and therefore the result of each `isin`, is unchanged.
# NOTE(review): the three `isin` tests are independent per column, so this is not a
# row-wise anti-join on (class, rule, ruleset) - confirm that is the intended filter.
pgmr_keys = pmd_codeviolations_pgmr_df[['class', 'rule', 'ruleset']].drop_duplicates().compute()
pmd_codeviolations_df = pmd_codeviolations_df[~(
    pmd_codeviolations_df['class'].isin(pgmr_keys['class']) &
    pmd_codeviolations_df['rule'].isin(pgmr_keys['rule']) &
    pmd_codeviolations_df['ruleset'].isin(pgmr_keys['ruleset'])
)]

In [None]:
# Count the rows that remain after the naming-convention cases were removed.
# The message was copy-pasted from the programmer-caused count and described the wrong number.
pmd_codeviolations_len = pmd_codeviolations_df.shape[0].compute()
print('Number of violations after removing the naming convention cases {}'.format(pmd_codeviolations_len))

In [None]:
# Remaining row count minus the programmer-caused row count.
# NOTE(review): the first two 'Difference' cells subtract the current count from the
# previous one (rows removed by a step); this one does not - confirm the intended quantity.
print('Difference {}'.format(pmd_codeviolations_len- pmd_codeviolations_pgmr_len))

In [None]:
# Number of distinct (non-missing) class names among the remaining violations.
pmd_codeviolations_df['class'].nunique().compute()

In [None]:
# Collect the distinct class names; `.compute()` returns them as an in-memory pandas Series.
class_df = pmd_codeviolations_df['class'].drop_duplicates().compute()

In [None]:
# Number of distinct class names collected.
len(class_df)

In [None]:
# Save the distinct class names to a single CSV file.
# `class_df` is an already-computed pandas Series, so a `*` in the path is not expanded
# into per-partition file names (that is a dask `to_csv` feature) and would end up
# literally in the file name. A plain file name is used instead; it is still matched by
# the glob 'PMDViolationsClassNames*.csv'.
class_df.to_csv('pmdviolationsclassnames_csv/PMDViolationsClassNames.csv', sep=',', index=False)

In [None]:
# Group the remaining violations by class name and rule (lazy; nothing is computed yet).
grp_df = pmd_codeviolations_df.groupby(by=['class', 'rule'])

In [None]:
# First row of every (class, rule) group. This is presumably still lazy here, since
# `.compute()` is only called on it in the next cell.
grp_df.first()

In [None]:
# Compute the first row of every (class, rule) group and display the result.
grp_df.first().compute()