In [32]:
import time
from datetime import timedelta
import html
import subprocess as sp

import pandas as pd
import dask
import dask.dataframe as dd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

## Read csv error files

In [2]:
# Base folder of the analysis data (relative path).
common_path = '../my_codesnippet_analysis'
#xml_path = 'pmdcodesnippetserrors_csv/PMDJavaCodeSnippetsErrors*.csv'
xml_path = 'pmdcodesnippetserror_csv/pmdCodeSnippetsError*.csv'
# Despite the xml_* names, this is a glob pattern over CSV files.
xml_file = '/'.join([common_path, xml_path])
# Lazily read every matching CSV with all values kept as objects (strings);
# malformed lines are skipped without raising or warning.
ddf = dd.read_csv(
    xml_file,
    engine='python',
    error_bad_lines=False,
    warn_bad_lines=False,
    dtype=object,
)


#### Explore dataframe

In [3]:
## explore one partition
#one_pat_df = df.partitions[1].compute()

In [4]:
# see dataframe structure
ddf

Unnamed: 0_level_0,0
npartitions=1154,Unnamed: 1_level_1
,object
,...
...,...
,...
,...


In [5]:
# see columns
ddf.columns

Index(['0'], dtype='object')

In [6]:
# rename columns
# The frame has a single column named '0' (see the ddf.columns output above);
# assigning to .columns renames it positionally to 'filepath'.
#df.columns = ['filepath', 'error_msg', 'error_txt'] 
ddf.columns = ['filepath'] 

In [7]:
# see columns
ddf.columns

Index(['filepath'], dtype='object')

In [8]:
#ddf.head()

In [9]:
#ddf.tail()

## Start a Dask cluster using SLURM jobs as workers

In [10]:
#http://jobqueue.dask.org/en/latest/generated/dask_jobqueue.SLURMCluster.html
# Disable the worker memory "target" and "spill" thresholds so that workers
# keep their data in memory instead of writing it to disk.
dask.config.set(
    {
        "distributed.worker.memory.target": False,  # avoid spilling to disk
        "distributed.worker.memory.spill": False,  # avoid spilling to disk
    }
)
# Per the SLURMCluster docs linked above, `cores` and `memory` are totals per
# SLURM job and `processes` cuts each job into that many worker processes.
# With the values below every job therefore runs 2 workers (not 1) that
# together get 10 cores (threads) and 8 GiB of memory.
cluster = SLURMCluster(
    cores=10, #cores=24, # total cores (threads) per job, shared by the job's workers
    processes=2,  # worker processes per job
    memory="8GiB",  # total memory per job
    walltime="0-00:30",# walltime="0-00:50",
    log_directory="../dask/logs",  # folder for SLURM logs for each worker
    local_directory="../dask",  # folder for workers data
)

Spawn between 5 and 10 SLURM jobs (2 worker processes each, i.e. 10 to 20 workers) and connect a client to be able to use them.

In [11]:
#cluster.scale(n=20) # ask for 20 jobs or workers
# This also works with adaptive clusters, which automatically launch and kill workers based on load.
# Autoscale between 5 and 10 SLURM jobs depending on the load; the cluster above is
# configured with processes=2, so that is between 10 and 20 workers.
cluster.adapt(minimum_jobs=5, maximum_jobs=10)
#cluster.adapt(maximum_jobs=20)
# Connect a client to the cluster; the bare `client` expression displays its summary.
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://192.168.94.167:46156  Dashboard: http://192.168.94.167:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [12]:
# Row count of the raw error frame; .compute() triggers the actual read of the CSV files.
pmd_codeerror_init_len = ddf.shape[0].compute()
print('Initial number of errors {}'.format(pmd_codeerror_init_len))

Initial number of errors 1444299


In [13]:
# Keep only the rows whose 'filepath' value is present (drops rows with NaN there).
has_filepath = ddf['filepath'].notna()
ddf = ddf[has_filepath]

In [14]:
# Count the rows left after the NaN filter and report that count: the value
# printed must be the one computed here, not pmd_codeerror_init_len.
pmd_codeviolations_nonan_len = ddf.shape[0].compute()
print('Number of errors after removing the nans in the class column {}'.format(pmd_codeviolations_nonan_len))

Number of errors after removing the nans in the class column 1444299


## Extract just the code snippet names into a new dataframe

In [15]:
# regex to extract filename from file path
#https://stackoverflow.com/questions/58181729/how-to-extract-the-filename-from-a-string-using-regular-expression
#df['filename'] = df.filepath.str.extract(r'([^\/]+(?=\.))', flags=0, expand=True).compute()
#df['filename'] = df.filepath.str.extract(r'(([^\/]+)\.)', flags=0, expand=True).compute()
#df = df.filepath.str.extract(r'(([^\/]+)\.)', flags=0, expand=True).compute()

In [16]:
ddf.head()

Unnamed: 0,filepath
0,net.sourceforge.pmd.PMDException: Error while ...
1,net.sourceforge.pmd.PMDException: Error while ...
2,net.sourceforge.pmd.PMDException: Error while ...
3,net.sourceforge.pmd.PMDException: Error while ...
4,net.sourceforge.pmd.PMDException: Error while ...


In [17]:
#ddf = df[[1]]

In [18]:
ddf.head()

Unnamed: 0,filepath
0,net.sourceforge.pmd.PMDException: Error while ...
1,net.sourceforge.pmd.PMDException: Error while ...
2,net.sourceforge.pmd.PMDException: Error while ...
3,net.sourceforge.pmd.PMDException: Error while ...
4,net.sourceforge.pmd.PMDException: Error while ...


In [19]:
# The 'filepath' column holds PMD exception messages (see ddf.head() above).
# Pull the first 'Code_<n>_<n>_<n>_<n>' token out of each message. With
# expand=True the result is a one-column frame (column 0) that replaces ddf,
# so the original messages are no longer available afterwards.
#ddf= ddf['filepath'].str.extract(r'(\d+_\d+_\d+_\d+)', flags=0, expand=True)
ddf= ddf['filepath'].str.extract(r'(Code_\d+_\d+_\d+_\d+)', flags=0, expand=True)

In [20]:
ddf.head()

Unnamed: 0,0
0,Code_10022570_10022548_382_1
1,Code_10051644_10051084_113_0
2,Code_10059662_9994872_109_3
3,Code_10104389_10103834_274_0
4,Code_10109082_10109035_54_0


In [21]:
# rename columns
# The extracted frame has a single column named 0 (see ddf.head() above);
# rename it positionally to 'codesnippetsname'.
ddf.columns = ['codesnippetsname'] 

In [23]:
# Use this if ddf is still a Dask dataframe
def extract_info_4m_df(df):
    """Derive identifier columns from the code-snippet names of one partition.

    Meant to be applied to each pandas partition, e.g. through
    ``ddf.map_partitions(extract_info_4m_df)``. The input frame is left
    untouched: a new frame is returned, so the partition handed in by the
    caller is never mutated in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``codesnippetsname`` holding names such as
        ``Code_10022570_10022548_382_1``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with three extra columns:

        * ``IdxM``  - first run of four underscore-separated numbers,
          e.g. ``10022570_10022548_382_1``
        * ``Idx``   - first run of three underscore-separated numbers,
          e.g. ``10022570_10022548_382``
        * ``match`` - the digits at the very end of the name, e.g. ``1``

        Names that do not match a pattern get NaN in the respective column.
    """
    names = df['codesnippetsname'].str
    # expand=False yields a Series per pattern (one capture group each), which
    # is the natural value for a single new column.
    return df.assign(
        IdxM=names.extract(r'(\d+_\d+_\d+_\d+)', flags=0, expand=False),
        Idx=names.extract(r'(\d+_\d+_\d+)', flags=0, expand=False),
        match=names.extract(r'(\d+$)', flags=0, expand=False),
    )

# Lazily apply the per-partition extraction to every partition.
ddf = ddf.map_partitions(extract_info_4m_df)
# persist() does not modify the collection in place: it returns a new one
# backed by results held on the cluster. Rebind ddf to it so the following
# cells reuse those results instead of recomputing from the CSV files.
ddf = ddf.persist()
# Display the (persisted) dataframe structure as the cell output.
ddf

Unnamed: 0_level_0,codesnippetsname,IdxM,Idx,match
npartitions=1154,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,object,object,object,object
,...,...,...,...
...,...,...,...,...
,...,...,...,...
,...,...,...,...


In [27]:
ddf.head()

Unnamed: 0,codesnippetsname,IdxM,Idx,match
0,Code_10022570_10022548_382_1,10022570_10022548_382_1,10022570_10022548_382,1
1,Code_10051644_10051084_113_0,10051644_10051084_113_0,10051644_10051084_113,0
2,Code_10059662_9994872_109_3,10059662_9994872_109_3,10059662_9994872_109,3
3,Code_10104389_10103834_274_0,10104389_10103834_274_0,10104389_10103834_274,0
4,Code_10109082_10109035_54_0,10109082_10109035_54_0,10109082_10109035_54,0


In [28]:
ddf.shape[0].compute()

1444299

#### Get the unique code snippets dataframe

In [29]:
# Keep one row per distinct code snippet. drop_duplicates() compares all
# columns, but IdxM, Idx and match are all derived from codesnippetsname, so
# this is the same as de-duplicating on the snippet name.
ddf = ddf.drop_duplicates()

In [30]:
# Number of distinct code snippets left after de-duplication (triggers a computation).
unique_error_snippets_len = ddf.shape[0].compute()
print('Java code snippets that did not passed the pmd static analysis due to fatal errors {}'.format(unique_error_snippets_len))

Java code snippets that did not passed the pmd static analysis due to fatal errors 1444164


### Make a folder in that directory

In [33]:
## Make a folder in that directory
# Output folder for the code-snippet-name CSVs, created under common_path.
folder = '{}/pmderrorcodesnippetsnames_csv'.format(common_path)
# Pass the command as an argument list (no shell), so the path is never
# re-parsed by a shell. `-p` creates missing parents and succeeds when the
# folder already exists, which keeps this cell re-runnable.
mkdir_cmd = ['mkdir', '-p', folder]
cmd = sp.run(
    mkdir_cmd,  # command
    capture_output=True,
    text=True,
)
# Surface a failure instead of silently ignoring it; the captured stderr
# explains why the folder could not be created.
if cmd.returncode != 0:
    print('Could not create folder {}: {}'.format(folder, cmd.stderr.strip()))

### Save files in that directory

In [34]:
## Save files in that directory
# Base name of the output files; the '*' in the pattern below is filled in by
# dask for each partition, so one CSV is written per partition.
filename = 'pmdErrorCodeSnippetsNames'
file = folder + '/' + filename + '*.csv'
# Assign the return value to `_` so the cell does not echo it.
_ = ddf.to_csv(
    file,
    sep=',',
    index=False,
)