In [1]:
import time
from datetime import timedelta
import html

import pandas as pd
import dask
import dask.dataframe as dd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

## Read the CSV error files

In [2]:
# Lazily read every PMD error CSV shard into one Dask DataFrame.
# All columns are kept as plain objects (dtype=object); malformed lines are
# skipped without a warning.
# NOTE(review): error_bad_lines / warn_bad_lines are deprecated since pandas 1.3
# (replacement: on_bad_lines='skip') — switch once the pinned pandas allows it.
df = dd.read_csv(
    'pmdcodesnippetserrors_csv/PMDJavaCodeSnippetsErrors*.csv',
    dtype=object,
    engine='python',
    error_bad_lines=False,
    warn_bad_lines=False,
)


#### Explore dataframe

In [3]:
## explore one partition
#one_pat_df = df.partitions[1].compute()

In [4]:
# Show the Dask DataFrame repr: it lists the column names, their dtypes and
# the number of partitions rather than row values.
df

Unnamed: 0_level_0,filename,msg,error_text
npartitions=356,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,object,object,object
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


In [5]:
# List the current column names (before they are renamed)
df.columns

Index(['filename', 'msg', 'error_text'], dtype='object')

In [6]:
# Rename the three columns positionally:
#   filename -> filepath, msg -> error_msg, error_text -> error_txt
renamed_columns = ['filepath', 'error_msg', 'error_txt']
df.columns = renamed_columns

In [7]:
# Confirm that the new column names are in place
df.columns

Index(['filepath', 'error_msg', 'error_txt'], dtype='object')

In [8]:
#df.head()

In [9]:
#df.tail()

## Start a Dask cluster using SLURM jobs as workers

In [10]:
# SLURMCluster reference:
# http://jobqueue.dask.org/en/latest/generated/dask_jobqueue.SLURMCluster.html

# Disable both memory thresholds so workers never spill data to disk.
no_spill_settings = {
    "distributed.worker.memory.target": False,
    "distributed.worker.memory.spill": False,
}
dask.config.set(no_spill_settings)

# Resources requested per SLURM job. `cores` and `memory` are totals for the
# whole job and are shared by the `processes` worker processes, so each job
# runs 2 workers with 5 threads and 4 GiB each.
# Alternatives kept for reference: cores=24, walltime="0-00:50".
cluster = SLURMCluster(
    cores=10,
    processes=2,
    memory="8GiB",
    walltime="0-00:30",
    log_directory="../dask/logs",  # folder for the SLURM logs of each worker job
    local_directory="../dask",  # folder for the workers' data
)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 40877 instead


Spawn between 5 and 10 SLURM jobs (10 to 20 worker processes, since each job runs 2 processes) and connect a client to be able to use them.

In [11]:
# Fixed-size alternative: cluster.scale(n=20)  # ask for 20 jobs or workers
# Adaptive scaling launches and kills workers automatically based on load.
# Here the cluster autoscales between 5 and 10 SLURM jobs; with processes=2
# in the cluster definition that corresponds to 10-20 worker processes.
cluster.adapt(minimum_jobs=5, maximum_jobs=10)
# Alternative without a lower bound: cluster.adapt(maximum_jobs=20)
# Connect a client to the cluster and display its summary.
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://192.168.94.166:45566  Dashboard: http://192.168.94.166:40877/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [12]:
# Count the rows over all partitions; this triggers a Dask computation.
pmd_codeerror_init_len = len(df)
print('Initial number of errors {}'.format(pmd_codeerror_init_len))

Initial number of errors 1217373


In [13]:
# Keep only the rows whose 'filepath' value is present (drop NaN filepaths).
has_filepath = ~df['filepath'].isna()
df = df[has_filepath]

In [14]:
# Recount the rows after the NaN filter and report that new count
# (pmd_codeviolations_nonan_len), not the initial one, so dropped rows show up.
pmd_codeviolations_nonan_len = df.shape[0].compute()
print('Number of errors after removing the nans in the filepath column {}'.format(pmd_codeviolations_nonan_len))

Number of errors after removing the nans in the class column 1217373


## Extract just the file names into a new column

In [15]:
# Extract the file name (without its extension) from every file path.
# Regex source:
# https://stackoverflow.com/questions/58181729/how-to-extract-the-filename-from-a-string-using-regular-expression
# The pattern has two capture groups, so the result has two columns:
#   0 -> the name followed by the dot, 1 -> the name only.
# .compute() materialises the result as a pandas DataFrame.
filename_pattern = r'(([^\/]+)\.)'
df2 = df['filepath'].str.extract(filename_pattern, flags=0, expand=True).compute()

In [16]:
# Preview the two columns produced by the two capture groups
df2.head()

Unnamed: 0,0,1
0,Code_10000052_9999270_706_1.,Code_10000052_9999270_706_1
0,Code_10000056_9999581_707_0.,Code_10000056_9999581_707_0
1,Code_10000056_9999581_707_1.,Code_10000056_9999581_707_1
2,Code_10000227_10000008_264_2.,Code_10000227_10000008_264_2
3,Code_10000284_9996647_708_0.,Code_10000284_9996647_708_0


In [17]:
# Keep only capture group 1 (the bare name, without the trailing dot).
# .copy() makes df2 an independent frame, so the column assignments in the
# following cells do not act on a slice and raise SettingWithCopyWarning.
df2 = df2[[1]].copy()

In [18]:
# Preview the single remaining column
df2.head()

Unnamed: 0,1
0,Code_10000052_9999270_706_1
0,Code_10000056_9999581_707_0
1,Code_10000056_9999581_707_1
2,Code_10000227_10000008_264_2
3,Code_10000284_9996647_708_0


In [19]:
# Give the remaining column (previously labelled 1) a descriptive name
df2.columns = ['classname']

In [20]:
# IdxM: the first run of four underscore-separated numbers in the class name
idxm_pattern = r'(\d+_\d+_\d+_\d+)'
df2['IdxM'] = df2['classname'].str.extract(idxm_pattern, flags=0, expand=True)

In [21]:
# Idx: the first run of three underscore-separated numbers in the class name
idx_pattern = r'(\d+_\d+_\d+)'
df2['Idx'] = df2['classname'].str.extract(idx_pattern, flags=0, expand=True)

In [22]:
# match: the digits at the very end of the class name
match_pattern = r'(\d+$)'
df2['match'] = df2['classname'].str.extract(match_pattern, flags=0, expand=True)

In [23]:
# Check the derived IdxM / Idx / match columns on the first rows
df2.head()

Unnamed: 0,classname,IdxM,Idx,match
0,Code_10000052_9999270_706_1,10000052_9999270_706_1,10000052_9999270_706,1
0,Code_10000056_9999581_707_0,10000056_9999581_707_0,10000056_9999581_707,0
1,Code_10000056_9999581_707_1,10000056_9999581_707_1,10000056_9999581_707,1
2,Code_10000227_10000008_264_2,10000227_10000008_264_2,10000227_10000008_264,2
3,Code_10000284_9996647_708_0,10000284_9996647_708_0,10000284_9996647_708,0


In [24]:
# Check the derived IdxM / Idx / match columns on the last rows
df2.tail()

Unnamed: 0,classname,IdxM,Idx,match
3346,Code_9999698_9981080_1096_0,9999698_9981080_1096_0,9999698_9981080_1096,0
3347,Code_9999779_9999743_75_0,9999779_9999743_75_0,9999779_9999743_75,0
3348,Code_9999843_9999539_1366_0,9999843_9999539_1366_0,9999843_9999539_1366,0
3349,Code_9999951_9999782_703_4,9999951_9999782_703_4,9999951_9999782_703,4
3350,Code_9999966_9999642_704_0,9999966_9999642_704_0,9999966_9999642_704,0


In [25]:
# Row count of the frame holding the extracted class names. df2 is a pandas
# DataFrame at this point, so no .compute() is needed.
# The message names this step rather than repeating the NaN-filter text.
pmd_codeerror_len = df2.shape[0]
print('Number of errors after extracting the class names {}'.format(pmd_codeerror_len))

Number of errors after removing the nans in the class column 1217373


In [26]:
# Keep only the class names that follow the expected pattern,
# e.g. Code_10000096_9914015_1959_0; everything else is dropped.
# - raw string: '\d' in a normal string literal is an invalid escape sequence.
# - na=False: a NaN class name (nothing extracted) counts as "no match", so the
#   mask stays purely boolean instead of raising when used for indexing.
classname_pattern = r'^Code_\d+_\d+_\d+_\d+$'
df2 = df2[df2['classname'].str.contains(classname_pattern, regex=True, na=False)]

In [27]:
# Row count after the class-name pattern filter
df2.shape[0]

1217373

In [28]:
# Same row-count check as the previous cell (no change in between)
df2.shape[0]

1217373

#### Get the unique classes dataframe

In [29]:
# Collapse repeated rows so that each class appears once. Every other column
# is derived from 'classname', so de-duplicating whole rows gives the same
# rows as de-duplicating on 'classname' alone.
df2 = df2.drop_duplicates(keep='first')

In [30]:
# Row count after removing duplicate rows
df2.shape[0]

1217373

In [31]:
# Write the de-duplicated class names to a comma-separated file, leaving out
# the index column.
output_csv = 'pmderrorsfilenames_csv/PMDErrorsFilenames1.csv'
df2.to_csv(output_csv, sep=',', index=False)