In [1]:
import time
from datetime import timedelta
import html

import pandas as pd
import dask
import dask.dataframe as dd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

## Read the complete code-snippet CSV files

In [2]:
# Lazily read every Java code-snippet CSV shard into a single Dask dataframe.
# Every column is loaded as a string (dtype=object); rows the parser cannot
# handle are dropped without a warning.
# NOTE(review): error_bad_lines / warn_bad_lines were deprecated in pandas 1.3
# in favour of on_bad_lines="skip" -- revisit when upgrading pandas.
cs_df = dd.read_csv(
    'codesnippets_csv/JavaCodeSnippets*.csv',
    engine='python',
    error_bad_lines=False,
    warn_bad_lines=False,
    dtype=object,
)


In [3]:
# Lazily read the PMD-errors file-name CSV shards into a Dask dataframe.
# Every column is loaded as a string (dtype=object); rows the parser cannot
# handle are dropped without a warning.
# NOTE(review): error_bad_lines / warn_bad_lines were deprecated in pandas 1.3
# in favour of on_bad_lines="skip" -- revisit when upgrading pandas.
er_df = dd.read_csv(
    'pmderrorsfilenames_csv/PMDErrorsFilenames*.csv',
    engine='python',
    error_bad_lines=False,
    warn_bad_lines=False,
    dtype=object,
)


In [4]:
#vo_df = dd.read_csv('pmdviolationsfilenames_csv/PMDViolationsFilenames*.csv', engine='python', error_bad_lines=False, warn_bad_lines=False, dtype=object)


In [5]:
# Show the lazy Dask dataframe: only the column names, dtypes and partition
# count are known here; no rows have been read yet.
cs_df

Unnamed: 0_level_0,Idx,match,Code
npartitions=2530,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,object,object,object
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


In [6]:
# Show the lazy PMD-errors dataframe: only the column names, dtypes and
# partition count are known here; no rows have been read yet.
er_df

Unnamed: 0_level_0,classname,IdxM,Idx,match
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,object,object,object,object
,...,...,...,...
,...,...,...,...


In [7]:
# Column names of the code-snippets dataframe.
cs_df.columns

Index(['Idx', 'match', 'Code'], dtype='object')

In [8]:
# Column names of the PMD-errors dataframe.
er_df.columns

Index(['classname', 'IdxM', 'Idx', 'match'], dtype='object')

In [9]:
# Every column is `object` because dtype=object was passed to read_csv.
cs_df.dtypes

Idx      object
match    object
Code     object
dtype: object

In [10]:
# Every column is `object` because dtype=object was passed to read_csv.
er_df.dtypes

classname    object
IdxM         object
Idx          object
match        object
dtype: object

## Start a Dask cluster using SLURM jobs as workers

In [11]:
# SLURMCluster reference:
# http://jobqueue.dask.org/en/latest/generated/dask_jobqueue.SLURMCluster.html
dask.config.set(
    {
        "distributed.worker.memory.target": False,  # avoid spilling to disk
        "distributed.worker.memory.spill": False,  # avoid spilling to disk
    }
)
# Per the dask-jobqueue docs, `cores` and `memory` are totals per SLURM job and
# `processes` cuts the job into that many worker processes. With the values
# below each job therefore runs 2 workers that share 10 cores and 8 GiB.
cluster = SLURMCluster(
    cores=10, # total cores (threads) per job; cores=24 was tried previously
    processes=2,
    memory="8GiB",
    walltime="0-00:30",# per-job time limit; walltime="0-00:50" was tried previously
    log_directory="../dask/logs",  # folder for SLURM logs for each worker
    local_directory="../dask",  # folder for workers data
)

Autoscale between 20 and 200 SLURM jobs and connect a client to be able to use them.

In [12]:
# Alternative (disabled): request a fixed number of jobs instead of autoscaling.
#cluster.scale(n=20) # ask for 20 jobs
# Adaptive scaling automatically launches and kills workers based on load.
# Here the cluster autoscales between 20 and 200 SLURM jobs.
cluster.adapt(minimum_jobs=20, maximum_jobs=200)
#cluster.adapt(maximum_jobs=20)
client = Client(cluster)
# Display the client summary (scheduler address, dashboard link, worker count).
# NOTE(review): the saved output shows 0 workers -- presumably no SLURM job had
# started yet when the cell was run.
client

0,1
Client  Scheduler: tcp://192.168.94.166:34236  Dashboard: http://192.168.94.166:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [13]:
# Materialise a single partition as an in-memory pandas DataFrame to prototype on.
# NOTE(review): index 600 looks like an arbitrary pick -- confirm; it must stay
# below cs_df.npartitions.
one_pat_df = cs_df.partitions[600].compute()

In [14]:
# Peek at the first rows of the sample partition.
one_pat_df.head()

Unnamed: 0,Idx,match,Code
0,34451727_34434556_1812,0,import java.util.regex.Matcher;\nimport java.u...
1,34452152_33726321_1824,0,import java.io.ByteArrayInputStream;\nimport j...
2,34452362_34452207_1231,7,public class Code_34452362_34452207_1231_7 {\n...
3,34452718_34452678_1233,0,class Code_34452718_34452678_1233_0 {\n dou...
4,34452813_13750010_282,0,package com.rest.jersey.jerseyclient;\nimport ...


In [15]:
# Peek at the last rows of the sample partition.
one_pat_df.tail()

Unnamed: 0,Idx,match,Code
258,34487718_34328874_746,1,public class Code_34487718_34328874_746_1 exte...
259,3448776_3448030_3068,3,import java.awt.*;\nimport java.awt.event.*;\n...
260,34487791_34464728_397,1,public class Code_34487791_34464728_397_1 {\n ...
261,3448807_3445353_835,1,import java.util.ArrayList;\n\nclass Code_3448...
262,34488536_34487494_1323,3,public class Code_34488536_34487494_1323_3\n{\...


In [16]:
# Prototype the composite key on the sample partition: "<Idx>_<match>".
match_text = one_pat_df['match'].astype(str)
one_pat_df['IdxM'] = one_pat_df['Idx'].str.cat(match_text, sep="_")

In [17]:
# Confirm that the IdxM column now appears on the sample partition.
one_pat_df.head()

Unnamed: 0,Idx,match,Code,IdxM
0,34451727_34434556_1812,0,import java.util.regex.Matcher;\nimport java.u...,34451727_34434556_1812_0
1,34452152_33726321_1824,0,import java.io.ByteArrayInputStream;\nimport j...,34452152_33726321_1824_0
2,34452362_34452207_1231,7,public class Code_34452362_34452207_1231_7 {\n...,34452362_34452207_1231_7
3,34452718_34452678_1233,0,class Code_34452718_34452678_1233_0 {\n dou...,34452718_34452678_1233_0
4,34452813_13750010_282,0,package com.rest.jersey.jerseyclient;\nimport ...,34452813_13750010_282_0


### Generate the IdxM unique column
- `IdxM` is the `Idx` and `match` values joined with an underscore.
- It also forms the class name (`Code_<IdxM>`).

In [18]:
# Build the IdxM key ("<Idx>_<match>") lazily on the full Dask dataframe.
cs_df['IdxM'] = cs_df['Idx'].str.cat(
    cs_df['match'].astype(str),
    sep="_",
)

In [19]:
# Preview the PMD-errors table; it already carries an IdxM column to match against.
er_df.head()

Unnamed: 0,classname,IdxM,Idx,match
0,Code_10000052_9999270_706_1,10000052_9999270_706_1,10000052_9999270_706,1
1,Code_10000056_9999581_707_0,10000056_9999581_707_0,10000056_9999581_707,0
2,Code_10000056_9999581_707_1,10000056_9999581_707_1,10000056_9999581_707,1
3,Code_10000227_10000008_264_2,10000227_10000008_264_2,10000227_10000008_264,2
4,Code_10000284_9996647_708_0,10000284_9996647_708_0,10000284_9996647_708,0


In [20]:
# Count the rows of the snippets dataframe; .compute() triggers the actual read.
cs_df.shape[0].compute()

1953970

In [21]:
# Count the rows of the PMD-errors dataframe; .compute() triggers the actual read.
er_df.shape[0].compute()

1217373

In [22]:
# Keep only the snippets whose IdxM key is absent from the PMD-errors table.
# The error keys are materialised with .compute() before the membership test.
has_pmd_error = cs_df['IdxM'].isin(er_df['IdxM'].compute())
df = cs_df.loc[~has_pmd_error]

In [23]:
# Count the snippets that remain after dropping those listed in the PMD-errors table.
df.shape[0].compute()

736597

In [24]:
# Sanity check: snippet rows minus PMD-error rows should equal the filtered row
# count. The two numbers are copied by hand from the row-count outputs of cs_df
# and er_df and must be updated if the input data changes.
1953970 - 1217373

736597

In [25]:
# Preview the filtered snippets.
df.head()

Unnamed: 0,Idx,match,Code,IdxM
0,10000096_9914015_1959,0,public class Code_10000096_9914015_1959_0 {\n\...,10000096_9914015_1959_0
1,10000096_9914015_1959,1,@XmlRootElement\npublic class Code_10000096_99...,10000096_9914015_1959_1
2,10000160_9999942_263,2,import SomeClassName;\n,10000160_9999942_263_2
3,10000439_9999961_2190,4,public class Code_10000439_9999961_2190_4\n{\n...,10000439_9999961_2190_4
4,10001108_10000983_1098,0,class Code_10001108_10000983_1098_0\n{\n pr...,10001108_10000983_1098_0


In [26]:
#tt = df[df.IdxM=='17170546_17166918_4765_1'].compute()

In [27]:
#tt

In [28]:
#tt['Code']

## Save the code snippets that are not in the PMD-errors table into CSV files

In [29]:
# Persist the filtered snippets (those absent from the PMD-errors table) as CSV.
# Dask writes one file per partition, replacing '*' in the path with the
# partition number. Binding the result to `_` keeps it out of the cell output.
_ = df.to_csv(
    'pmdpasscodesnippets_csv/JavaCodeSnippets*.csv',
    sep=',',
    index=False,
)

## Write each record as a .java file into the pmdpasscodesnippets_java directory
- A separate MPI program performs this task more efficiently:
    - write_java_files_MPI.py