In [1]:
import time
from datetime import timedelta

import html
import re

import os
import ntpath

import subprocess as sp

import numpy as np
import pandas as pd
import xml.etree.ElementTree as et

import dask
import dask.dataframe as dd
import dask.bag as bd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

In [2]:
common_path = '../my_codesnippet_analysis'
xml_path = 'pmdcodesnippetsviolation_csv/pmdCodeSnippetsViolation*.csv'
xml_file = '{}/{}'.format(common_path, xml_path)

## Read csv files Java Answers

In [3]:
df = dd.read_csv(xml_file, engine='python', error_bad_lines=False, warn_bad_lines=False, dtype=object)

## Start a Dask cluster using SLURM jobs as workers

In [4]:
#http://jobqueue.dask.org/en/latest/generated/dask_jobqueue.SLURMCluster.html
dask.config.set(
    {
        "distributed.worker.memory.target": False,  # avoid spilling to disk
        "distributed.worker.memory.spill": False,  # avoid spilling to disk
    }
)
cluster = SLURMCluster(
    cores=10, #cores=24, # we set each job to have 1 Worker, each using 10 cores (threads) and 8 GB of memory
    processes=2,
    memory="8GiB",
    walltime="0-00:30",# walltime="0-00:50",
    log_directory="../dask/logs",  # folder for SLURM logs for each worker
    local_directory="../dask",  # folder for workers data
)

Spawn between 20 to 100 workers and connect a client to be able use them.

In [5]:
#cluster.scale(n=20) # ask for 20 jobs or workers
# This also works with adaptive clusters. This automatically launches and kill workers based on load.
# we tell our cluster to autoscale between 10 and 20 workers depending on the load
cluster.adapt(minimum_jobs=10, maximum_jobs=200)
#cluster.adapt(maximum_jobs=20)
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://192.168.94.140:39012  Dashboard: http://192.168.94.140:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [6]:
df

Unnamed: 0_level_0,beginline,endline,begincolumn,endcolumn,rule,ruleset,class,externalInfoUrl,priority,msg,method
npartitions=1143,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
,object,object,object,object,object,object,object,object,object,object,object
,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...


In [7]:
pmd_codeviolations_init_len = df.shape[0].compute()
print('Initial number of volations {}'.format(pmd_codeviolations_init_len))

Initial number of volations 5750467


In [8]:
# drop empty columns
df = df[~df['class'].isna()] # Drop rows that have NaN in the Code column

In [9]:
pmd_codeviolations_nonan_len =df.shape[0].compute()
print('Number of violations after removing the nans in the class column {}'.format(pmd_codeviolations_nonan_len))

Number of violations after removing the nans in the class column 5739333


In [10]:
print('Difference {}'.format(pmd_codeviolations_init_len - pmd_codeviolations_nonan_len))

Difference 11134


In [11]:
# drop classes that do not have this pattern e.g., Code_10000096_9914015_1959_0
#df = df[df['class'].str.contains('^Code_\d+_\d+_\d+_\d+', regex=True)]
df = df[df['class'].str.contains('^Code_\d+_\d+_\d+_\d+$', regex=True)]

In [12]:
pmd_codeviolations_unkownclass_len = df.shape[0].compute()
print('Number of violations after removing the onces caused by unknown class e.g. inner classes {}'.format(pmd_codeviolations_nonan_len))

Number of violations after removing the onces caused by unknown class e.g. inner classes 5739333


In [13]:
print('Difference {}'.format(pmd_codeviolations_nonan_len - pmd_codeviolations_unkownclass_len))

Difference 595960


#### Count the unique classes
- Classes that passed the pmd static analysis with violations

In [14]:
#count the unique classes
df.groupby('class').count().shape[0].compute()

846052

#### Get the unique classes dataframe

In [15]:
# get the unique classes
df = df['class'].drop_duplicates().compute()

In [17]:
# This also counts the unique classes
print('Java code snippets that passed the pmd static analysis with violations {}'.format(df.shape[0]))


Java code snippets that passed the pmd static analysis with violations 846052


In [18]:
#convert series to dataframe
df = pd.DataFrame(df)

In [19]:
df.columns = ['classname']

In [20]:
df.head()

Unnamed: 0,classname
0,Code_10000096_9914015_73_1
4,Code_1001005_1000723_311_0
27,Code_10027533_10027469_197_0
30,Code_10032837_10029678_3_2
31,Code_10059295_10055336_82_7


In [21]:
df['IdxM'] = df['classname'].str.extract(r'(\d+_\d+_\d+_\d+)', flags=0, expand=True)

In [22]:
df['Idx'] = df['classname'].str.extract(r'(\d+_\d+_\d+)', flags=0, expand=True)

In [23]:
df['match'] = df['classname'].str.extract(r'(\d+$)', flags=0, expand=True)

In [24]:
df.head()

Unnamed: 0,classname,IdxM,Idx,match
0,Code_10000096_9914015_73_1,10000096_9914015_73_1,10000096_9914015_73,1
4,Code_1001005_1000723_311_0,1001005_1000723_311_0,1001005_1000723_311,0
27,Code_10027533_10027469_197_0,10027533_10027469_197_0,10027533_10027469_197,0
30,Code_10032837_10029678_3_2,10032837_10029678_3_2,10032837_10029678_3,2
31,Code_10059295_10055336_82_7,10059295_10055336_82_7,10059295_10055336_82,7


### Make a folder in that directory

In [25]:
## Make a folder in that directory
folder = '{}/pmdviolationscodesnippetsnames_csv'.format(common_path)
# output: path/to/Post.csv => path/to
mkdir_cmd = 'mkdir {}'.format(folder)
cmd = sp.run(
    mkdir_cmd, # command
    capture_output=True,
    text=True,
    shell=True
)

### Save files in that directory

In [26]:
## Save files in that directory
filename = 'pmdViolationsCodeSnippetsNames'
file = '{}/{}*.csv'.format(folder, filename)
_ = df.to_csv(file, sep=',', index=False)