In [1]:
import time
from datetime import timedelta
import html

import pandas as pd
import dask
import dask.dataframe as dd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

## Read csv files Java Answers

In [2]:
df = dd.read_csv('pmdcodesnippetsviolation_csv/pmdCodeSnippetsViolation*.csv', engine='python', error_bad_lines=False, warn_bad_lines=False, dtype=object)


#### Explore dataframe

In [3]:
## explore one patition
#one_pat_df = df.partitions[1].compute()

In [4]:
df

Unnamed: 0_level_0,beginline,endline,begincolumn,endcolumn,rule,ruleset,class,externalInfoUrl,priority,msg,method
npartitions=577,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
,object,object,object,object,object,object,object,object,object,object,object
,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...


## Start a Dask cluster using SLURM jobs as workers

In [5]:
#http://jobqueue.dask.org/en/latest/generated/dask_jobqueue.SLURMCluster.html
dask.config.set(
    {
        "distributed.worker.memory.target": False,  # avoid spilling to disk
        "distributed.worker.memory.spill": False,  # avoid spilling to disk
    }
)
cluster = SLURMCluster(
    cores=10, #cores=24, # we set each job to have 1 Worker, each using 10 cores (threads) and 8 GB of memory
    processes=2,
    memory="8GiB",
    walltime="0-00:30",# walltime="0-00:50",
    log_directory="../dask/logs",  # folder for SLURM logs for each worker
    local_directory="../dask",  # folder for workers data
)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 36861 instead


Spawn between 20 to 100 workers and connect a client to be able use them.

In [6]:
#cluster.scale(n=20) # ask for 20 jobs or workers
# This also works with adaptive clusters. This automatically launches and kill workers based on load.
# we tell our cluster to autoscale between 10 and 20 workers depending on the load
cluster.adapt(minimum_jobs=10, maximum_jobs=200)
#cluster.adapt(maximum_jobs=20)
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://192.168.94.156:33516  Dashboard: http://192.168.94.156:36861/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [7]:
pmd_codeviolations_init_len = df.shape[0].compute()
print('Initial number of volations {}'.format(pmd_codeviolations_init_len))

Initial number of volations 221495


In [8]:
# drop empty columns
df = df[~df['class'].isna()] # Drop rows that have NaN in the Code column

In [9]:
pmd_codeviolations_nonan_len =df.shape[0].compute()
print('Number of violations after removing the nans in the class column {}'.format(pmd_codeviolations_nonan_len))

Number of violations after removing the nans in the class column 214049


In [10]:
print('Difference {}'.format(pmd_codeviolations_init_len - pmd_codeviolations_nonan_len))

Difference 7446


In [11]:
# drop classes that do not have this pattern e.g., Code_10000096_9914015_1959_0	
df = df[df['class'].str.contains('^Code_\d+_\d+_\d+_\d+$', regex=True)]

In [12]:
pmd_codeviolations_unkownclass_len = df.shape[0].compute()
print('Number of violations after removing the onces caused by unknown class e.g. inner classes {}'.format(pmd_codeviolations_nonan_len))

Number of violations after removing the onces caused by unknown class e.g. inner classes 214049


In [13]:
print('Difference {}'.format(pmd_codeviolations_nonan_len - pmd_codeviolations_unkownclass_len))

Difference 23308


In [None]:
# We did not name our classes following the naming convensions 
# hence we will remove all the records that have Our class name mentioned in the ClassNamingConensions 
# - The logic for using this is that the class may contain contain a subclass that is not following the naming convension
pgmr_vi_df = df[
    ~df['text'].str.contains('^Code_\d+_\d+_\d+_\d+$', regex=True) & 
    df['rule'].str.contains('ClassNamingConventions')
]

In [None]:
pmd_codeviolations_pgmr_len = pgmr_vi_df.shape[0].compute()
print('Number of violations caused by programmer {}'.format(pmd_codeviolations_pgmr_len))

In [None]:
print('Difference {}'.format(pmd_codeviolations_unkownclass_len - pmd_codeviolations_pgmr_len))

In [None]:
# Get all the dataset that does not apply to our naming convension cases
#df = df[~(df['class'].isin(df2['class'].compute())&df['rule'].isin(df2['rule'].compute())&df['ruleset'].isin(df2['ruleset'].compute()))]
df = df[~(
    df['class'].isin(pgmr_vi_df['class'].compute()) & 
    df['rule'].isin(pgmr_vi_df['rule'].compute()) & 
    df['ruleset'].isin(pgmr_vi_df['ruleset'].compute())
)]

In [None]:
pmd_codeviolations_len = df.shape[0].compute()
print('Number of violations caused by programmer {}'.format(pmd_codeviolations_len))

In [None]:
print('Difference {}'.format(pmd_codeviolations_len- pmd_codeviolations_pgmr_len))

In [None]:
df.head()

#### Count the unique classes

In [None]:
#count the unique classes
df.groupby('class').count().shape[0].compute()

#### Get the unique classes dataframe

In [None]:
# get the unique classes
class_df = df['class'].drop_duplicates().compute()

In [None]:
# This also counts the unique classes
class_df.shape[0]

In [None]:
#convert series to dataframe
class_df = pd.DataFrame(class_df)

In [None]:
class_df.columns = ['classname']

In [None]:
class_df.head()

In [None]:
class_df['IdxM'] = class_df['classname'].str.extract(r'(\d+_\d+_\d+_\d+)', flags=0, expand=True)

In [None]:
class_df['Idx'] = class_df['classname'].str.extract(r'(\d+_\d+_\d+)', flags=0, expand=True)

In [None]:
class_df['match'] = class_df['classname'].str.extract(r'(\d+$)', flags=0, expand=True)

In [None]:
class_df.head()

In [None]:
class_df.tail()

## Save the unique classes into a dataframe

In [None]:
#save it in a csv file
#class_df.to_csv('pmdviolationsfilenames_csv/PMDViolationsFilenames1.csv', sep=',', index=False)