In [1]:
import time
from datetime import timedelta
import html

import pandas as pd
import dask
import dask.dataframe as dd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

## Read csv files Java Answers

In [2]:
df = dd.read_csv('pmdcodesnippetsviolations_csv/PMDJavaCodeSnippetsViolations*.csv', engine='python', error_bad_lines=False, warn_bad_lines=False, dtype=object)


## Start a Dask cluster using SLURM jobs as workers

In [3]:
#http://jobqueue.dask.org/en/latest/generated/dask_jobqueue.SLURMCluster.html
dask.config.set(
    {
        "distributed.worker.memory.target": False,  # avoid spilling to disk
        "distributed.worker.memory.spill": False,  # avoid spilling to disk
    }
)
cluster = SLURMCluster(
    cores=10, #cores=24, # we set each job to have 1 Worker, each using 10 cores (threads) and 8 GB of memory
    processes=2,
    memory="8GiB",
    walltime="0-00:30",# walltime="0-00:50",
    log_directory="../dask/logs",  # folder for SLURM logs for each worker
    local_directory="../dask",  # folder for workers data
)

Spawn between 20 to 100 workers and connect a client to be able use them.

In [4]:
#cluster.scale(n=20) # ask for 20 jobs or workers
# This also works with adaptive clusters. This automatically launches and kill workers based on load.
# we tell our cluster to autoscale between 10 and 20 workers depending on the load
cluster.adapt(minimum_jobs=10, maximum_jobs=200)
#cluster.adapt(maximum_jobs=20)
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://192.168.94.166:42232  Dashboard: http://192.168.94.166:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [5]:
df

Unnamed: 0_level_0,beginline,endline,begincolumn,endcolumn,rule,ruleset,class,externalInfoUrl,priority,text
npartitions=204,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
,object,object,object,object,object,object,object,object,object,object
,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...


In [6]:
pmd_codeviolations_init_len = df.shape[0].compute()
print('Initial number of volations {}'.format(pmd_codeviolations_init_len))

Initial number of volations 4896537


In [7]:
# drop empty columns
df = df[~df['class'].isna()] # Drop rows that have NaN in the Code column

In [8]:
pmd_codeviolations_nonan_len =df.shape[0].compute()
print('Number of violations after removing the nans in the class column {}'.format(pmd_codeviolations_nonan_len))

Number of violations after removing the nans in the class column 4887262


In [9]:
print('Difference {}'.format(pmd_codeviolations_init_len - pmd_codeviolations_nonan_len))

Difference 9275


In [10]:
# drop classes that do not have this pattern e.g., Code_10000096_9914015_1959_0
df = df[df['class'].str.contains('^Code_\d+_\d+_\d+_\d+', regex=True)]

In [11]:
pmd_codeviolations_unkownclass_len = df.shape[0].compute()
print('Number of violations after removing the onces caused by unknown class e.g. inner classes {}'.format(pmd_codeviolations_nonan_len))

Number of violations after removing the onces caused by unknown class e.g. inner classes 4887262


In [12]:
print('Difference {}'.format(pmd_codeviolations_nonan_len - pmd_codeviolations_unkownclass_len))

Difference 198010


In [13]:
# We did not name our classes following the naming convensions 
# hence we will remove all the records that have Our class name mentioned in the ClassNamingConensions 
# - The logic for using this is that the class may contain contain a subclass that is not following the naming convension
pgmr_df = df[
    ~df['text'].str.contains('^Code_\d+_\d+_\d+_\d+', regex=True) & 
    df['rule'].str.contains('ClassNamingConventions')
]

In [14]:
pmd_codeviolations_pgmr_len = pgmr_df.shape[0].compute()
print('Number of violations caused by programmer {}'.format(pmd_codeviolations_pgmr_len))

Number of violations caused by programmer 731099


In [15]:
print('Difference {}'.format(pmd_codeviolations_unkownclass_len - pmd_codeviolations_pgmr_len))

Difference 3958153


In [16]:
# Get all the dataset that does not apply to our naming convension cases
#df = df[~(df['class'].isin(df2['class'].compute())&df['rule'].isin(df2['rule'].compute())&df['ruleset'].isin(df2['ruleset'].compute()))]
df = df[~(
    df['class'].isin(pgmr_df['class'].compute()) & 
    df['rule'].isin(pgmr_df['rule'].compute()) & 
    df['ruleset'].isin(pgmr_df['ruleset'].compute())
)]

In [17]:
pmd_codeviolations_len = df.shape[0].compute()
print('Number of violations caused by programmer {}'.format(pmd_codeviolations_len))

Number of violations caused by programmer 3958153


In [18]:
print('Difference {}'.format(pmd_codeviolations_len - pmd_codeviolations_pgmr_len))

Difference 3227054


In [19]:
single_class_df = df[df['class'].str.contains('17170546_17166918_4765_1', regex=False)]

In [20]:
single_class_df.shape[0].compute()

4

In [21]:
single_class_df = single_class_df.compute()

In [22]:
# Check out the one lines
for index, row in single_class_df.iterrows():
    print('<<<@@@@@@@@@@@ ID:{} @@@@@@@@@@@>>>'.format(index))
    print(row['rule'])
    print(row['text'])

<<<@@@@@@@@@@@ ID:7187 @@@@@@@@@@@>>>
UnnecessaryImport
Unnecessary import from the current package 'MyMain'
<<<@@@@@@@@@@@ ID:7189 @@@@@@@@@@@>>>
LocalVariableCouldBeFinal
Local variable 't' could be declared final
<<<@@@@@@@@@@@ ID:7190 @@@@@@@@@@@>>>
DoNotUseThreads
To be compliant to J2EE, a webapp should not use any thread.
<<<@@@@@@@@@@@ ID:7191 @@@@@@@@@@@>>>
UnnecessaryFullyQualifiedName
Unnecessary use of fully qualified name 'MyMain.workers' due to existing import 'MyMain'


In [23]:
nn

NameError: name 'nn' is not defined

#### Count the unique classes

In [None]:
#count the unique classes
pmd_codeviolations_df.groupby('class').count().shape[0].compute()

In [None]:
grp_df = pmd_codeviolations_df.groupby(['class', 'rule'])

In [None]:
grp_df.first()

In [None]:
grp_df.first().compute()

#### Get the unique classes dataframe

In [None]:
# get the unique classes
class_df = pmd_codeviolations_df['class'].drop_duplicates().compute()

In [None]:
# This also counts the unique classes
class_df.shape[0]

## Group based on class and rule

In [None]:
grp_df = pmd_codeviolations_df.groupby(['class', 'rule', ])

In [None]:
grp_df.first()

In [None]:
grp_df.first().compute()