In [1]:
import time
from datetime import timedelta

import html
import re

import os
import ntpath

import subprocess as sp

import numpy as np
import pandas as pd
import xml.etree.ElementTree as et

import dask
import dask.dataframe as dd
import dask.bag as bd
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

## Enter full path to CSV file

In [2]:
## Enter full path to CSV file

common_path = '../../../../stackexchange_v2/workspace/input'
#common_path = '../SO_data_dump'

## Enter Relative Path to csv file
relative_path = 'javarawcodes_csv/JavaRawCodes*.csv'

filepath_javaanswers = '{}/{}'.format(common_path, relative_path)

## Read Java Raw Codes csv files 

In [3]:
ddf_javarawcodes = dd.read_csv(filepath_javaanswers, engine='python', error_bad_lines=False, warn_bad_lines=False, dtype=object)

## Start a Dask cluster using SLURM jobs as workers

In [4]:
#http://jobqueue.dask.org/en/latest/generated/dask_jobqueue.SLURMCluster.html
dask.config.set(
    {
        "distributed.worker.memory.target": False,  # avoid spilling to disk
        "distributed.worker.memory.spill": False,  # avoid spilling to disk
    }
)
cluster = SLURMCluster(
    cores=2, #cores=10, # we set each job to have 1 Worker, each using 10 cores (threads) and 8 GB of memory
    processes=2, #processes=2,
    memory="16GiB",
    walltime="1-30:30",# walltime="0-00:50",
    log_directory="../dask/logs",  # folder for SLURM logs for each worker
    local_directory="../dask",  # folder for workers data
)

Spawn between 20 to 100 workers and connect a client to be able use them.

In [5]:
#cluster.scale(n=20) # ask for 20 jobs or workers
# This also works with adaptive clusters. This automatically launches and kill workers based on load.
# we tell our cluster to autoscale between 10 and 20 workers depending on the load
cluster.adapt(minimum_jobs=50, maximum_jobs=576)
#cluster.adapt(maximum_jobs=20)
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://192.168.94.140:34689  Dashboard: http://192.168.94.140:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [6]:
# Keep a copy
ddf_javarawcodes_temp = ddf_javarawcodes.copy()

## Numbere of Posts Containing Code Tags

In [7]:
# Check the length of the dataframe WITHOUT dropped records
#javarawcodes_len = len(ddf_javarawcodes.index)
#javarawcodes_len

## Drop all row with any column having a NaN value

In [8]:
# Drop all row with any column having a NaN value
#ddf_javarawcodes = ddf_javarawcodes.dropna()
ddf_javarawcodes = ddf_javarawcodes[~ddf_javarawcodes.Code.isna()] # Drop rows that have NaN in the Code column

In [9]:
# Get all the dropped rows with NA
javarawcodes_na_df = ddf_javarawcodes_temp[~ddf_javarawcodes_temp.Idx.isin(ddf_javarawcodes.Idx.compute())]

In [10]:
# Number of dropped records
javarawcodes_na_len = javarawcodes_na_df.index.shape[0].compute()
javarawcodes_na_len

961

In [11]:
# Number of records left
javarawcodes_notna_len = ddf_javarawcodes.index.shape[0].compute()
javarawcodes_notna_len

7193410

## Remove Codes that have any form of html or xml tags

#### Convert all the unicode strings into actual symbol

In [12]:
#https://stackoverflow.com/questions/60088353/convert-html-characters-to-strings-in-pandas-dataframe
ddf_javarawcodes = ddf_javarawcodes.applymap(html.unescape)

In [13]:
# keep a copy
ddf_javarawcodes_temp = ddf_javarawcodes.copy()

#### Remove code with any form of HTML/XML

In [14]:
#Replace all the single line html tags <.../> with empty string
#javarawcodes_html_df1 = ddf_javarawcodes[ddf_javarawcodes.Code.str.contains('^(//.*|\'|\n|\r|\s|\r\n)*<(.|\n|\r\n)*?>(.|\n|\r\n)*?<\/(.|\n|\r\n)*?(>|\'|\n|\r\n)$', regex=True)]
#ddf_javarawcodes_htmlxml = ddf_javarawcodes[ddf_javarawcodes.Code.str.contains('<.*>(.|\n|\r\n)*?<\/.*>', regex=True)] # this gets stuck at some point
ddf_javarawcodes_htmlxml = ddf_javarawcodes[ddf_javarawcodes.Code.str.contains('<.*>(.|\n|\r\n)<\/.*>*?', regex=True)]


  return func(self, *args, **kwargs)


In [15]:
# Number of records that dropped
javarawcodes_htmlxml_len = ddf_javarawcodes_htmlxml.index.shape[0].compute()
javarawcodes_htmlxml_len

77132

In [16]:
# Get all the records that DO NOT contains html tags regex
ddf_javarawcodes = ddf_javarawcodes[~ddf_javarawcodes.Idx.isin(ddf_javarawcodes_htmlxml.Idx.compute())]

In [17]:
# Number of records left
javarawcodes_len = ddf_javarawcodes.index.shape[0].compute()
javarawcodes_len

6948156

In [18]:
#Verify lengths
#javarawcodes_len + javarawcodes_htmlxml_len

## Drop java code with one line of code

In [19]:
# Keep a copy
ddf_javarawcodes_temp = ddf_javarawcodes.copy()

In [20]:
# Get df contains more than one line of java code
ddf_javarawcodes = ddf_javarawcodes[ddf_javarawcodes.Code.str.contains('\n')]

In [21]:
# Number of rows that contains more than one line of java code
javarawcodes_len = ddf_javarawcodes.index.shape[0].compute()
javarawcodes_len

2548181

In [22]:
# Get the dataframe that contains one line of java code
javarawcodes_oneline_df = ddf_javarawcodes_temp[~ddf_javarawcodes_temp.Idx.isin(ddf_javarawcodes.Idx.compute())]

In [23]:
# Number of rows that contains one line of java code
javarawcodes_oneline_len = javarawcodes_oneline_df.index.shape[0].compute()
javarawcodes_oneline_len

1603264

In [24]:
# Verify that the lengths are equal to the javarawcodes_html_len1
javarawcodes_len + javarawcodes_oneline_len

4151445

## Get complete java code snippet

In [25]:
# Get the rows that contain import, package or class
# using regex to search for text that contain import, package or class
ddf_complete_javacodes = ddf_javarawcodes[(ddf_javarawcodes['Code'].str.contains('^import\s+\w+(\.+\w+)*', case=False, regex=True))|
         (ddf_javarawcodes['Code'].str.contains('^package\s+\w+(\.+\w+)*', case=False, regex=True))|
         (ddf_javarawcodes['Code'].str.contains('(class|interface)(\s+\w+)(\s+\w+)*(\s*{)', case=False, regex=True))]

  return func(self, *args, **kwargs)


In [26]:
# Check lenght of complete java code snippet
complete_javacodes_len = ddf_complete_javacodes.index.shape[0].compute()
complete_javacodes_len

376041

## Get incomplete java code snippet

In [27]:
# Get all rows not contained in ddf_complete_javacodes
ddf_incomplete_javacodes = ddf_javarawcodes[~ddf_javarawcodes.Idx.isin(ddf_complete_javacodes.Idx.compute())]

In [28]:
# Check lenght of incomplete java code snippet
ddf_incomplete_javacodes.index.shape[0].compute()

1973457

In [29]:
# Total lenght of both complete and incomplete equals javarawcodes_len
ddf_complete_javacodes.index.shape[0].compute() + ddf_incomplete_javacodes.index.shape[0].compute()

2349498

## Furthermore drop codes that starts with $, WARNING:, >java, java, >javac, javac or contain some build, src directories

In [30]:
# Furthermore drop codes that starts with $, WARNING:, >java, java, >javac, javac, <xml and html tags> and directories

ddf_non_codes = ddf_incomplete_javacodes[
    (ddf_incomplete_javacodes.Code.str.contains('^\$.*', case=False, regex=True))|
    (ddf_incomplete_javacodes.Code.str.contains('^WARNING:(\s*\w*)*', case=False, regex=True))|
    (ddf_incomplete_javacodes.Code.str.contains('^($|>|java|javac|jar|javacc)(\s*\w*\.*)*', case=False, regex=True))|
    (ddf_incomplete_javacodes.Code.str.contains('(\\n)*\s*\.+\/+.*\s*\.*\/*(build|src)', case=False, regex=True))
]

  return func(self, *args, **kwargs)


In [31]:
# Check lenght of the non codes java code snippet 
non_codes_len = ddf_non_codes.index.shape[0].compute()
non_codes_len

32612

In [32]:
# get all the dropped rows that starts with $, WARNING:, >java, java, >javac, javac or directory
ddf_incomplete_javacodes = ddf_incomplete_javacodes[~ddf_incomplete_javacodes.Idx.isin(ddf_non_codes.Idx.compute())]

In [33]:
# Check lenght of incomplete java code snippet after html tags are dropped
incomplete_javacodes_len = ddf_incomplete_javacodes.index.shape[0].compute()
incomplete_javacodes_len

1922305

In [34]:
# Verify that the length is equals to combination of complete_javarawcodes_len and incomplete_javarawcodes_len above
incomplete_javacodes_len + non_codes_len

1954917

### Make a raw complete codes folder in that directory

In [35]:
## Make a folder in that directory
folder = '{}/javarawcompletecodesnippets_csv'.format(common_path)
# output: path/to/Post.csv => path/to
mkdir_cmd = 'mkdir {}'.format(folder)
cmd = sp.run(
    mkdir_cmd, # command
    capture_output=True,
    text=True,
    shell=True
)

In [36]:
print(folder)

../../../../stackexchange_v2/workspace/input/javarawcompletecodesnippets_csv


### Save Java raw complete codes csv files in that directory

In [37]:
## Save files in that directory
filename = 'JavaRawCompleteCodeSnippets'
file = '{}/{}*.csv'.format(folder, filename)
_ = ddf_complete_javacodes.to_csv(file, sep=',', index=False)

### Make a raw incomplete codes folder in that directory

In [38]:
## Make a folder in that directory
folder = '{}/javarawincompletecodesnippets_csv'.format(common_path)
# output: path/to/Post.csv => path/to
mkdir_cmd = 'mkdir {}'.format(folder)
cmd = sp.run(
    mkdir_cmd, # command
    capture_output=True,
    text=True,
    shell=True
)

In [39]:
print(folder)

../../../../stackexchange_v2/workspace/input/javarawincompletecodesnippets_csv


### Save Java raw incomplete codes csv files in that directory

In [40]:
## Save files in that directory
filename = 'JavaRawIncompleteCodeSnippets'
file = '{}/{}*.csv'.format(folder, filename)
_ = ddf_incomplete_javacodes.to_csv(file, sep=',', index=False)

### Make a code that contains HTML/XML folder in that directory 

In [41]:
## Make a folder in that directory
folder = '{}/javahtmlxmltags_csv'.format(common_path)
# output: path/to/Post.csv => path/to
mkdir_cmd = 'mkdir {}'.format(folder)
cmd = sp.run(
    mkdir_cmd, # command
    capture_output=True,
    text=True,
    shell=True
)

### Save Java codes that contains HTML/XML csv files in that directory

In [42]:
## Save files in that directory
filename = 'JavaCodeHtmlXmlTags'
file = '{}/{}*.csv'.format(folder, filename)
_ = ddf_javarawcodes_htmlxml.to_csv(file, sep=',', index=False)

### Make a noncode folder in that directory 

In [43]:
## Make a folder in that directory
folder = '{}/noncode_csv'.format(common_path)
# output: path/to/Post.csv => path/to
mkdir_cmd = 'mkdir {}'.format(folder)
cmd = sp.run(
    mkdir_cmd, # command
    capture_output=True,
    text=True,
    shell=True
)

In [44]:
print(folder)

../../../../stackexchange_v2/workspace/input/noncode_csv


### Save non codes csv files in that directory

In [45]:
## Save files in that directory
filename = 'NonCodes'
file = '{}/{}*.csv'.format(folder, filename)
_ = ddf_non_codes.to_csv(file, sep=',', index=False)

## Replace all the interface and class names to all have a common name (e.g. Code) in the ddf_complete_javacodes

In [46]:
# Replace all the class with the class to all have a common name, say Code
ddf_complete_javacodes = ddf_complete_javacodes.replace(to_replace = 'class\s+\w+', value = 'class Code', regex=True)

In [47]:
# Also Replace interface with the interface Code
ddf_complete_javacodes = ddf_complete_javacodes.replace(to_replace = 'interface\s+\w+', value = 'interface Code', regex=True)

## Encapsulate incomplete written java program with public Code { ... }

In [48]:
# Copy the code to Code2
#ddf_incomplete_javacodes['Code2'] = ddf_incomplete_javacodes['Code']

In [49]:
# Encapsulate incomplete written java program with public Code {  }
ddf_incomplete_javacodes['Code'] = 'public class Code {\n'+ ddf_incomplete_javacodes['Code'].astype(str) +'\n}'

In [50]:
ddf_complete_javacodes.columns

Index(['Idx', 'match', 'Code'], dtype='object')

### Make a complete codes folder in that directory

In [51]:
## Make a folder in that directory
folder = '{}/javacompletecodesnippets_csv'.format(common_path)
# output: path/to/Post.csv => path/to
mkdir_cmd = 'mkdir {}'.format(folder)
cmd = sp.run(
    mkdir_cmd, # command
    capture_output=True,
    text=True,
    shell=True
)

In [52]:
print(folder)

../../../../stackexchange_v2/workspace/input/javacompletecodesnippets_csv


### Save Java complete codes csv files in that directory

In [53]:
## Save files in that directory
filename = 'JavaCompleteCodeSnippets'
file = '{}/{}*.csv'.format(folder, filename)
_ = ddf_complete_javacodes.to_csv(file, sep=',', index=False)

In [54]:
ddf_incomplete_javacodes.columns

Index(['Idx', 'match', 'Code'], dtype='object')

### Make a incomplete codes folder in that directory

In [55]:
## Make a folder in that directory
folder = '{}/javaincompletecodesnippets_csv'.format(common_path)
# output: path/to/Post.csv => path/to
mkdir_cmd = 'mkdir {}'.format(folder)
cmd = sp.run(
    mkdir_cmd, # command
    capture_output=True,
    text=True,
    shell=True
)

In [56]:
print(folder)

../../../../stackexchange_v2/workspace/input/javaincompletecodesnippets_csv


### Save Java incomplete codes csv files in that directory

In [57]:
## Save files in that directory
filename = 'JavaIncompleteCodeSnippets'
file = '{}/{}*.csv'.format(folder, filename)
_ = ddf_incomplete_javacodes.to_csv(file, sep=',', index=False)

## Merge the data frame containing the completed java codes with the data frame containing the incomplete code (encapsulated with public Code {})

In [58]:
# merge complete and incomplete into one dataframe
df = dd.concat([ddf_complete_javacodes, ddf_incomplete_javacodes])

In [59]:
df.index.shape[0].compute()

2298346

In [60]:
# Verify that the combination of the lengths is same as merged len
complete_javacodes_len + incomplete_javacodes_len

2298346

In [61]:
df.columns

Index(['Idx', 'match', 'Code'], dtype='object')

## Replace all cases where three dots ... appeared at the begining of a line within the code with comment i.e. //...

In [62]:
df['Code'] = df.Code.replace(to_replace ='(\\n)+\s*\.{2,}', value = '\n\n//...\n\n', regex=True)
df['Code'] = df.Code.replace(to_replace ='(\\n)+\s*\.{2,}', value = '\n\n//...\n\n', regex=True)
df['Code'] = df.Code.replace(to_replace ='(\\n\.)+', value = '\n\n//...\n\n', regex=True)
df['Code'] = df.Code.replace(to_replace ='\{(\\n)+\s*\.{2,}', value = '{\n\n//...\n\n', regex=True)
df['Code'] = df.Code.replace(to_replace ='\[(\\n)+\s*\.{2,}', value = '[\n\n//...\n\n', regex=True)
df['Code'] = df.Code.replace(to_replace ='\{\s*\.{2,}', value = '{\n\n//...\n\n', regex=True)
df['Code'] = df.Code.replace(to_replace ='\[\s*\.{2,}', value = '[\n\n//...\n\n', regex=True)

#### To verify if the Regex works

In [63]:
#@@ This is supposed tp capture anywhere 2 or more dots occured anywhere in the code 
doted_df1 = df[df.Code.str.contains('((\.\s){2,}|\.{2,})', case=False, regex=True)] #works

  return func(self, *args, **kwargs)


In [64]:
doted_df1.index.shape[0].compute()

223383

In [65]:
#this is ment to capture anywhere we have three dots without comment //
doted_df2 = doted_df1[doted_df1.Code.str.contains('((\[|{)(\\n)*\s*|(\\n)+\s*)\.{2,}|(\\n\.)+', case=False, regex=True)]

  return func(self, *args, **kwargs)


In [66]:
doted_df2.index.shape[0].compute()

29

In [67]:
doted_df3=doted_df1[~doted_df1.Idx.isin(doted_df2.Idx.compute())]

In [68]:
doted_df3.index.shape[0].compute()

223349

## Uniquely identify each class name by appending the index_questionid to the end of the class name

In [69]:
# To uniquely identify each class, append the index number of each row to Classname

# e.g. 
# Code_44671882_44671797_3688_0 
# Code_44671882_44671797_3688_2
# Code_44671809_32500182_1042_0
# ...

# Explanation:
# e.g. Code0_44671882_44671797_3688_0
# CodeUniqueIndex_PostID_ParentID_GroupID_MatchCaseNumber
# Code0_44671882_44671797_3688_1 did not appear becous it could have been removed while preprocessing

# Importing re package for using regular expressions 
import re 

#def appendIndexToClassNames(index, questionIdx, match, javacode): 
def appendIndexToClassNames(questionIdx, match, javacode): 
    
    # Search for class ClassName 
    # i.e. class then any characters repeated any number of times 
    if re.search('(class|interface)\s+\w+', javacode): 
  
        # Extract the position of end of pattern 
        #start = re.search('(class|interface)\s+\w+', javacode).start()
        end = re.search('(class|interface)\s+\w+', javacode).end()
  
        # return the cleaned name 
        # return javacode[ : pos]+str(javacode.index)+javacode[pos:]
        return javacode[ : end]+'_'+str(questionIdx)+'_'+str(match)+javacode[end: ]
  
    else: 
        # if clean up needed return the same name 
        return javacode 
          
 

In [70]:
# Updated the Java Code answers column  
df['Code'] = df.apply(
         #lambda row: appendIndexToClassNames(row.name, row['Idx'], row['match'], row['Code']),axis=1)
    lambda row: appendIndexToClassNames(row['Idx'], row['match'], row['Code']),
         axis=1)
# Print the updated dataframe 
#print(df)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'object'))



### Make a folder in that directory

In [71]:
## Make a folder in that directory
folder = '{}/codesnippets_csv'.format(common_path)
# output: path/to/Post.csv => path/to
mkdir_cmd = 'mkdir {}'.format(folder)
cmd = sp.run(
    mkdir_cmd, # command
    capture_output=True,
    text=True,
    shell=True
)

In [72]:
print(folder)

../../../../stackexchange_v2/workspace/input/codesnippets_csv


### Save files in that directory

In [73]:
## Save files in that directory
filename = 'JavaCodeSnippets'
file = '{}/{}*.csv'.format(folder, filename)
_ = df.to_csv(file, sep=',', index=False)