In [2]:
import re
import Mapping
import DataFlowApproach
import Config

import math


def getFrequencyFromList(lst):
    """Build a histogram of the items in ``lst``.

    Parameters:
        lst: iterable of hashable tokens.

    Returns:
        dict mapping each distinct token to the number of times it occurs,
        with keys in first-occurrence order.
    """
    counts = {}
    for item in lst:
        # Start from 0 for a token we have not seen yet.
        counts[item] = counts.get(item, 0) + 1
    return counts


def getMostFrequent(dict_freq, threshold=1):
    """Return the most frequent keys of a frequency dict.

    Parameters:
        dict_freq: dict mapping a token to its occurrence count.
        threshold: fraction of the distinct tokens to keep; the number kept is
            ``ceil(len(dict_freq) * threshold)``, limited to the range
            ``0 .. len(dict_freq)``.

    Returns:
        list of tokens ordered by decreasing count (ties keep dict order).
    """
    ranked = sorted(dict_freq.items(), key=lambda pair: pair[1], reverse=True)
    wanted = math.ceil(len(ranked) * threshold)
    # A negative request keeps nothing, an oversized one keeps everything.
    keep = max(0, min(wanted, len(ranked)))
    return [token for token, _count in ranked[:keep]]


def detectClone(codeBlocks):
    """Annotate every code block with the blocks that look like its clones.

    Parameters:
        codeBlocks: dict mapping a block id to a dict that holds at least the
            keys 'Code', 'FileInfo', 'Start' and 'End'.

    Returns:
        ``(codeBlocks, codeclonelines)``. ``codeBlocks`` is the same dict,
        whose blocks gained the keys "Tokens", "Variables_Scope",
        "Method_Calls_Scope" and "CodeClones". ``codeclonelines`` is the sum
        of ``len(candidate['Code'])`` over every reported (block, candidate)
        pair, so a candidate matched by several blocks is counted each time.
    """
    # Pass 1: token histogram and data-flow scopes for each block.
    for block in codeBlocks.values():
        source = block['Code']
        dict_tokens, dict_variables, dict_methods = getAllTokens(source)

        frequent_variables = getMostFrequent(
            dict_variables, Config.variableAndMethodsThreshold)
        frequent_methods = getMostFrequent(
            dict_methods, Config.variableAndMethodsThreshold)

        location = [block['FileInfo'], block['Start'], block['End']]
        variable_scope, method_calls_scope = DataFlowApproach.dataFlowGenerator(
            source, frequent_variables, frequent_methods, location)

        block["Tokens"] = dict_tokens
        block["Variables_Scope"] = variable_scope
        block["Method_Calls_Scope"] = method_calls_scope

    codeclonelines = 0

    # Pass 2: compare every ordered pair of distinct blocks.
    for blockId, block in codeBlocks.items():
        tokens = block["Tokens"]
        variable_scope = block["Variables_Scope"]
        method_calls_scope = block["Method_Calls_Scope"]

        matches = []
        for candidateId, candidate in codeBlocks.items():
            if candidateId == blockId:
                continue

            simTokens = similarity(tokens, candidate["Tokens"])
            if not (simTokens >= Config.tokenSimilarityThreshold):
                continue

            # Token histograms are close enough; now compare the data flow.
            pairInfo = [block['FileInfo'], block['Start'], block['End'],
                        candidate['FileInfo'], candidate['Start'], candidate['End']]
            simVariables, simMethods = DataFlowApproach.getSimilarity(
                variable_scope, method_calls_scope,
                candidate["Variables_Scope"], candidate["Method_Calls_Scope"],
                pairInfo)

            if (simVariables >= Config.similarityDataFlowThreshold
                    and simMethods >= Config.similarityDataFlowThreshold):
                codeclonelines += len(candidate['Code'])
                matches.append({
                    "Similarity": [simTokens, simVariables, simMethods],
                    "codeCandidateId": candidateId,
                })

        block["CodeClones"] = matches

    return codeBlocks, codeclonelines


def getAllTokens(code):
    """Tokenise source lines and count tokens, variable names and method names.

    Each line has its quoted strings replaced by ``STRING_LITERAL``, every
    ``Mapping.delimiters`` entry padded with spaces, and is then split on
    whitespace. For each resulting unit:

    * numbers become ``INTEGER_LITERAL``;
    * units listed in ``Mapping.symbols`` are ignored;
    * units in ``Mapping.keywords`` contribute their mapped token;
    * a unit directly followed by ``(`` is a method call: the part after the
      last ``.`` is recorded as method name and as token;
    * anything else is a variable: the part after the last ``.`` is recorded
      as variable name and the token ``TOKEN_VARIABLE`` is added.

    Parameters:
        code: iterable of source lines (str).

    Returns:
        ``(dict_tokens, dict_variables, dict_methods)`` - three frequency
        dicts produced by ``getFrequencyFromList``.
    """
    # Compiled once per call instead of once per line.
    string_pattern = re.compile(r"(\".*?\"|\'.*?\')")
    delimiter_pattern = re.compile(
        r'(?<=\W|\w)(' + '|'.join(map(re.escape, Mapping.delimiters)) + ')')
    # At least one digit is required. The previous pattern made every part
    # optional, so a bare "+", "-" or "." was rewritten to INTEGER_LITERAL and
    # counted as a literal instead of being skipped as a symbol.
    number_pattern = re.compile(r"^[+-]?(\d+(\.\d*)?|\.\d+)$")

    list_methods = []
    list_tokens = []
    list_variables = []

    for line in code:
        line = string_pattern.sub(" STRING_LITERAL ", line)
        # str.split() never yields empty or padded units.
        list_line = delimiter_pattern.sub(r' \1 ', line).split()

        for idx, raw_unit in enumerate(list_line):
            unit = number_pattern.sub("INTEGER_LITERAL", raw_unit)
            if unit in Mapping.symbols:
                continue
            if unit in Mapping.keywords:
                list_tokens.append(Mapping.keywords[unit])
                continue

            # Only the last component of a dotted name is kept.
            short_name = unit.split(".")[-1]
            is_call = idx + 1 < len(list_line) and list_line[idx + 1] == '('
            if is_call:
                list_methods.append(short_name)
                list_tokens.append(short_name)
            else:
                list_variables.append(short_name)
                list_tokens.append("TOKEN_VARIABLE")

    dict_tokens = getFrequencyFromList(list_tokens)
    dict_variables = getFrequencyFromList(list_variables)
    dict_methods = getFrequencyFromList(list_methods)

    return dict_tokens, dict_variables, dict_methods


def similarity(Tokens1, Tokens2):
    """Weighted Jaccard similarity of two token-frequency dicts.

    Parameters:
        Tokens1, Tokens2: dicts mapping a token to its occurrence count.

    Returns:
        ``shared / (total1 + total2 - shared)`` where ``shared`` is the sum,
        over tokens present in both dicts, of the smaller of the two counts.
        The result lies between 0 and 1. When both dicts are empty there is
        nothing to compare and 0.0 is returned (previously this raised
        ZeroDivisionError), so token-less blocks are never reported as clones.
    """
    shared = sum(min(count, Tokens2[token])
                 for token, count in Tokens1.items()
                 if token in Tokens2)
    union = sum(Tokens1.values()) + sum(Tokens2.values()) - shared
    if union == 0:
        return 0.0
    return shared / union


In [3]:
# Folder holding the project to analyse.
# NOTE(review): absolute, machine-specific path; no code visible in this
# notebook reads this value (the driver cell defines its own dirPath) - confirm.
dirPath = "/Users/vivekgoud/Documents/GitHub/Test_project_Codeclonetracer"
#dirPath = "F:\8th-Sem-Project\src\examples\Single"
#outputPath = "F:\8th-Sem-Project\src\CodeCloneDetection\output.txt"
#outputCSVPath = "F:\8th-Sem-Project\src\CodeCloneDetection\clonesDetected.csv"
# Level of detail for the output written to file:
# 0 means everything
# 1 means the current block's code and only the clone blocks' info
# 2 means only the current block's and the clone blocks' info
# NOTE(review): not referenced by any code visible in this notebook.
outputLevel = 2

# Minimum number of lines a method must span to be kept as a block
# (methodLevelBlocks skips a method when End - Start < minimumLengthBlock - 1).
minimumLengthBlock = 4

# Threshold for considering two blocks as clone candidates: detectClone only
# runs the data-flow comparison when the token similarity reaches this value.
# Threshold = 1 for type 2 clones
tokenSimilarityThreshold = 0.75

# Threshold for the similarity measured by the data-flow approach; both the
# variable and the method-call similarity must reach it in detectClone.
similarityDataFlowThreshold = 0.65

# Fraction of the most frequent variables and methods that is kept
# (passed as `threshold` to getMostFrequent).
variableAndMethodsThreshold = 0.65

# Threshold while comparing the data flow of two variables or methods.
# NOTE(review): DataFlowApproach.getSimilarity currently ignores this value and
# uses its own local constant 0.95 (the Config lookups there are commented out).
dataFlowSimilaritythreshold = 0.65

# Block granularity: 'method_level' splits every file into one block per
# method; any other value makes extractMethodsAllFiles use one block per file.
granularity = 'method_level'



In [4]:
import itertools
import logging
import os
#import GetFunctions
import re
import sys
import traceback
import CloneDetector
import javalang

import Config
import pandas as pd
global found_parent

def extractMethodsAllFiles(listOfFiles,
                           trackingFile='/Users/vivekgoud/Downloads/thesis/Tracking_dataset.csv'):
    """Split files into code blocks, detect clones and update the tracking CSV.

    Parameters:
        listOfFiles: iterable of paths of the source files to analyse; every
            file is read as UTF-8.
        trackingFile: path of the CSV that stores the clone dataset between
            runs. It is read when it exists and always (re)written. The
            default is the location that used to be hard-coded.

    Returns:
        ``(current_dataset, linesofcode, codeclonelines)``:
        the clone DataFrame that was written to ``trackingFile`` (all values
        converted to str), the total number of lines read from the files, and
        the cloned-line count reported by ``CloneDetector.detectClone``.

    Side effects:
        prints progress messages and writes ``trackingFile``.
    """
    allFilesMethodsBlocks = {}
    blocksSoFar = 0
    linesofcode = 0

    for filePath in listOfFiles:
        # The context manager closes the file even when reading fails.
        with open(filePath, 'r', encoding='utf-8') as sourceFile:
            originalCode = sourceFile.readlines()
        linesofcode += len(originalCode)

        if Config.granularity == 'method_level':
            codeBlocks = methodLevelBlocks(originalCode)
        else:
            codeBlocks = fileLevelBlocks(originalCode)

        for codeBlock in codeBlocks:
            codeBlock["FileInfo"] = filePath
            # Lines spanned by the block. This used to be len(codeBlock),
            # i.e. the number of keys of the dict, which is not a line count.
            codeBlock["nloc"] = codeBlock["End"] - codeBlock["Start"] + 1
            blocksSoFar += 1
            allFilesMethodsBlocks["CodeBlock" + str(blocksSoFar)] = codeBlock

    print("detecting clones")
    codeBlocks, codeclonelines = CloneDetector.detectClone(allFilesMethodsBlocks)
    current_dataset = dataset_creation(codeBlocks)

    if os.path.isfile(trackingFile):
        previous_dataset = pd.read_csv(trackingFile, index_col=0)
        revision = previous_dataset.Revision.unique()
        print("Revision", revision[0])
        # Keep the earlier results of files that produced no row in this run.
        previous_clones = previous_dataset[
            ~previous_dataset.codeBlock_fileinfo.isin(current_dataset.codeBlock_fileinfo)]
        # ignore_index gives every row a unique label; with the two original
        # indexes kept, label-based selection would return rows twice.
        current_dataset = pd.concat([current_dataset, previous_clones],
                                    ignore_index=True)
        current_dataset['Revision'] = revision[0] + 1
    else:
        print("First version, no cloning result exists")
        current_dataset['Revision'] = 1

    # Everything is stored as text; duplicates are dropped on that text form
    # because the code column holds lists, which cannot be hashed directly.
    current_dataset = current_dataset.convert_dtypes().astype(str)
    current_dataset = current_dataset.drop_duplicates()
    current_dataset.to_csv(trackingFile)

    return current_dataset, linesofcode, codeclonelines

def dataset_creation(codeBlocks):
    """Flatten the clone annotations of ``codeBlocks`` into a DataFrame.

    Parameters:
        codeBlocks: dict mapping a block id to a block dict holding the keys
            "Start", "End", "FileInfo", "Code", "nloc" and "CodeClones", where
            "CodeClones" is a list of dicts with the keys "codeCandidateId"
            and "Similarity" (token, variable-flow and method-call-flow
            similarity, in that order).

    Returns:
        DataFrame with one row per (block, clone) pair. Start/end, the three
        similarities and nloc are stored as str. ``commitinfo`` is left empty
        (None) because no commit information is available here.
    """
    columns = ['codeBlockId', 'codeBlock_start', 'codeBlock_end', 'codeBlock_fileinfo',
               'codeblock_Code', 'codeCloneBlockId', 'codeCloneBlock_Fileinfo',
               'Similarity_Tokens', 'Similarity_Variable_Flow',
               'Similarity_MethodCall_Flow', 'commitinfo', 'nloc']

    # Rows are collected first and the frame is built once. The previous
    # implementation read a 12th value from 11-value rows (IndexError as soon
    # as one clone existed) and concatenated a Series onto the frame.
    rows = []
    for codeBlockId, codeBlock in codeBlocks.items():
        for cloneData in codeBlock["CodeClones"]:
            cloneId = cloneData["codeCandidateId"]
            cloneBlock = codeBlocks[cloneId]
            simTokens, simVariables, simMethods = cloneData["Similarity"][:3]
            rows.append({
                'codeBlockId': codeBlockId,
                'codeBlock_start': str(codeBlock["Start"]),
                'codeBlock_end': str(codeBlock["End"]),
                'codeBlock_fileinfo': codeBlock["FileInfo"],
                'codeblock_Code': codeBlock["Code"],
                'codeCloneBlockId': cloneId,
                'codeCloneBlock_Fileinfo': cloneBlock["FileInfo"],
                'Similarity_Tokens': str(simTokens),
                'Similarity_Variable_Flow': str(simVariables),
                'Similarity_MethodCall_Flow': str(simMethods),
                'commitinfo': None,
                'nloc': str(codeBlock["nloc"]),
            })

    return pd.DataFrame(rows, columns=columns)

def fileLevelBlocks(originalCode):
    """Treat a whole file as a single code block.

    Parameters:
        originalCode: list of the file's lines.

    Returns:
        a one-element list holding a dict with "Start" (always 1), "End" (the
        number of lines returned by removeCommentsFromCode) and "Code" (the
        unmodified ``originalCode`` object).
    """
    strippedLines = removeCommentsFromCode(originalCode)
    wholeFile = {"Start": 1, "End": len(strippedLines), "Code": originalCode}
    return [wholeFile]


def methodLevelBlocks(originalCode):
    """Create one code block per sufficiently long method of a file.

    Comments are stripped, the remaining lines are joined with newlines and
    handed to ``method_extractor``; its first result is the list of
    ``(start, end)`` line spans of the methods.

    Parameters:
        originalCode: list of the file's lines.

    Returns:
        list of dicts with "Start", "End" and "Code". Methods whose span
        ``abs(end - start)`` is below ``Config.minimumLengthBlock - 1`` are
        left out. The list is empty when no spans could be extracted.

    NOTE(review): "Code" is the complete ``originalCode`` list for every
    block, not only the lines of the method - confirm that this is intended.
    """
    strippedLines = removeCommentsFromCode(originalCode)
    spans = method_extractor("\n".join(strippedLines))[0]

    blocks = []
    if spans is None:
        return blocks

    for span in spans:
        first, last = span[0], span[1]
        if abs(last - first) < Config.minimumLengthBlock - 1:
            continue
        blocks.append({"Start": first, "End": last, "Code": originalCode})

    return blocks
# get all lines of code before detection 
# get all clone code lines
# send code blocks to dataset creation

def removeCommentsFromCode(originalCode):
    """Strip ``//`` and ``/* ... */`` comments from a list of source lines.

    A small state machine walks over every character. Comment markers inside
    double-quoted strings are left alone, and a backslash inside a string
    protects the character that follows it. The state carries over from one
    line to the next, so a block comment may span several lines.

    Parameters:
        originalCode: iterable of source lines (str).

    Returns:
        list with one entry per input line: the line without its comment
        parts and without a trailing newline.
    """
    DEFAULT, ESCAPE, STRING, ONE_LINE_COMMENT, MULTI_LINE_COMMENT = 1, 2, 3, 4, 5

    state = DEFAULT
    cleaned = []
    for line in originalCode:
        kept = []
        pos = 0
        while pos < len(line):
            ch = line[pos]
            pair = line[pos:pos + 2]

            if state == ONE_LINE_COMMENT:
                # Swallow everything, including the newline that ends it.
                if ch == '\n':
                    state = DEFAULT
                pos += 1
                continue

            if state == MULTI_LINE_COMMENT:
                if pair == "*/":
                    state = DEFAULT
                    pos += 2
                else:
                    pos += 1
                continue

            if state == DEFAULT:
                if pair == "/*":
                    state = MULTI_LINE_COMMENT
                elif pair == "//":
                    state = ONE_LINE_COMMENT
                elif ch == '\"':
                    state = STRING
            elif state == STRING:
                if ch == '\"':
                    state = DEFAULT
                elif ch == '\\':
                    state = ESCAPE
            else:
                # ESCAPE: the escaped character is consumed, back to STRING.
                state = STRING

            # The character that opens a comment is not kept.
            if state < ONE_LINE_COMMENT:
                kept.append(ch)
            pos += 1

        text = "".join(kept)
        if text.endswith('\n'):
            text = text[:-1]
        cleaned.append(text)
    return cleaned

# Python 3 ships the parser as ``configparser``; fall back to the old
# ``ConfigParser`` module name when that import fails.
try:
    from configparser import ConfigParser
except ImportError:
    from ConfigParser import ConfigParser  # ver. < 3.0


# Non-greedy pattern for a double-quoted span. getFunctions removes such
# spans from a line before it counts the braces on that line.
re_string = re.escape("\"") + '.*?' + re.escape("\"")


def getFunctions(filestring, comment_inline_pattern=".*?$"):
    """Locate every constructor and method of a Java source text.

    The text is parsed with ``javalang``. For each constructor/method
    declaration the end line is found by counting braces from the line of the
    declaration until they balance.

    Parameters:
        filestring: the complete Java source as one string.
        comment_inline_pattern: regular expression removed from each line
            before its braces are counted.

    Returns:
        ``(method_pos, method_string, method_name)`` - three parallel lists
        holding, per method, the ``(first_line, last_line)`` pair (1-based),
        the source text of those lines and a qualified name of the form
        ``package.Class.method(ArgType,...)`` (``$`` separates nested types).
        When the source cannot be parsed the result is ``(None, None, [])``.

    Side effects:
        resets the module-level ``found_parent`` list, which
        ``check_repetition`` fills while names are built.
    """
    global found_parent
    found_parent = []

    try:
        tree = javalang.parse.parse(filestring)
        package = 'DefaultPackage' if tree.package is None else tree.package.name
    except Exception:
        # Deliberate best effort: an unparsable file yields no methods.
        return (None, None, [])

    source_lines = filestring.split('\n')
    method_pos = []
    method_string = []
    method_name = []

    declarations = itertools.chain(
        tree.filter(javalang.tree.ConstructorDeclaration),
        tree.filter(javalang.tree.MethodDeclaration))

    for path, node in declarations:
        # Walk from the declaration outwards and prepend each enclosing type.
        name = '.' + node.name
        for depth, ancestor in enumerate(reversed(path)):
            if isinstance(ancestor, javalang.tree.ClassDeclaration):
                # Only the outermost class is joined with '.'.
                separator = '.' if len(path) - 3 == depth else '$'
                name = (separator + ancestor.name
                        + check_repetition(ancestor, ancestor.name) + name)
            if isinstance(ancestor, javalang.tree.ClassCreator):
                name = ('$' + ancestor.type.name
                        + check_repetition(ancestor, ancestor.type.name) + name)
            if isinstance(ancestor, javalang.tree.InterfaceDeclaration):
                name = ('$' + ancestor.name
                        + check_repetition(ancestor, ancestor.name) + name)

        # One "[]" per array dimension of the parameter type.
        args = ",".join(
            param.type.name + "[]" * len(param.type.dimensions)
            for param in node.parameters)
        fqn = "%s%s(%s)" % (package, name, args)

        init_line, _column = node.position

        body_lines = []
        opened = 0
        closed = 0
        for line in source_lines[init_line - 1:]:
            body_lines.append(line)
            # String literals are removed BEFORE the inline-comment pattern.
            # In the opposite order a marker inside a string (for example
            # "#") erased the rest of the line, including its braces, and the
            # method ended on the wrong line.
            code_only = re.sub(re_string, '', line, flags=re.DOTALL)
            code_only = re.sub(comment_inline_pattern, '', code_only,
                               flags=re.MULTILINE)
            opened += code_only.count('{')
            closed += code_only.count('}')
            if opened > 0 and opened == closed:
                break

        end_line = init_line + len(body_lines) - 1
        method_pos.append((init_line, end_line))
        method_string.append('\n'.join(body_lines))
        method_name.append(fqn)

    return (method_pos, method_string, method_name)


def check_repetition(node, name):
    """Return the suffix that tells apart different nodes sharing one name.

    The module-level list ``found_parent`` remembers every node seen so far
    as ``(node, name, index)``. The first node with a given name has index -1
    and gets no suffix; later distinct nodes with the same name get "_0",
    "_1", ... A node that is already registered gets its stored suffix again.

    Parameters:
        node: the object to look up (compared by identity).
        name: the name under which ``node`` is registered.

    Returns:
        '' or '_<index>'.
    """
    same_name_before = -1
    for seen_node, seen_name, stored in found_parent:
        if seen_node is node:
            return '' if stored == -1 else '_' + str(stored)
        if seen_name == name:
            same_name_before += 1

    # First encounter of this node: register it.
    found_parent.append((node, name, same_name_before))
    return '' if same_name_before == -1 else '_' + str(same_name_before)


def method_extractor(file):
    """Extract the methods of a Java source text via ``getFunctions``.

    Parameters:
        file: the complete source text as one string.

    Returns:
        whatever ``getFunctions`` returns: ``(positions, bodies, names)``.

    The unused locals of the earlier version (including a ``ConfigParser``
    instance that was created and never read) have been removed.
    """
    # Everything from '#' to the end of a line is ignored while counting
    # braces. NOTE(review): '#' does not start a comment in Java - confirm
    # that this marker is still wanted.
    comment_inline = "#"
    comment_inline_pattern = comment_inline + '.*?$'

    return getFunctions(file, comment_inline_pattern)

    # allFilesInFolder = GetFiles.getAllFilesUsingFolderPath(folderPath)

    # print(allFilesInFolder)


def getAllFilesUsingFolderPath(folderPath, maxCount=100):
    """Collect the paths of the ``.java`` files below ``folderPath``.

    Parameters:
        folderPath: root directory that is walked recursively.
        maxCount: upper bound on the number of paths returned; ``None``
            removes the limit. The default keeps the former built-in cap.

    Returns:
        list of file paths, at most ``maxCount`` long. Directories and files
        are visited in sorted order so that a truncated result is the same on
        every run.

    The earlier version counted files of any type towards the cap and only
    left the inner loop when the cap was hit, so the walk went on and kept
    adding one file per remaining directory.
    """
    allFilesInFolder = []
    for subdir, dirs, files in os.walk(folderPath):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for fileName in sorted(files):
            if os.path.splitext(fileName)[1] != ".java":
                continue
            if maxCount is not None and len(allFilesInFolder) >= maxCount:
                return allFilesInFolder
            allFilesInFolder.append(os.path.join(subdir, fileName))
    return allFilesInFolder

In [5]:
import Mapping

import re
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import Config

# Scope label that dataFlowGenerator records after it meets a control-flow
# keyword.
# NOTE(review): the "else if" entry can only be reached through a keyword of
# that exact spelling, and the `keywords` list below has none - confirm.
cf_mapping = {"while": "iteration",
              "for": "iteration",
              "do": "iteration",
              "if": "selection",
              "else": "selection",
              "else if": "selection",
              "try": "try",
              "catch": "catch",
              "finally": "finally"}

# Keywords that change the current scope in dataFlowGenerator and that make
# parenthesisBalancer check a line for a missing brace.
keywords = ["while", "for", "do", "if", "else", "try", "catch", "finally"]

# NOTE(review): not referenced by any code visible in this notebook.
operators_and_symbols = ['<', '>', '=', '==', '+', '-', '*', '/',
                         '>=', "<=", '{', '}', '(', ')', ',', ';', '||']


def stringMatching(str1, str2):
    """Return the fuzzy similarity of two strings.

    Thin wrapper around ``fuzz.ratio``; the score is handed back unchanged
    (fuzzywuzzy documents it as an integer from 0 to 100).
    """
    return fuzz.ratio(str1, str2)


# def lcs(num1, num2, m, n):
#     dp = []
#     for _ in range(m+1):
#         dp.append([])
#         for __ in range(n+1):
#             dp[-1].append(0)

#     for i in range(m+1):
#         for j in range(n+1):
#             if(i == 0 or j == 0):
#                 dp[i][j] = 0

#             elif(num1[i-1] == num2[j-1]):
#                 dp[i][j] = dp[i-1][j-1] + 1

#             else:
#                 dp[i][j] = max(dp[i-1][j], dp[i][j-1])

#     return dp[m][n]

def checkForParenthesis(method_lines, lst_line, i):
    """Wrap a brace-less single-statement body in ``{ }``.

    Parameters:
        method_lines: non-empty list of source lines; modified in place.
        lst_line: whitespace-separated units of line ``i``.
        i: index of the line that holds the control-flow keyword.

    Returns:
        ``method_lines`` (the same list object).

    Nothing happens when ``lst_line`` already contains a ``{`` unit or when
    the first following non-blank line starts with ``{``. Otherwise the first
    following line that ends with ``;`` gets `` } `` appended and line ``i``
    gets `` { `` appended.
    """
    assert len(method_lines) > 0
    if '{' in lst_line:
        return method_lines

    for row, raw in enumerate(method_lines[i + 1:], start=i + 1):
        text = raw.strip()
        if not text:
            continue
        if text[0] == '{':
            # The body opens its own block on a later line.
            break
        if text[-1] == ';':
            method_lines[row] = method_lines[row] + " } "
            method_lines[i] = method_lines[i] + " { "
            break
    return method_lines


def parenthesisBalancer(method_lines):
    """Add missing braces after control-flow keywords.

    For every unit of every line that is listed in ``keywords``,
    ``checkForParenthesis`` is called once with that line's units and index.

    Parameters:
        method_lines: list of source lines; ``checkForParenthesis`` may edit
            it in place.

    Returns:
        the list returned by the last ``checkForParenthesis`` call, or
        ``method_lines`` itself when no keyword was found.
    """
    for row, text in enumerate(method_lines):
        words = text.split()
        for word in words:
            if word in keywords:
                # One call per keyword occurrence on the line.
                method_lines = checkForParenthesis(method_lines, words, row)
    return method_lines


def getSimilarity(m1_v_scope=[], m1_mc_scope=[], m2_v_scope=[], m2_mc_scope=[], clonesInfo=[]):
    """Compare the data flow of two code blocks.

    Parameters:
        m1_v_scope, m2_v_scope: lists of ``[name, flow]`` pairs for the
            variables of block 1 and block 2; ``flow`` is the space-separated
            scope string built by ``dataFlowGenerator``.
        m1_mc_scope, m2_mc_scope: the same for method calls.
        clonesInfo: accepted for callers, not used.

    Returns:
        ``(similarityVariables, similarityMethods)`` - the number of matching
        flows divided by the larger of the two list lengths; 1 when both
        lists are empty.

    The list defaults are never modified, so sharing them between calls is
    harmless.
    """
    # Local override; the corresponding Config value is deliberately not used.
    dataFlowSimilaritythreshold = 0.95

    total_count_variables = max(len(m1_v_scope), len(m2_v_scope))
    total_count_method_calls = max(len(m1_mc_scope), len(m2_mc_scope))

    clone_count_variables = _countSimilarFlows(
        m1_v_scope, m2_v_scope, dataFlowSimilaritythreshold)
    clone_count_method_calls = _countSimilarFlows(
        m1_mc_scope, m2_mc_scope, dataFlowSimilaritythreshold)

    similarityVariables = clone_count_variables / \
        total_count_variables if total_count_variables != 0 else 1
    similarityMethods = clone_count_method_calls / \
        total_count_method_calls if total_count_method_calls != 0 else 1

    return similarityVariables, similarityMethods


def _countSimilarFlows(scopes1, scopes2, threshold):
    """Count the pairs of flows from two scope lists that match.

    Both lists are walked with one cursor each. Flows of comparable length
    are compared as strings and both cursors advance; otherwise only the
    cursor on the longer flow advances.
    """
    matched = 0
    i = 0
    j = 0
    while i < len(scopes1) and j < len(scopes2):
        flow1 = scopes1[i][1]
        flow2 = scopes2[j][1]
        len1 = len(flow1.split())
        len2 = len(flow2.split())
        longest = max(len1, len2)

        if longest == 0:
            # Neither name has a recorded use, so there is nothing to match.
            # Both cursors must still move: leaving them in place made this
            # loop run forever.
            i += 1
            j += 1
        elif min(len1, len2) / longest >= threshold:
            # stringMatching returns a 0-100 score; bring it to 0-1 before
            # comparing it with the 0-1 threshold.
            if stringMatching(flow1, flow2) / 100 >= threshold:
                matched += 1
            i += 1
            j += 1
        elif len1 > len2:
            i += 1
        else:
            j += 1
    return matched


def dataFlowGenerator(method_lines, identifiers, method_calls, file_info):
    """Record in which scopes the given identifiers and method calls are used.

    The lines are first passed through ``parenthesisBalancer``, then split
    into units. A control-flow keyword switches the current scope label
    (``cf_mapping``), ``{`` pushes the label and raises the nesting level,
    ``}`` pops it. Every time a unit equals a tracked name,
    ``" <level><scope>"`` is appended to the flow string of that name.

    Parameters:
        method_lines: list of source lines. The list is not modified; the
            brace balancing works on a copy.
        identifiers: variable names to track.
        method_calls: method names to track.
        file_info: accepted for callers, not used.

    Returns:
        ``(identifier_scope, method_calls_scope)`` - two lists of
        ``[name, flow]`` pairs in the order of ``identifiers`` and
        ``method_calls``.
    """
    identifier_scope = [[name, ""] for name in identifiers]
    method_calls_scope = [[name, ""] for name in method_calls]

    scope_stack = []
    level = 0
    scope = "global"

    # parenthesisBalancer edits its argument in place. The caller's list is
    # shared (it is the block's stored 'Code'), so only a copy is handed over.
    method_lines = parenthesisBalancer(list(method_lines))

    # Compiled once per call instead of once per line.
    string_pattern = re.compile(r"(\".*?\"|\'.*?\')")
    delimiter_pattern = re.compile(
        r'(?<=\W|\w)('
        + '|'.join(map(re.escape, Mapping.delimiters + ['.']))
        + ')')
    number_pattern = re.compile(r"^[+-]?((\d*(\.\d*)?)|(\.\d*))$")

    for line in method_lines:
        line = string_pattern.sub(" STRING_LITERAL ", line)
        for unit in delimiter_pattern.sub(r' \1 ', line).split():
            unit = number_pattern.sub("INTEGER_LITERAL", unit)

            if unit in keywords:
                scope = cf_mapping[unit]

            if unit == '{':
                scope_stack.append(scope)
                level += 1

            if unit == '}':
                if scope_stack:
                    scope_stack.pop()
                # The label of the enclosing block becomes current again; with
                # an empty stack the last label stays in effect.
                if scope_stack:
                    scope = scope_stack[-1]
                level -= 1

            for identifier in identifiers:
                if identifier == unit:
                    slot = identifiers.index(identifier)
                    identifier_scope[slot][1] = (
                        identifier_scope[slot][1] + " " + str(level) + scope)

            for method_call in method_calls:
                if method_call == unit:
                    slot = method_calls.index(method_call)
                    method_calls_scope[slot][1] = (
                        method_calls_scope[slot][1] + " " + str(level) + scope)

    return identifier_scope, method_calls_scope


In [6]:
# Java keywords plus "String" and the two placeholder names that the
# tokeniser substitutes for literals (STRING_LITERAL, INTEGER_LITERAL).
keywordsList = """abstract continue for new switch assert default goto 
package synchronized boolean do if private this break double implements 
protected throw byte import public throws case enum	instanceof return 
transient catch extends int short try char final interface static void
class finally long strictfp volatile const float native super while String
STRING_LITERAL INTEGER_LITERAL
""".split()
# Keyword -> "TOKEN<position in keywordsList>".
# NOTE(review): not referenced by any code visible in this notebook.
mapping = { keywordsList[i] : "TOKEN" + str(i) for i in range(0, len(keywordsList) ) }
# Keyword -> itself; getAllTokens looks units up here and emits the value.
keywords = {keywordsList[i]: keywordsList[i]
            for i in range(0, len(keywordsList))}
# Units that getAllTokens skips entirely.
symbols = ["", "+", "-", "*", "/", " ", "{", "}", ";", ":", ".",
           "\t", "\n", ",", "(", ")", "[", "]", "=", ">", "<", " ", "!", "\\", "|", "&", "%", "^", "~", "`", "?"]
# Characters that are padded with spaces before a line is split into units.
delimiters = ["+", "-", "*", "/", "{", "}", ";", "\t",
              ":", ",", "(", ")", "[", "]", "=", ">", "<", "!", "\\", "|", "&", "%", '^', "~", "`", "?"]


In [7]:
#import GetFiles
import data_extraction
import CloneDetector
#import CloneSave

#import ml
# save char2vec with diff name and load clustering model pickle file
# allFilesData is list which have all files with specific extension
# Driver: collect the Java files, detect clones, report the cloning ratio.
print("Getting all file info from folder")
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
dirPath = "/Users/vivekgoud/Documents/GitHub/Test_project_Codeclonetracer/onlinebookstore-J2EE/"
allFilesData= data_extraction.getAllFilesUsingFolderPath(dirPath)
print("Extracting methods from files")

current_dataset,linesofcode,codeclonelines= data_extraction.extractMethodsAllFiles(allFilesData)

# With no source lines (empty or wrong folder) the ratio is reported as 0
# instead of failing with ZeroDivisionError.
cloningPercentage = (codeclonelines/linesofcode)*100 if linesofcode else 0.0
print(linesofcode,"total lines",codeclonelines,"total cloned lines", cloningPercentage , "cloning percentage")
print("Saving to CSV")
# CloneSave.writeToFile(codeBlocks)
#CloneSave.writeToCSV(codeBlocks)


#pip install python-Levenshtein

#pip install pydriller
#pip install fuzzywuzzy
#pip install pandas
#pip install javalang

Getting all file info from folder
Extracting methods from files


KeyboardInterrupt: 