In [162]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Model

In [23]:
!pip install javalang
!pip install python-Levenshtein

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
import pandas as pd
import re
import javalang
import numpy as np
import Levenshtein

In [150]:
class CodeAnalyzer:
    def __init__(self, code):
        self.code = code
        self.tree = javalang.parse.parse(self.code)
        self.method_name = []
        self.method_return_type = []
        self.loops = []
        self.controlStruct =[]
        self.nesting_depth = 0
        self.variable_name=[]
        self.variable_datatype=[]

    def analyze_method(self):
        for path,node in self.tree.filter(javalang.tree.MethodDeclaration):
          self.method_name.append(node.name)
          if(node.return_type):
            self.method_return_type.append(node.return_type.name)
          else:
            self.method_return_type.append(node.return_type)

    def analyze_variables(self):
        for path,node in self.tree.filter(javalang.tree.VariableDeclaration):
          self.variable_datatype.append(node.type.name)
          for variable_declarator in node.declarators:
            self.variable_name.append(variable_declarator.name)

    def analyze_loops(self):
        for path,node in self.tree:
          if isinstance(node, javalang.tree.ForStatement):
                  self.loops.append('For')
          if isinstance(node, javalang.tree.DoStatement):
                  self.loops.append('Do')
          if isinstance(node, javalang.tree.WhileStatement):
                  self.loops.append('While')

    def analyze_control_structures(self):
        for path,node in self.tree:
          if isinstance(node, javalang.tree.IfStatement):
                  self.controlStruct.append('If')
          if isinstance(node, javalang.tree.SwitchStatement):
                  self.controlStruct.append('Switch')

    def analyze_nesting_depth(self):
      pass

    def find_loc(self):
      #no multiline comments
      clean_code=re.sub(r'/\*(.*?)\*/', '', self.code, flags=re.DOTALL)
      #no tabs
      clean_code=re.sub(r'\t',' ',clean_code)
      #no empty lines
      clean_code = re.sub(r'\n(\s*)\n', '\n', clean_code)
      clean_code= re.sub(r'^\n+','',clean_code)
      #lines of code
      matches=re.findall('\n',clean_code)

      self.clean_code=clean_code
      self.loc=len(matches)

    def analyze_code(self):
        # Analyze the code and populate the variables accordingly
        self.analyze_method()
        self.number_of_methods = len(self.method_name)

        self.analyze_loops()
        self.no_of_loops = len(self.loops)

        self.analyze_control_structures()
        self.no_of_controlStruct = len(self.controlStruct)

        self.analyze_variables()
        self.no_of_variables=len(self.variable_name)

        self.analyze_nesting_depth()
        self.find_loc()





In [151]:
def edit_distance(list1,list2):
  str1=' '.join([str(elem) for elem in list1])
  str2=' '.join([str(elem) for elem in list2])

  return Levenshtein.distance(str1,str2)

In [152]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim(list1,list2):

  v1 = np.array([1 if item in list1 else 0 for item in list2])
  v2 = np.array([1 if item in list2 else 0 for item in list2])

  if len(v1)==0 or len(v2)==0:
    return 0

  similarity = cosine_similarity([v1], [v2])[0][0]

  return similarity

In [153]:
import glob
def file_access():

  files = glob.glob('/content/drive/MyDrive/ML Projects/SourceCode_PlagDetection/IR-Plag-Dataset/*', recursive = True)
  for file in files:
    original_path=glob.glob(file+'/original/*', recursive = True)
    og_file=open(original_path[0],'r')
    original_code = CodeAnalyzer(og_file.read())
    original_code.analyze_code()

    non_plag=glob.glob(file+'/non-plagiarized/*', recursive = True)
    for non_plag_folders in non_plag:
      non_plag_path=glob.glob(non_plag_folders+'/*',recursive=True)
      #non_plag_code stores required file location

      code_file=open(non_plag_path[0],'r')
      code = CodeAnalyzer(code_file.read())
      code.analyze_code()

      buildingDF(original_code,code,0) #0 implies non-plagiarized


    plagiarized=glob.glob(file+'/plagiarized/*', recursive = True)
    for plagiarized_folders in plagiarized:
      plagiarized_code_parent_folder=glob.glob(plagiarized_folders+'/*', recursive = True)
      for plagiarized_code_folder in plagiarized_code_parent_folder:
        plagiarized_path=glob.glob(plagiarized_code_folder+'/*', recursive = True)
        #plagiarized_code stores required file location

        code_file=open(plagiarized_path[0],'r')
        code = CodeAnalyzer(code_file.read())
        code.analyze_code()

        buildingDF(original_code,code,1) #1 implies plagiarized

    del code

In [154]:
def buildingDF(original_code,code,value):

  dloc.append(abs(original_code.loc-code.loc))
  dloops.append(abs(original_code.no_of_loops-code.no_of_loops))
  dmethods.append(abs(original_code.number_of_methods-code.number_of_methods))
  dcontrolStruct.append(abs(original_code.no_of_controlStruct-code.no_of_controlStruct))
  variable_name_sim.append(edit_distance(original_code.variable_name,code.variable_name))
  method_name_sim.append(edit_distance(original_code.method_name,code.method_name))
  variable_dtpye_sim.append(cosine_sim(original_code.variable_datatype,code.variable_datatype))
  method_rtype_sim.append(cosine_sim(original_code.method_return_type,code.method_return_type))
  classification.append(value)

  del code

In [155]:
import pandas as pd

df=pd.DataFrame(columns=['Diff in LOC','Diff in No. of Loops','Class'])

dloc=[]
dloops=[]
dcontrolStruct=[]
dmethods=[]
variable_name_sim=[]
variable_dtpye_sim=[]
method_name_sim=[]
method_rtype_sim=[]
classification=[]


file_access()


df['Diff in LOC']=dloc
df['Diff in No. of Loops']=dloops
df['Diff in No. of Control Structures']=dcontrolStruct
df['Diff in No. of Methods']=dmethods
df['Variable Name Sim']=variable_name_sim
df['Variable Dtype Sim']=variable_dtpye_sim
df['Method Name Sim']=method_name_sim
df['Method Rtype Sim']=method_rtype_sim
df['Class']=classification

Testing Model

In [157]:
import numpy as np

X=df.drop('Class',axis=1)
y=df['Class']
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =0.2,random_state=10)

In [158]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
clf = LogisticRegression(random_state=0)
clf = clf.fit(X_train,y_train)
y_pred2 = clf.predict(X_test)
print(accuracy_score(y_test,y_pred2))

0.8260869565217391


In [161]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt = dt.fit(X_train,y_train)
y_pred2 = dt.predict(X_test)
print(accuracy_score(y_test,y_pred2))

0.8913043478260869
