
Add python and c/cpp tokenizers (#3)

* add python and c tokenizers (required packages for the tokenizers are not yet installed)

* add python and c tokenizers; add instructions for install and usage; some minor modifications to the c and python tokenizers

* progress on installing/integrating the c & python tokenizers

* finish integration
tushargr authored and bmcutler committed Jun 20, 2018
1 parent 8238f69 commit 00348500a1fbd01f6a14d54d399f6e8f73034e9b
@@ -52,12 +52,19 @@ def hasher(args,my_tokenized_file,my_hashes_file):
                 if args.plaintext:
                     for j in range(0,args.window):
                         foo+=str(tokens[i+j].get("value"))
+
                 elif args.python:
-                    print("NEED A PYTHON HASHER")
+                    for j in range(0,args.window):
+                        foo+=str(tokens[i+j].get("type"))
+
                 elif args.cpp:
-                    print("NEED A C++ HASHER")
+                    for j in range(0,args.window):
+                        foo+=str(tokens[i+j].get("type"))
+
                 else:
-                    print("UNKNOWN HASHER")
+                    print("\n\nERROR: UNKNOWN HASHER\n\n")
+                    exit(1)
+
                 hash_object = hashlib.md5(foo.encode())
                 hash_object_string=hash_object.hexdigest()
                 #FIXME: this truncation should be adjusted after more full-scale testing
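A minimal standalone sketch of the windowed hashing the new --python / --cpp branches perform; the token list and window size below are invented for illustration, while the real code loads the tokens from the tokenized JSON file and takes the window size from the --window argument:

    import hashlib

    # Hypothetical token stream, in the shape the tokenizers emit (see c_tokenizer.py below).
    tokens = [
        {"line": 1, "char": 1,  "type": "KEYWORD",     "value": "int"},
        {"line": 1, "char": 5,  "type": "IDENTIFIER",  "value": "x"},
        {"line": 1, "char": 7,  "type": "PUNCTUATION", "value": "="},
        {"line": 1, "char": 9,  "type": "LITERAL",     "value": "0"},
        {"line": 1, "char": 10, "type": "PUNCTUATION", "value": ";"},
    ]
    window = 3

    # For every window of tokens, concatenate the token *types* (not their values)
    # and hash the result, as the new --python / --cpp branches of hasher() do.
    for i in range(0, len(tokens) - window):
        foo = ""
        for j in range(0, window):
            foo += str(tokens[i + j].get("type"))
        print(hashlib.md5(foo.encode()).hexdigest())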
@@ -3,11 +3,13 @@
 semester=$1
 course=$2
 gradeable=$3
-window=$4
+language=$4
+window=$5


 /usr/local/submitty/Lichen/bin/concatenate_all.py $semester $course $gradeable
-/usr/local/submitty/Lichen/bin/tokenize_all.py $semester $course $gradeable --plaintext
-/usr/local/submitty/Lichen/bin/hash_all.py $semester $course $gradeable --window $window --plaintext
+/usr/local/submitty/Lichen/bin/tokenize_all.py $semester $course $gradeable --${language}
+/usr/local/submitty/Lichen/bin/hash_all.py $semester $course $gradeable --window $window --${language}

 /usr/local/submitty/Lichen/bin/compare_hashes.out $semester $course $gradeable --window $window
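The driver script now takes the tokenizer language as its fourth argument, before the window size: semester, course, gradeable, language, window. For example (all names here are placeholders), it would be invoked with arguments like s19 csci1200 hw01 python 10, and the chosen language is forwarded as the --${language} flag to tokenize_all.py and hash_all.py.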

@@ -34,14 +34,27 @@ def tokenize(args,my_concatenated_file,my_tokenized_file):
     if args.plaintext:
         tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","plaintext_tokenizer.out")
         with open(my_concatenated_file,'r') as infile:
-            with open (my_tokenized_file,'w')as outfile:
+            with open (my_tokenized_file,'w') as outfile:
                 subprocess.call([tokenizer,"--ignore_newlines"],stdin=infile,stdout=outfile)
+
     elif args.python:
-        print("NEED A PYTHON TOKENIZER")
+        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","python_tokenizer.py")
+        with open(my_concatenated_file,'r') as infile:
+            with open (my_tokenized_file,'w') as outfile:
+                command="python3 "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
+                os.system(command)
+
     elif args.cpp:
-        print("NEED A C++ TOKENIZER")
+        tokenizer = os.path.join(SUBMITTY_INSTALL_DIR,"Lichen","bin","c_tokenizer.py")
+        with open(my_concatenated_file,'r') as infile:
+            with open (my_tokenized_file,'w') as outfile:
+                command="python "+str(tokenizer)+" "+my_concatenated_file+" > "+my_tokenized_file
+                os.system(command)
+
     else:
-        print("UNKNOWN TOKENIZER")
+        print("\n\nERROR: UNKNOWN TOKENIZER\n\n")
+        exit(1)
+


 def main():
     args = parse_args()
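The new branches build a shell string and hand it to os.system, relying on output redirection (the opened file handles are not actually used). For comparison, a minimal sketch, not part of this commit, of invoking the new python_tokenizer.py the way the plaintext branch already invokes its tokenizer via subprocess.call; the file names below are placeholders:

    import subprocess

    # Sketch only: run the tokenizer script and let subprocess redirect its
    # JSON output to the open file handle, instead of building a shell string.
    tokenizer = "/usr/local/submitty/Lichen/bin/python_tokenizer.py"
    my_concatenated_file = "submission_concatenated.py"   # placeholder input path
    my_tokenized_file = "submission_tokens.json"          # placeholder output path

    with open(my_tokenized_file, "w") as outfile:
        subprocess.call(["python3", tokenizer, my_concatenated_file], stdout=outfile)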
@@ -27,10 +27,13 @@ fi


 ########################################################################################################################
-# compile & install the tokenizers
+# compile & install the tools

 mkdir -p ${lichen_installation_dir}/bin

+
+#--------------------
+# plaintext tool
 pushd ${lichen_repository_dir} > /dev/null
 clang++ -I ${nlohmann_dir}/include/ -std=c++11 -Wall tokenizer/plaintext/plaintext_tokenizer.cpp -o ${lichen_installation_dir}/bin/plaintext_tokenizer.out
 if [ $? -ne 0 ]; then
@@ -40,6 +43,7 @@ fi
 popd > /dev/null


+#-------------------------------------------
 # compile & install the hash comparison tool
 pushd ${lichen_repository_dir} > /dev/null
 clang++ -I ${nlohmann_dir}/include/ -lboost_system -lboost_filesystem -Wall -g -std=c++11 -Wall compare_hashes/compare_hashes.cpp -o ${lichen_installation_dir}/bin/compare_hashes.out
@@ -54,6 +58,9 @@ popd > /dev/null

 cp ${lichen_repository_dir}/bin/* ${lichen_installation_dir}/bin/

+cp ${lichen_repository_dir}/tokenizer/c/c_tokenizer.py ${lichen_installation_dir}/bin/c_tokenizer.py
+cp ${lichen_repository_dir}/tokenizer/python/python_tokenizer.py ${lichen_installation_dir}/bin/python_tokenizer.py
+

 ########################################################################################################################
 # fix permissions
@@ -0,0 +1,8 @@
+Installation Instructions:
+
+sudo apt-get install python-clang-3.8
+
+Usage:
+
+python c_tokenizer.py path/to/inputfile
+
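The python-clang-3.8 package installs the clang.cindex binding that c_tokenizer.py (below) imports. As a quick sanity check, not part of this commit, that the binding and the libclang shared library load correctly; the library path is the one hard-coded in c_tokenizer.py:

    import clang.cindex

    # Same libclang shared object path that c_tokenizer.py hard-codes for LLVM 3.8.
    clang.cindex.Config.set_library_file("/usr/lib/llvm-3.8/lib/libclang-3.8.so.1")

    # Index.create() fails loudly if the shared library cannot be loaded.
    index = clang.cindex.Index.create()
    print("libclang loaded successfully")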

@@ -0,0 +1,36 @@
+import clang.cindex
+import json
+import sys
+import shutil
+import tempfile
+import os
+
+
+# apparently, the file name must end in .cpp (or some standard
+# c/c++ suffix) to be successfully tokenized
+
+# make a temporary filename
+tmp_cpp_file_handle,tmp_cpp_file_name=tempfile.mkstemp(suffix=".cpp")
+# copy the concatenated file to the temporary file location
+shutil.copy(sys.argv[1],tmp_cpp_file_name)
+
+clang.cindex.Config.set_library_file("/usr/lib/llvm-3.8/lib/libclang-3.8.so.1")
+idx = clang.cindex.Index.create()
+
+# parse the input file
+parsed_data = idx.parse(tmp_cpp_file_name)
+
+# remove the temporary file
+os.remove(tmp_cpp_file_name)
+
+tokens = []
+
+for token in parsed_data.get_tokens(extent = parsed_data.cursor.extent):
+    tmp = dict()
+    tmp["line"]=int(token.location.line)
+    tmp["char"]=int(token.location.column)
+    tmp["type"]=(str(token.kind))[10:]
+    tmp["value"]=str(token.spelling)
+    tokens.append(tmp)
+
+print ( json.dumps(tokens, indent=4, sort_keys=True) )
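Each entry records the token's location, its clang TokenKind with the leading "TokenKind." prefix sliced off (that is what the [10:] does), and its spelling. Purely for illustration, running the script on a file containing just int x; should print something close to:

    [
        {
            "char": 1,
            "line": 1,
            "type": "KEYWORD",
            "value": "int"
        },
        {
            "char": 5,
            "line": 1,
            "type": "IDENTIFIER",
            "value": "x"
        },
        {
            "char": 6,
            "line": 1,
            "type": "PUNCTUATION",
            "value": ";"
        }
    ]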