In [50]:
# 2. import libraries
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import sys, os, shutil
import numpy as np
import time
import argparse
import _pickle as cpickle
import re
import json

In [51]:
path = '/home/hdita/Documents/SparkDev Files/tree2tree_data/tree2tree_data/CS-JS/BL/progs_train.json'


In [52]:
def preprocess_cs_AST(lines, start_idx):
	root_line = lines[start_idx]
	root_line = root_line[:-1]
	current_tree = {}
	start_pos = 0
	while not root_line[start_pos].isalpha():
		start_pos += 1
	if ':' in root_line:
		split_point = root_line.rfind(':')
		current_tree['root'] = root_line[start_pos:split_point]
		current_tree['children'] = [{'root': root_line[split_point+2:], 'children': []}]
		return current_tree, start_idx + 1
	current_tree['root'] = root_line[start_pos:]
	current_tree['children'] = []
	tot_lines = len(lines)
	current_idx = start_idx + 1
	while current_idx < tot_lines:
		current_line = lines[current_idx]
		current_start_pos = 0
		while not current_line[current_start_pos].isalpha():
			current_start_pos += 1
		if current_start_pos > start_pos:
			child_tree, next_idx = preprocess_cs_AST(lines, current_idx)
			current_tree['children'].append(child_tree)
			current_idx = next_idx
		else:
			break
	return current_tree, current_idx	

def find_js_AST_root(raw_tree):
	if raw_tree["type"] == 'Program':
		return find_js_AST_root(raw_tree['body'][0])
	if raw_tree["type"] == 'BlockStatement':
		return raw_tree
	if raw_tree["type"] == 'ExpressionStatement':
		return find_js_AST_root(raw_tree["expression"])
	if raw_tree["type"] == 'CallExpression':
		return find_js_AST_root(raw_tree["callee"])
	if raw_tree["type"] == 'MemberExpression':
		return find_js_AST_root(raw_tree['object'])
	if raw_tree["type"] == 'FunctionExpression':
		return find_js_AST_root(raw_tree['body'])

def preprocess_js_AST(raw_tree):
	current_tree = {}
	if raw_tree is None:
		return current_tree
	if raw_tree["type"] == 'Program':
		current_tree['root'] = 'Program'
		current_tree['children'] = []
		for raw_child in raw_tree["body"]:
			child = preprocess_js_AST(raw_child)
			if len(child) > 0:
				current_tree["children"].append(child)
		return current_tree
	if raw_tree['type'] == 'ExpressionStatement':
		current_tree['root'] = 'ExpressionStatement'
		current_tree['children'] = [preprocess_js_AST(raw_tree['expression'])]
		return current_tree
	if raw_tree['type'] == 'CallExpression':
		current_tree['root'] = 'CallExpression'
		current_tree['children'] = [preprocess_js_AST(raw_tree['callee'])]
		return current_tree
	if raw_tree['type'] == 'MemberExpression':
		current_tree['root'] = 'MemberExpression'
		current_tree['children'] = [preprocess_js_AST(raw_tree['object'])]
		return current_tree
	if raw_tree["type"] == 'FunctionExpression':
		current_tree['root'] = 'FunctionExpression'
		current_tree['children'] = [preprocess_js_AST(raw_tree['body'])]
		return current_tree
	if raw_tree["type"] == 'ArrayExpression':
		current_tree['root'] = 'ArrayExpression'
		current_tree['children'] = []
		for raw_child in raw_tree["elements"]:
			child = preprocess_js_AST(raw_child)
			if len(child) > 0:
				current_tree["children"].append(child)
		return current_tree	
	if raw_tree["type"] == 'ReturnStatement':
		current_tree['root'] = 'ReturnStatement'
		current_tree['children'] = [preprocess_js_AST(raw_tree['argument'])]
		return current_tree	
	if raw_tree["type"] == 'VariableDeclaration':
		current_tree['root'] = 'VariableDeclaration'
		current_tree['children'] = []
		for raw_child in raw_tree['declarations']:
			child = preprocess_js_AST(raw_child)
			if len(child) > 0:
				current_tree["children"].append(child)
		return current_tree
	if raw_tree["type"] == 'VariableDeclarator':
		current_tree['root'] = 'VariableDeclarator'
		current_tree['children'] = [preprocess_js_AST(raw_tree['id'])]
		return current_tree
	if raw_tree["type"] == 'BlockStatement':
		current_tree["root"] = 'BlockStatement'
		current_tree["children"] = []
		for raw_child in raw_tree["body"]:
			child = preprocess_js_AST(raw_child)
			if len(child) > 0:
				current_tree["children"].append(child)
		return current_tree
	if raw_tree["type"] == 'IfStatement':
		current_tree["root"] = 'IfStatement'
		current_tree["children"] = []
		current_tree["children"].append(preprocess_js_AST(raw_tree['test']))
		current_tree["children"].append(preprocess_js_AST(raw_tree['consequent']))
		child = preprocess_js_AST(raw_tree['alternate'])
		if len(child) > 0:
			current_tree['children'].append(child)
		return current_tree
	if raw_tree["type"] == 'WhileStatement':
		current_tree['root'] = 'WhileStatement'
		current_tree['children'] = []
		current_tree["children"].append(preprocess_js_AST(raw_tree['test']))
		current_tree["children"].append(preprocess_js_AST(raw_tree['body']))
		return current_tree
	if 'operator' in raw_tree:
		current_tree['root'] = raw_tree['operator']
		current_tree['children'] = []
		current_tree['children'].append(preprocess_js_AST(raw_tree['left']))
		current_tree['children'].append(preprocess_js_AST(raw_tree['right']))
		return current_tree
	if raw_tree['type'] == 'Identifier':
		current_tree['root'] = 'Identifier'
		current_tree['children'] = [{'root': raw_tree['name'], 'children': []}]
		return current_tree
	if raw_tree['type'] == 'Literal':
		current_tree['root'] = 'Literal'
		current_tree['children'] = [{'root': str(raw_tree['value']), 'children': []}]
		return current_tree
	else:
		print(raw_tree)
		quit()


def preprocess_AST(init_tree, lang):
	if lang == 'coffee':
		current_tree, _ = preprocess_cs_AST(init_tree, 0)
	if lang == 'js':
		init_tree = json.load(init_tree)
		init_tree = find_js_AST_root(init_tree)
		current_tree = preprocess_js_AST(init_tree)




In [53]:
with open(path, 'r') as f:
    preprocess_AST(f, 'coffee')

TypeError: '_io.TextIOWrapper' object is not subscriptable