In [1]:
# -*- coding: utf-8 -*-
import gzip
import shutil
import pandas as pd
import sys
import io
import json
import re

In [4]:
# Location of the gzip-compressed CSV of raw Python files. This is a
# machine-specific absolute path; change it here when running elsewhere.
RAW_CSV_PATH = '/Users/yurunsong/Desktop/Data/rawCode/python-000000000000.csv.gz'

# Use a context manager so the gzip handle is closed once pandas has read
# the whole table (the original left the file open for the kernel's lifetime).
# `file` stays bound afterwards, as a closed handle.
with gzip.open(RAW_CSV_PATH) as file:
    pf = pd.read_csv(file, encoding="utf-8")

In [26]:
# Take the "content" value stored under row label 6 as the working sample
# and print it; every later cell operates on this single `content` string.
content = pf["content"][6]
print(content)

import unittest

from thunder import ThunderContext
from thunder import SourceExtraction

from test_utils import PySparkTestCase

_have_sima = False
try:
    import sima
except ImportError:
    pass
else:
    _have_sima = True


class TestBlockMethod(PySparkTestCase):

    def test_nmf(self):
        """
        (BlockMethod) nmf with defaults
        """
        tsc = ThunderContext(self.sc)
        data = tsc.makeExample('sources', dims=(60, 60), centers=[[20, 20], [40, 40]], noise=0.1, seed=42)

        model = SourceExtraction('nmf', componentsPerBlock=1, maxArea=500).fit(data, size=(30, 30))

        assert(model.count == 2)

        # order is irrelevant, but one of these must be true
        ep = 1.0
        cond1 = (model[0].distance([20, 20]) < ep) and (model[1].distance([40, 40]) < ep)
        cond2 = (model[0].distance([40, 40]) < ep) and (model[1].distance([20, 20]) < ep)
        assert(cond1 or cond2)

    @unittest.skipIf(not _have_sima, "SIMA not installed or not functio

## Python structure

In [7]:
class pythonNode:
    """Tree node used to generate a tree structure for Python code.

    Attributes:
        start:  offset where this node's text begins in the source string.
        end:    offset where this node's text ends in the source string.
        name:   label for the node (e.g. "root", or a leading keyword).
        parent: the node this one hangs under, or None for the root.
        indent: indentation level associated with the node.
        child:  list of child nodes, empty on construction.
    """

    def __init__(self, start, end, name, parent, indent):
        # Span of the node inside the source text.
        self.start, self.end = start, end
        # Label and position in the tree.
        self.name = name
        self.parent = parent
        self.indent = indent
        # Each node gets its own fresh list of children.
        self.child = list()

In [8]:
# Leading keywords used to label a node with the statement kind it starts with.
startwith = ["from", "import", "class", "def", "for", "while", "if", "else", "elif"]


def restruct(content, start, end, parent, indent):
    """Split ``content[start:end]`` into child nodes of ``parent`` by indentation.

    Boundaries are the positions in the slice where a newline is followed by
    ``indent`` whitespace characters and then one of the characters accepted
    by the character class below.

    Args:
        content: full source text being structured.
        start:   offset in ``content`` where the examined slice begins.
        end:     offset in ``content`` where the examined slice ends.
        parent:  ``pythonNode`` that receives the children found in the slice.
        indent:  number of whitespace characters expected after a newline.

    Returns:
        A new, unattached ``pythonNode`` covering ``start``..``end`` when
        exactly one boundary is found; otherwise ``parent``, with one child
        appended per boundary (none when there is no boundary).
    """
    subContent = content[start: end]
    # Raw strings keep ``\s`` from being an invalid string escape. Inside the
    # character class, ``{3}`` is three literal characters, not a repeat count.
    pattern = r"\n\s{" + str(indent) + r"}[a-zA-Z@'{3}_#0-9]"
    idx = [(m.start(0), m.end(0)) for m in re.finditer(pattern, subContent)]

    # Label the slice with the keyword found two characters into it, if any.
    name = ""
    for ele in startwith:
        if subContent[2:].startswith(ele):
            name = ele

    if len(idx) == 1:
        # Fixed: the original passed (parent, name) in swapped order, so the
        # node's ``name`` held the parent object and ``parent`` held the label.
        return pythonNode(start, end, name, parent, indent)

    # NOTE(review): the first child spans up to the first boundary and the
    # next one starts after the second boundary, so the text between the first
    # and second boundary does not seem to get a child - confirm intent.
    for i in range(len(idx)):
        if i == 0:
            tmp = pythonNode(parent.start, start + idx[0][0], name, parent, indent)
            restruct(content, parent.start, start + idx[0][0], tmp, indent + 4)
        elif i == len(idx) - 1:
            tmp = pythonNode(start + idx[-1][0] + 1, parent.end, name, parent, indent)
            restruct(content, start + idx[-1][0] + 1, parent.end, tmp, indent + 4)
        else:
            tmp = pythonNode(start + idx[i][0] + 1, start + idx[i + 1][0], name, parent, indent)
            restruct(content, start + idx[i][0] + 1, start + idx[i + 1][0], tmp, indent + 4)
        parent.child.append(tmp)
    return parent

In [9]:
# Root node covering the whole sample text; it has no parent and indent -1.
root = pythonNode(0, len(content),"root", None, -1)

In [10]:
# Build the tree for the whole sample, starting at indentation 0; children are
# appended to `root` in place and the returned node is shown as cell output.
restruct(content,0, len(content), root, 0)

<__main__.pythonNode at 0x17dc1c910>

In [12]:
# print(len(root.child))
# print(content[root.child[-2].start: root.child[-2].end])

## Python Extract Methods

In [13]:
def searchMethods(content):
    """Locate every single-line ``def`` header in ``content`` and its body.

    A header is a line made of optional indentation, an optional ``async``,
    then ``def ...(...):``. The pattern is anchored to the start of a line so
    the match holds exactly the header's own indentation. The previous
    unanchored pattern could begin on trailing whitespace of an earlier line
    or in the middle of a line (e.g. inside a ``# def ...`` comment), which
    gave a wrong indent and a wrong start offset.

    Args:
        content: source text to scan.

    Returns:
        A list of ``(start, end)`` offsets into ``content``, one per header,
        where ``start`` is the beginning of the header line and ``end`` is the
        header's end plus the length reported by ``findNextIndent`` for the
        text that follows.
    """
    header_re = re.compile(r"^[ \t]*(?:async[ \t]+)?def\s.*\(.*\):", re.MULTILINE)
    methodRange = []
    for match in header_re.finditer(content):
        header_start, header_end = match.span()
        # Indentation is measured on the matched header text itself.
        indent = numOfIndent(match.group(0))
        # Length of the body, measured from just after the header's colon.
        position = findNextIndent(content[header_end:], indent)
        methodRange.append((header_start, header_end + position))
    return methodRange

In [14]:
def numOfIndent(line):
    """Return how many space characters ``line`` starts with.

    Only the space character counts; a leading tab or newline gives 0.
    Empty and all-space strings are handled (the previous index-based loop
    ran off the end of the string and raised ``IndexError`` for them).
    """
    return len(line) - len(line.lstrip(" "))

In [15]:
def findNextIndent(content, indent):
    '''
        Find where the block that follows a header ends.

        ``content`` is the text right after a header line's colon, so its
        first line (the rest of the header) is always skipped. The block ends
        at the first later non-blank line whose leading-space count is less
        than or equal to ``indent``. Blank and whitespace-only lines never end
        the block. The previous fixed-offset check read across line
        boundaries, so a blank line inside a top-level function cut it short.

        Returns the offset in ``content`` just past the last non-blank line of
        the block, or ``len(content)`` when ``content`` has no newline at all.
        Only space characters count as indentation.
    '''
    total = len(content)
    line_start = content.find("\n") + 1
    if line_start == 0:
        # No newline: nothing follows the header line.
        return total

    body_end = line_start
    while line_start < total:
        newline = content.find("\n", line_start)
        line_end = total if newline == -1 else newline + 1
        line = content[line_start:line_end]
        if line.strip():
            if len(line) - len(line.lstrip(" ")) <= indent:
                # First dedented line: the block stopped at the last body line.
                return body_end
            body_end = line_end
        line_start = line_end
    return body_end

In [16]:
def cleanEachUp(content):
    '''
        Dedent an extracted method and trim surrounding whitespace.

        The indentation of the first line is measured with ``numOfIndent`` and
        that many spaces are dropped after every newline; the result is then
        stripped of leading and trailing whitespace.
    '''
    lead = numOfIndent(content)
    dedented = content.replace("\n" + " " * lead, "\n")
    # A bare strip() already removes newlines at both ends.
    return dedented.strip()

In [21]:
# Collect description and docstring: helpers that strip or gather the
# comments and docstrings of an extracted method.
def cleanUpComments(content):
    """Return ``content`` without ``# `` comments and ``\"\"\"`` docstrings.

    Three passes are applied in order:
      1. every ``# ...`` comment (a ``#`` followed by whitespace, up to and
         including its newline) is replaced by a single newline, together
         with the whitespace in front of it;
      2. every triple-double-quoted block that opens at the start of a line
         (after optional indentation) is removed with its indentation. The
         body may span any number of lines; the previous pattern only matched
         a body of exactly one line and left the indentation behind;
      3. runs of consecutive newlines are collapsed into one.

    Triple-single-quoted strings and ``#`` characters inside string literals
    are not treated specially.
    """
    content = re.sub(r"\s*#\s.*\n", "\n", content)
    content = re.sub(
        r'^[ \t]*"{3}.*?"{3}[ \t]*$',
        "",
        content,
        flags=re.DOTALL | re.MULTILINE,
    )
    content = re.sub(r"\n{2,}", "\n", content)
    return content


def getDescription(content):
    """Collect the title line, comment lines and docstrings of ``content``.

    Returns:
        A ``(title, descriptions, docString)`` tuple where
        * ``title`` is the text before the first newline, or the whole string
          when there is no newline (the previous slice dropped the last
          character in that case);
        * ``descriptions`` lists every ``# ...`` comment, one entry per line
          including its trailing newline (the previous repeated capturing
          group kept only the last line of a run of adjacent comments);
        * ``docString`` lists every triple-double-quoted block, which may span
          several lines.
    """
    title = content.split("\n", 1)[0]
    descriptions = re.findall(r"#\s.*\n", content)
    docString = re.findall(r'"{3}.*?"{3}', content, flags=re.DOTALL)
    return title, descriptions, docString

In [22]:
# (start, end) offsets of every method found in the sample text.
out = searchMethods(content)

In [23]:
# Slice the text of each detected method out of the sample by its offsets.
methods = [content[begin:finish] for begin, finish in out]

In [24]:
# Print each extracted method, dedented, followed by a separator line.
for i in methods:
    print(cleanEachUp(i))
    print("=====================================================")

def test_nmf(self):
    """
    (BlockMethod) nmf with defaults
    """
    tsc = ThunderContext(self.sc)
    data = tsc.makeExample('sources', dims=(60, 60), centers=[[20, 20], [40, 40]], noise=0.1, seed=42)

    model = SourceExtraction('nmf', componentsPerBlock=1, maxArea=500).fit(data, size=(30, 30))

    assert(model.count == 2)

    # order is irrelevant, but one of these must be true
    ep = 1.0
    cond1 = (model[0].distance([20, 20]) < ep) and (model[1].distance([40, 40]) < ep)
    cond2 = (model[0].distance([40, 40]) < ep) and (model[1].distance([20, 20]) < ep)
    assert(cond1 or cond2)
def test_sima(self):
    """
    (BlockMethod) with SIMA strategy
    """
    # NOTE: this test was brittle and failed non-deterministically with any
    # more than one source
    import sima.segment

    # construct the SIMA strategy
    simaStrategy = sima.segment.STICA(components=1)
    simaStrategy.append(sima.segment.SparseROIsFromMasks(min_size=20))
    simaStrategy.append(sima.segmen

In [45]:
# Demo: for the second extracted method, show its title line and each
# collected "#" comment; then show the first method with comments and
# docstring stripped. Note that `out` is rebound here: it now holds the
# (title, descriptions, docString) tuple instead of the method offsets.
out = getDescription(cleanEachUp(methods[1]))
print(out[0])
print()
print("Descriptions")
for i in out[1]:
    # Wrapped in a list so the trailing newline is visible in the output.
    print([i])
print()
print("======================================")
print(cleanUpComments(cleanEachUp(methods[0])))

def test_sima(self):

Descriptions
['# NOTE: this test was brittle and failed non-deterministically with any\n']
['# more than one source\n']
['# construct the SIMA strategy\n']
['# create and fit the thunder extraction strategy\n']
['# check that the one center is recovered\n']

def test_nmf(self):
    
    tsc = ThunderContext(self.sc)
    data = tsc.makeExample('sources', dims=(60, 60), centers=[[20, 20], [40, 40]], noise=0.1, seed=42)
    model = SourceExtraction('nmf', componentsPerBlock=1, maxArea=500).fit(data, size=(30, 30))
    assert(model.count == 2)
    ep = 1.0
    cond1 = (model[0].distance([20, 20]) < ep) and (model[1].distance([40, 40]) < ep)
    cond2 = (model[0].distance([40, 40]) < ep) and (model[1].distance([20, 20]) < ep)
    assert(cond1 or cond2)
