In [1]:
from py4j.java_gateway import JavaGateway, CallbackServerParameters

In [2]:
# Create the Py4J gateway used by every wrapper class below to reach the JVM.
# Passing CallbackServerParameters() also starts a Python-side callback server,
# which is what lets the JVM call back into Python objects (e.g. LambdaWrapper).
# NOTE(review): default connection parameters are used, so a Py4J gateway server
# must already be running in the JVM on the default port — confirm for your setup.
gateway = JavaGateway(callback_server_parameters=CallbackServerParameters())  

In [3]:
# LambdaWrapper: wraps a Python lambda expression (or any one-argument callable). On the Java side, Py4J presents the wrapper as a java.util.function.Function.
class LambdaWrapper(object):
    """Adapter that lets a Python callable be handed to Java as a ``Function``.

    The nested ``Java`` class lists the Java interface(s) this object stands in
    for when it crosses the Py4J bridge; ``apply`` is the single method the
    interface requires.
    """

    class Java:
        # Interface names Py4J reads when proxying this object into the JVM.
        implements = ['java.util.function.Function']

    def __init__(self, f):
        """Store ``f``, the one-argument callable that ``apply`` delegates to."""
        self._f = f

    def apply(self, arg):
        """Call the wrapped callable with ``arg`` and return whatever it returns."""
        wrapped = self._f
        return wrapped(arg)

In [4]:
class Pipeline(object):
    """Python-side handle for a pipeline object that lives in the JVM.

    Every operation is forwarded to the static Java wrapper
    ``CS239.crunch_in_python.Pipeline_`` together with an integer index that
    identifies the real pipeline on the Java side.

    NOTE(review): the index is generated here (``Pipeline.count``) and is never
    received from the JVM, so the Java wrapper is presumably numbering its
    pipelines in the same creation order — confirm against the Java code.
    """

    # Registry: Python wrapper -> index of the matching JVM-side pipeline.
    # The registry holds strong references, so wrappers are never collected.
    memPs = dict()
    # Index that will be assigned to the next successfully created wrapper.
    count = 0

    def __init__(self):
        """Create the JVM-side pipeline, then register this wrapper for it.

        Raises:
            Whatever the Py4J call raises when the JVM cannot be reached or the
            Java method fails. In that case nothing is registered.
        """
        self.gateway = gateway
        # Ask the JVM first: if this call fails we must not consume an index,
        # otherwise every later wrapper would point at the wrong JVM object.
        gateway.jvm.CS239.crunch_in_python.Pipeline_.getMemPipelineInstance_()
        Pipeline.memPs[self] = Pipeline.count
        Pipeline.count += 1

    def readTextFile(self, inputPath):
        """Ask the JVM-side pipeline to read ``inputPath`` as text.

        Args:
            inputPath: path string passed unchanged to ``Pipeline_.readTextFile_``.

        Returns:
            A new ``PCollection`` wrapper standing for the result.
        """
        index = Pipeline.memPs[self]
        gateway.jvm.CS239.crunch_in_python.Pipeline_.readTextFile_(index, inputPath)
        # The wrapper is created only after the JVM call succeeded.
        return PCollection()

    def writeTextFile(self, data, outputPath):
        """Ask the JVM-side pipeline to write ``data`` to ``outputPath`` as text.

        Args:
            data: a ``PTable`` or ``PCollection`` wrapper.
            outputPath: path string passed unchanged to ``Pipeline_.writeTextFile_``.

        Raises:
            TypeError: if ``data`` is neither a ``PTable`` nor a ``PCollection``
                (previously such a call was silently ignored).
        """
        index_pipeline = Pipeline.memPs[self]

        # PTable is tested first, matching the original dispatch order.
        if isinstance(data, PTable):
            index_data = PTable.pTables[data]
            kind = "PTable"
        elif isinstance(data, PCollection):
            index_data = PCollection.pCols[data]
            kind = "PCollection"
        else:
            raise TypeError(
                "writeTextFile expects a PTable or PCollection, got %s"
                % type(data).__name__
            )

        # The kind string tells the Java wrapper which registry index_data refers to.
        gateway.jvm.CS239.crunch_in_python.Pipeline_.writeTextFile_(
            index_pipeline, index_data, outputPath, kind
        )

    def done(self):
        """Forward ``done`` to the JVM-side pipeline identified by this wrapper."""
        index = Pipeline.memPs[self]
        gateway.jvm.CS239.crunch_in_python.Pipeline_.done_(index)

In [5]:
class PCollection(object):
    """Python-side handle for a collection object that lives in the JVM.

    Each method forwards to the static Java wrapper
    ``CS239.crunch_in_python.PCollection_`` with this wrapper's index, then
    returns a fresh Python wrapper standing for the result.
    """

    # Registry: Python wrapper -> index of the matching JVM-side collection.
    pCols = {}
    # Index handed to the next wrapper created.
    count = 0

    def __init__(self):
        """Register this wrapper under the next free index."""
        self.gateway = gateway
        slot = PCollection.count
        PCollection.pCols[self] = slot
        PCollection.count = slot + 1

    def _invoke(self, java_method, *extra_args):
        """Call ``PCollection_.<java_method>(index, *extra_args)`` in the JVM."""
        own_index = PCollection.pCols[self]
        java_side = gateway.jvm.CS239.crunch_in_python.PCollection_
        return getattr(java_side, java_method)(own_index, *extra_args)

    def parallelDo(self, lambda_exp):
        """Apply ``lambda_exp`` on the JVM side; returns a new ``PCollection``.

        The callable is wrapped in a ``LambdaWrapper`` before it is passed on.
        """
        self._invoke("parallelDo_", LambdaWrapper(lambda_exp))
        return PCollection()

    def aggregate_count(self):
        """Run the JVM-side ``count_`` operation; returns a new ``PTable``."""
        self._invoke("count_")
        return PTable()

    def max(self):
        """Run the JVM-side ``max_`` operation; returns a new ``PObject``."""
        self._invoke("max_")
        return PObject()

    def tokenize(self):
        """Run the JVM-side ``tokenize_`` operation; returns a new ``PCollection``."""
        self._invoke("tokenize_")
        return PCollection()

    def toDouble(self):
        """Run the JVM-side ``toDouble_`` operation; returns a new ``PCollection``."""
        self._invoke("toDouble_")
        return PCollection()

In [6]:
class PTable(object):
    """Python-side handle for a table object that lives in the JVM.

    The wrapper carries no operations of its own; it only records the index
    under which other code (e.g. ``Pipeline.writeTextFile``) can refer to it.
    """

    # Registry: Python wrapper -> index of the matching JVM-side table.
    pTables = {}
    # Index handed to the next wrapper created.
    count = 0

    def __init__(self):
        """Register this wrapper under the next free index."""
        self.gateway = gateway
        slot = PTable.count
        PTable.pTables[self] = slot
        PTable.count = slot + 1
    

In [7]:
class PObject(object):
    """Python-side handle for a single-value result that lives in the JVM.

    ``getValue`` forwards to the static Java wrapper
    ``CS239.crunch_in_python.PObject_`` with this wrapper's index.
    """

    # Registry: Python wrapper -> index of the matching JVM-side object.
    pObjects = dict()
    # Index handed to the next wrapper created.
    count = 0

    def __init__(self):
        """Register this wrapper under the next free index."""
        self.gateway = gateway
        # map the python-side PObject wrapper to an index
        PObject.pObjects[self] = PObject.count
        PObject.count += 1

    def getValue(self):
        """Fetch the value held by the JVM-side object and return it.

        Returns:
            Whatever ``PObject_.getValue_`` returns through Py4J.
        """
        index = PObject.pObjects[self]
        # The Java wrapper classes used elsewhere in this file all carry a
        # trailing underscore (Pipeline_, PCollection_); the original call
        # targeted "PObject" without it.
        # NOTE(review): confirm the Java class is indeed named PObject_.
        return gateway.jvm.CS239.crunch_in_python.PObject_.getValue_(index)

In [8]:
# test 1 (disabled): read a text file, tokenize it, count the tokens and write
# the resulting table. The code sits inside a bare string literal, so nothing
# below is executed; the notebook merely echoes the string as the cell's value.
# NOTE(review): the paths are absolute and machine-specific.
'''
pipeline = Pipeline()
inputPath = "C:\\Users\\Tim\\Documents\\cs_239_big_data_system\\crunch_in_python\\input.txt"
outputPath = "C:\\Users\\Tim\\Documents\\cs_239_big_data_system\\crunch_in_python\\output"
lines = pipeline.readTextFile(inputPath)
words = lines.tokenize()
counts = words.aggregate_count()
pipeline.writeTextFile(counts, outputPath)
pipeline.done()
gateway.close_callback_server()
'''


'\npipeline = Pipeline()\ninputPath = "C:\\Users\\Tim\\Documents\\cs_239_big_data_system\\crunch_in_python\\input.txt"\noutputPath = "C:\\Users\\Tim\\Documents\\cs_239_big_data_system\\crunch_in_python\\output"\nlines = pipeline.readTextFile(inputPath)\nwords = lines.tokenize()\ncounts = words.aggregate_count()\npipeline.writeTextFile(counts, outputPath)\npipeline.done()\ngateway.close_callback_server()\n'

In [9]:
# test 2: create a pipeline wrapper and point it at the input text file.
# Later cells reuse `pipeline`, `outputPath` and `lines`.
pipeline = Pipeline()
# NOTE(review): absolute, machine-specific Windows paths — adjust before running elsewhere.
inputPath = "C:\\Users\\Tim\\Documents\\cs_239_big_data_system\\crunch_in_python\\input.txt"
outputPath = "C:\\Users\\Tim\\Documents\\cs_239_big_data_system\\crunch_in_python\\output"
# `lines` is the PCollection wrapper returned for the file read.
lines = pipeline.readTextFile(inputPath)

In [10]:
# Chain two JVM-side operations (tokenize_, then toDouble_); each call returns a
# new PCollection wrapper, and `data` is the wrapper for the second result.
data = lines.tokenize().toDouble()

In [11]:
# Hand a doubling lambda to the JVM via parallelDo. The lambda is wrapped in a
# LambdaWrapper, so the JVM is expected to call back into Python to evaluate it
# (this relies on the callback server started with the gateway).
data2 = data.parallelDo(lambda d : 2*d)

In [12]:
# Request the JVM-side max_ operation on data2. `maxValue` is a PObject wrapper,
# not the number itself; the value would be fetched with maxValue.getValue().
maxValue = data2.max()

In [13]:
#print(maxValue.getValue())

In [14]:
# Ask the pipeline to write data2 (a PCollection wrapper) to outputPath as text.
pipeline.writeTextFile(data2, outputPath)

In [15]:
# Forward done() to the JVM-side pipeline (delegates to Pipeline_.done_).
pipeline.done()

In [16]:
# Stop the Py4J callback server that was started together with the gateway.
# NOTE(review): only the callback server is closed here; confirm whether the
# gateway connection itself should also be closed or shut down.
gateway.close_callback_server()