pipeline work
ukclivecox committed Aug 8, 2015
1 parent 525de68 commit 9392281
Showing 4 changed files with 149 additions and 68 deletions.
external/predictor/python/seldon/fileutil.py (125 changes: 91 additions, 34 deletions)
@@ -12,73 +12,58 @@

 class FileUtil:
 
+    def __init__(self, key = None, secret = None):
+        self.key = key
+        self.secret = secret
+
     def stream_decompress(self,stream):
         dec = zlib.decompressobj(16+zlib.MAX_WBITS) # same as gzip module
         for chunk in stream:
             rv = dec.decompress(chunk)
             if rv:
                 yield rv
 
-    def stream_text(self,k,cl):
+    def stream_text(self,k,fn):
         unfinished = ""
         for data in k:
             data = unfinished + data
             lines = data.split("\n");
             unfinished = lines.pop()
             for line in lines:
-                cl.process(line)
+                fn(line)
 
-    def stream_gzip(self,k,cl):
+    def stream_gzip(self,k,fn):
         unfinished = ""
         for data in self.stream_decompress(k):
             data = unfinished + data
             lines = data.split("\n");
             unfinished = lines.pop()
             for line in lines:
-                cl.process(line)
+                fn(line)
 
-'''
-Local File Stream
-'''
-class LocalFileUtil(FileUtil):
-
     def getFolders(self,baseFolder,startDay,numDays):
         folders = []
         for day in range(startDay-numDays+1,startDay+1):
             folders.append(baseFolder+str(day)+"/*")
         return folders
 
-    def stream(self,folders,cl):
+    def stream_local(self,folders,fn):
         for folder in folders:
             for f in glob.glob(folder):
                 k = open(f,"r")
                 if f.endswith(".gz"):
-                    self.stream_gzip(k,cl)
+                    self.stream_gzip(k,fn)
                 else:
-                    self.stream_text(k,cl)
+                    self.stream_text(k,fn)
 
-    def copy(self,fromPath,toPath):
+    def copy_local(self,fromPath,toPath):
         print "copy ",fromPath,"to",toPath
         dir = os.path.dirname(toPath)
-        if not os.path.exists(dir):
+        if len(dir) > 0 and not os.path.exists(dir):
             os.makedirs(dir)
         copyfile(fromPath,toPath)
 
-'''
-AWS S3 File Stream
-'''
-class S3FileUtil(FileUtil):
-
-    def __init__(self, key = None, secret = None):
-        self.key = key
-        self.secret = secret
-        if key:
-            self.conn = boto.connect_s3(key,secret)
-        else:
-            self.conn = boto.connect_s3()
-
     def getGlob(self,startDay,numDays):
         g = "{" + str(startDay)
@@ -87,16 +72,24 @@ def getGlob(self,startDay,numDays):
g += "}"
return g

def stream(self,bucket,prefix,cl):
def stream_s3(self,bucket,prefix,fn):
if self.key:
self.conn = boto.connect_s3(self.key,self.secret)
else:
self.conn = boto.connect_s3()
b = self.conn.get_bucket(bucket)
for k in b.list(prefix=prefix):
print k.name
if k.name.endswith(".gz"):
self.stream_gzip(k,cl)
self.stream_gzip(k,fn)
else:
self.stream_text(k,cl)
self.stream_text(k,fn)

def copy(self,fromPath,bucket,path):
def copy_s3(self,fromPath,bucket,path):
if self.key:
self.conn = boto.connect_s3(self.key,self.secret)
else:
self.conn = boto.connect_s3()
print fromPath, bucket, path
b = self.conn.get_bucket(bucket)
source_size = os.stat(fromPath).st_size
@@ -115,5 +108,69 @@ def copy(self,fromPath,bucket,path):
         # Finish the upload
         print "completing transfer to s3"
         mp.complete_upload()
         # k = b.new_key(path)
         # k.set_contents_from_filename(fromPath)
+
+    def download_s3(self,bucket,s3path,localPath):
+        if self.key:
+            self.conn = boto.connect_s3(self.key,self.secret)
+        else:
+            self.conn = boto.connect_s3()
+        print bucket, s3path, localPath
+        b = self.conn.get_bucket(bucket)
+        key = b.get_key(s3path)
+        key.get_contents_to_filename(localPath)
+
+    def stream(self,inputPath,fn):
+        if inputPath.startswith("s3n://"):
+            isS3 = True
+            inputPath = inputPath[6:]
+        elif inputPath.startswith("s3://"):
+            isS3 = True
+            inputPath = inputPath[5:]
+        else:
+            isS3 = False
+        if isS3:
+            print "AWS S3 input path ",inputPath
+            parts = inputPath.split('/')
+            bucket = parts[0]
+            prefix = inputPath[len(bucket)+1:]
+            self.stream_s3(bucket,prefix,fn)
+        else:
+            folders = [inputPath+"*"]
+            print "local input folders: ",folders
+            self.stream_local(folders,fn)
+
+    def upload(self,path,outputPath):
+        if outputPath.startswith("s3n://"):
+            noSchemePath = outputPath[6:]
+            isS3 = True
+        elif outputPath.startswith("s3://"):
+            noSchemePath = outputPath[5:]
+            isS3 = True
+        else:
+            isS3 = False
+        if isS3:
+            parts = noSchemePath.split('/')
+            bucket = parts[0]
+            opath = noSchemePath[len(bucket)+1:]
+            self.copy_s3(path,bucket,opath)
+        else:
+            self.copy_local(path,outputPath)
+
+    def download(self,fromPath,toPath):
+        if fromPath.startswith("s3n://"):
+            isS3 = True
+            fromPath = fromPath[6:]
+        elif fromPath.startswith("s3://"):
+            isS3 = True
+            fromPath = fromPath[5:]
+        else:
+            isS3 = False
+        if isS3:
+            print "AWS S3 input path ",fromPath
+            parts = fromPath.split('/')
+            bucket = parts[0]
+            prefix = fromPath[len(bucket)+1:]
+            self.download_s3(bucket,prefix,toPath)
+        else:
+            self.copy_local(fromPath,toPath)
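
The refactor above merges the old LocalFileUtil and S3FileUtil into a single FileUtil whose stream, upload and download methods dispatch on an "s3://" or "s3n://" scheme prefix. A minimal usage sketch of the new interface, assuming a hypothetical bucket and paths:

    from seldon.fileutil import FileUtil

    lines = []
    fu = FileUtil()  # or FileUtil(key,secret) to pass AWS credentials explicitly
    fu.stream("/tmp/features/20150808/",lines.append)            # local folder, plain or .gz files
    fu.stream("s3://my-bucket/features/20150808/",lines.append)  # same call against S3
    fu.upload("./model","s3n://my-bucket/models/vw/model")       # local file to S3 multipart upload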

external/predictor/python/seldon/pipeline/pipelines.py (46 changes: 46 additions, 0 deletions)
@@ -0,0 +1,46 @@
+import json
+import seldon.fileutil as fu
+
+class Feature_transform(object):
+
+    def upload(self,fromPath,toPath):
+        fu.FileUtil().upload(fromPath,toPath)
+
+    def download(self,fromPath,toPath):
+        fu.FileUtil().download(fromPath,toPath)
+
+    def save(self,folder):
+        print "no model to save"
+
+    def load(self,folder):
+        print "no model to load"
+
+
+class Pipeline(object):
+
+    def __init__(self,models_folder="./models"):
+        self.pipeline = []
+        self.models_folder = models_folder
+        self.objs = []
+
+    def add(self,feature_transform):
+        self.pipeline.append(feature_transform)
+
+    def process(self,line):
+        j = json.loads(line)
+        self.objs.append(j)
+
+    def getFeatures(self,location):
+        fu.FileUtil().stream(location,self.process)
+
+    def transform(self,featuresLocation):
+        self.getFeatures(featuresLocation)
+        for ft in self.pipeline:
+            ft.load(self.models_folder)
+            self.objs = ft.transform(self.objs)
+        return self.objs
+
+    def fit_transform(self,featuresLocation):
+        self.getFeatures(featuresLocation)
+        for ft in self.pipeline:
+            ft.fit(self.objs)
+            self.objs = ft.transform(self.objs)
+            ft.save(self.models_folder)
+        return self.objs
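
A sketch of how the new Pipeline and Feature_transform are presumably meant to be combined; the Lowercase transform and the input path are hypothetical:

    from seldon.pipeline.pipelines import Pipeline,Feature_transform

    class Lowercase(Feature_transform):
        def fit(self,objs):
            pass  # stateless: nothing to learn, so nothing to save or load
        def transform(self,objs):
            return [dict((k,v.lower() if isinstance(v,basestring) else v)
                         for k,v in o.items()) for o in objs]

    p = Pipeline(models_folder="./models")
    p.add(Lowercase())
    objs = p.fit_transform("/tmp/features/20150808/")  # stream JSON lines, fit, transform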
external/predictor/python/seldon/vw.py (40 changes: 9 additions, 31 deletions)
@@ -115,49 +115,27 @@ def train(self,client,conf):
         self.create_vw(conf)
         self.features = conf.get('features',{})
         self.fns = conf.get('namespaces',{})
         #stream data into vw
         inputPath = conf["inputPath"] + "/" + client + "/features/" + str(conf['day']) + "/"
         print "inputPath->",inputPath
-        if inputPath.startswith("s3n://"):
-            isS3 = True
-            inputPath = inputPath[6:]
-        elif inputPath.startswith("s3://"):
-            isS3 = True
-            inputPath = inputPath[5:]
-        else:
-            isS3 = False
-        if isS3:
-            fileUtil = S3FileUtil(self.awsKey,self.awsSecret)
-            print "AWS S3 input path ",inputPath
-            parts = inputPath.split('/')
-            bucket = parts[0]
-            prefix = inputPath[len(bucket)+1:]
-            fileUtil.stream(bucket,prefix,self)
-        else:
-            fileUtil = LocalFileUtil()
-            folders = [inputPath+"*"]
-            print "local input folders: ",folders
-            fileUtil.stream(folders,self)
+        fileUtil = FileUtil(key=self.awsKey,secret=self.awsSecret)
+        fileUtil.stream(inputPath,self.process)
         # save vw model
         self.vw2.save_model("./model")
         self.vw2.close()
         print "lines processed ",self.numLinesProcessed
-        # push model to output path on s3 or local
+        # copy models to final location
         outputPath = conf["outputPath"] + "/" + client + "/vw/" + str(conf["day"])
         print "outputPath->",outputPath
-        if outputPath.startswith("s3n://"):
-            isS3 = True
-        else:
-            isS3 = False
-        if isS3:
-            noSchemePath = outputPath[6:]
-            parts = noSchemePath.split('/')
-            bucket = parts[0]
-            path = noSchemePath[len(bucket)+1:]
-            fileUtil = S3FileUtil(self.awsKey,self.awsSecret)
-            fileUtil.copy("./model",bucket,path+"/model")
-            fileUtil.copy("./model.readable",bucket,path+"/model.readable")
-        else:
-            fileUtil = LocalFileUtil()
-            fileUtil.copy("./model",outputPath+"/model")
-            fileUtil.copy("./model.readable",outputPath+"/model.readable")
+        fileUtil.upload("./model",outputPath+"/model")
+        fileUtil.upload("./model.readable",outputPath+"/model.readable")
 
         if "activate" in conf and conf["activate"]:
             self.activateModel(client,str(outputPath))
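
The vw.py simplification leans on the calling-convention change made in fileutil.py: stream used to take an object and invoke cl.process(line) on it, whereas it now takes any callable and invokes fn(line), so the bound method self.process can be passed directly and the duplicated S3-versus-local branching collapses into FileUtil.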
scripts/zookeeper/set-client-config.py (6 changes: 3 additions, 3 deletions)
@@ -35,11 +35,11 @@ def activateModel(args,folder,zk):

 for line in sys.stdin:
     line = line.rstrip()
-    parts = line.split('\t')
-    if len(parts) == 3 and not line.startswith("#"):
+    parts = line.split()
+    if not line.startswith("#"):
         clients = parts[0].split(',')
         node = parts[1]
-        value = parts[2]
+        value = " ".join(parts[2:])
         print "--------------------------"
         print parts[0],node,"->",value
         for client in clients:
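
With the looser parsing, a config line is now any whitespace-separated sequence of clients, node and value, and everything after the node is rejoined, so values may contain spaces. A hypothetical stdin line:

    client1,client2 /config/vw {"activate": true, "day": 20150808}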