In [13]:
import os, sys, logging
from time import time
from itertools import cycle
from cassandra import ConsistencyLevel
from cassandra.concurrent import execute_concurrent
from cassandra.cluster import Cluster, BatchStatement
from cassandra.query import SimpleStatement


DDoTesting = True
DDebug = True
DReferenceHashTableName = "hash"
DReferenceContentTableName = "sequences"
DFileChunkSize = 1000
DKeySize = 11
DFutureCheck = False
DConcurrentRequest = True
DNumberConcurrentRequest = 200
read_chrs=0
read_data=''

class SparkBlast_CreateReferenceHash:

    def __new__(self):
        self.cluster = None
        self.session = None
        self.keyspace = None
        self.log = None
        self.future = None
        self.Futures = None
        self.ReferenceFilename = None
        self.ReferenceName = None
        self.File = None
        self.FileChunkSize = None
        self.KeySize = None
        self.NumberKeys = None
        self.StartTime = None

    # parameterized constructor 
    def __init__(self, referenceFilename, referenceName, keySize): 
        self.ReferenceFilename = referenceFilename
        self.ReferenceName = referenceName
        self.KeySize = keySize
        self.FileChunkSize = DFileChunkSize
        self.StartTime = time()
        self.NumberKeys = 0  
        self.Futures = []

    def __del__(self):
        self.session.shutdown()
        self.cluster.shutdown()

    def CreateCassandraSession(self):
        #self.cluster = Cluster(['192.168.1.1', '192.168.1.2', '192.168.1.3', '192.168.1.4', '192.168.1.5', '192.168.1.6'])
        self.cluster = Cluster(['192.168.1.1'])
        self.session = self.cluster.connect()
        #self.session = self.cluster.connect(self.keyspace)

    def GetCassandraSession(self):
        return self.session

    # How about Adding some log info to see what went wrong
    def SetLogger(self):
        log = logging.getLogger()
        log.setLevel('INFO')
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s"))
        log.addHandler(handler)
        self.log = log

    def CreateHashTable(self):
        self.session.execute("CREATE KEYSPACE IF NOT EXISTS "+ self.ReferenceName +" WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1 };")
        #self.session.execute("DROP TABLE IF EXISTS "+ self.ReferenceName +"."+ DReferenceHashTableName +";")
        #self.session.execute("CREATE TABLE "+ self.ReferenceName + "."+ DReferenceHashTableName +" (blockid int, offset int, size int, value text, PRIMARY KEY(blockID));")
        #if (DDebug):
        #    self.log.info(self.ReferenceName + "."+ DReferenceContentTableName +" Table Created !!!")
        self.session.execute("DROP TABLE IF EXISTS "+ self.ReferenceName +"."+ DReferenceHashTableName +";")
        self.session.execute("CREATE TABLE "+ self.ReferenceName + "."+ DReferenceHashTableName +" (seq text, value list<text>, PRIMARY KEY(seq));")
        if (DDebug):
            self.log.info(self.ReferenceName + "."+ DReferenceHashTableName +" Table Created !!!")


    def ReadFileChunk(self):
        """Lazy function (generator) to read a file piece by piece.
        Default chunk size: 1k."""
        while True:
            data = self.File.read(self.FileChunkSize)
            if not data:
                break
            yield data.replace('\n','').upper()
               

    def WriteReferenceHashes(self):
        self.File = open(self.ReferenceFilename, 'rt')

        # Check for header line
        header = self.File.readline()        
        if (header and header[0]!='>'):
            self.File.seek(0)

        offset = 0
        rest_prev_chunk = ''
        self.InitStatement()
        for chunk in self.ReadFileChunk():
            chunk_data = rest_prev_chunk+chunk
            if (DConcurrentRequest):
                self.InsertChunkHashesConcurrent(chunk_data, offset)
            else:
                self.InsertChunkHashes(chunk_data, offset)
            offset = offset + len(chunk_data) - self.KeySize
            rest_prev_chunk = chunk[-self.KeySize:]
            #print(offset),
            
        self.File.close()


    def InsertChunkHashes(self, chunk_data, offset):
        for key in range(0,len(chunk_data)-self.KeySize):
            self.InsertHashRow(chunk_data[key:key+self.KeySize],offset+key)
            
        
    def InitStatement(self):
        #self.prepared_sql = self.session.prepare("UPDATE "+ self.ReferenceName + "."+ DReferenceHashTableName +" set value = value + [%s] where seq=%s")
        #self.prepared_sql = self.session.prepare('UPDATE example.hash set value = value + [?] where seq=?')
        #session.execute("update blast.sequences set value = value + [%s] where seq =%s;", (listText,key))
        self.future = None

        
    def InsertHashRow(self, key, offset):
        if (DFutureCheck):
            if self.future:
                try:
                    results = self.future.result()
                    self.future = None
                except Exception:
                    log.exeception()
                    print("InsertHashRow::Error Checing asincronous insert")
        elif (len(self.Futures)>100):
            for query in self.Futures:
                try:
                    results = query.result()
                except Exception:
                    log.exeception()
                    print("InsertHashRow::Error Checing asincronous insert")            
            self.Futures=[]
        
        #print("Insert row {} {}".format(key,offset))
        #self.future = self.session.execute_async(self.prepared_sql.bind(offset, key))
        future = self.session.execute_async("UPDATE "+ self.ReferenceName + "."+ DReferenceHashTableName + " set value = value + [%s] where seq=%s",[str(offset), key])      
        if DFutureCheck:
            self.future = future
        else:
            self.Futures.append(future)
        if (DDebug):
            print("Insert row {} {}".format(key,offset))
        self.NumberKeys += 1 
        if (self.NumberKeys%1000)==0:
            print("Processed {} Keys in {} seconds.".format(self.NumberKeys,round(time() - self.StartTime,3)))


    def InsertChunkHashesConcurrent(self, chunk_data, offset):
        parameters = [ (chunk_data[key:key+self.KeySize], str(offset+key)) for key in range(0,len(chunk_data)-self.KeySize) ]
        self.InsertHashRowConcurrent(parameters)


    def InsertHashRowConcurrent(self, query_parameters):                       
        update_tatement = SimpleStatement( "UPDATE "+ self.ReferenceName + "."+ DReferenceHashTableName + " set value = value + [%s] where seq=%s",
                                            consistency_level=ConsistencyLevel.QUORUM)
        statements = cycle((update_tatement, ))

        results = execute_concurrent(self.session, list(zip(statements, query_parameters)), concurrency=DNumberConcurrentRequest,  raise_on_first_error=True)
        for (success, result) in results:
            if (not success):
                print("InsertHashRowConcurrent::Error in concurrent insert")

        if (DDebug):
            for (key,offset) in query_parameters:
                print("Insert row {} {}".format(key,offset))

        self.NumberKeys += len(query_parameters)
        if (self.NumberKeys%1000)==0:
            print("Processed {} Keys in {} seconds.".format(self.NumberKeys,round(time() - self.StartTime,3)))


    def CreateReferenceHashTable(self):
        self.CreateCassandraSession()
        self.SetLogger()
        self.CreateHashTable()
        self.WriteReferenceHashes()
        print("Done")
        print("Created {} Keys in {} seconds.".format(self.NumberKeys,round(time() - self.StartTime,3)))
    
        
## Testing 

if (DDoTesting):    
       
    # Test 1a: Calculate Query's keys & desplazaments (with header line)
    print("Test 1a: Create reference Hash")
    referenceFilename = '../Datasets/References/Example.txt'
    referenceName = str.lower("Example2")
    keySize = DKeySize
    obj = SparkBlast_CreateReferenceHash(referenceFilename, referenceName, keySize)
    obj.CreateReferenceHashTable()
    
    error

if __name__ == '__main__':

      ## Process parameters. (https://docs.python.org/2/library/argparse.html)
    ## SparkBlast_CreateReferenceHash <Reference_Files> [Key_size=11] [ReferenceName] [ContentBlockSize=1000]
    if (len(sys.argv)<2):
        print("Error parametes. Usage: SparkBlast_CreateReferenceHash <Reference_Files> [Key_size=11] [ReferenceName=ReferenceFileName].\n")
        sys.exit(1)

    referenceFilename = sys.argv[1]
    base = os.path.basename(referenceFilename).lower()
    referenceName, ext = os.path.splitext(base)    
    keySize = DKeySize
    if (len(sys.argv)>2):
        keySize = int(sys.argv[2])
    if (len(sys.argv)>3):
        referenceName = sys.argv[3]
    

    # Execute Main functionality
    print("{}({}, {}, {}).".format(sys.argv[0], referenceFilename, keySize, referenceName))
    
    obj = SparkBlast_CreateReferenceHash(referenceFilename, referenceName, keySize)
    obj.CreateReferenceHashTable()

2019-06-24 10:23:38,232 [INFO] cassandra.policies: Using datacenter 'dc1' for DCAwareRoundRobinPolicy (via host '192.168.1.1'); if incorrect, please specify a local_dc to the constructor, or limit contact points to local cluster nodes
2019-06-24 10:23:38,232 [INFO] cassandra.policies: Using datacenter 'dc1' for DCAwareRoundRobinPolicy (via host '192.168.1.1'); if incorrect, please specify a local_dc to the constructor, or limit contact points to local cluster nodes
2019-06-24 10:23:38,232 [INFO] cassandra.policies: Using datacenter 'dc1' for DCAwareRoundRobinPolicy (via host '192.168.1.1'); if incorrect, please specify a local_dc to the constructor, or limit contact points to local cluster nodes
2019-06-24 10:23:38,232 [INFO] cassandra.policies: Using datacenter 'dc1' for DCAwareRoundRobinPolicy (via host '192.168.1.1'); if incorrect, please specify a local_dc to the constructor, or limit contact points to local cluster nodes
2019-06-24 10:23:38,232 [INFO] cassandra.policies: Using dat

Test 1a: Create reference Hash


2019-06-24 10:23:42,311 [INFO] root: example2.hash Table Created !!!
2019-06-24 10:23:42,311 [INFO] root: example2.hash Table Created !!!
2019-06-24 10:23:42,311 [INFO] root: example2.hash Table Created !!!
2019-06-24 10:23:42,311 [INFO] root: example2.hash Table Created !!!
2019-06-24 10:23:42,311 [INFO] root: example2.hash Table Created !!!
2019-06-24 10:23:42,311 [INFO] root: example2.hash Table Created !!!
2019-06-24 10:23:42,311 [INFO] root: example2.hash Table Created !!!
2019-06-24 10:23:42,311 [INFO] root: example2.hash Table Created !!!
2019-06-24 10:23:42,311 [INFO] root: example2.hash Table Created !!!


Insert row 12345678901 0
Insert row 23456789012 1
Insert row 34567890123 2
Insert row 45678901234 3
Insert row 56789012345 4
Insert row 67890123456 5
Insert row 78901234567 6
Insert row 89012345678 7
Insert row 90123456789 8
Insert row 01234567890 9
Insert row 12345678901 10
Insert row 23456789012 11
Insert row 34567890123 12
Insert row 45678901234 13
Insert row 56789012345 14
Insert row 67890123456 15
Insert row 78901234567 16
Insert row 89012345678 17
Insert row 90123456789 18
Insert row 01234567890 19
Insert row 1234567890A 20
Insert row 234567890AB 21
Insert row 34567890ABC 22
Insert row 4567890ABCD 23
Insert row 567890ABCDE 24
Insert row 67890ABCDEF 25
Insert row 7890ABCDEFG 26
Insert row 890ABCDEFGH 27
Insert row 90ABCDEFGHI 28
Insert row 0ABCDEFGHIJ 29
Insert row ABCDEFGHIJK 30
Insert row BCDEFGHIJKL 31
Insert row CDEFGHIJKLM 32
Insert row DEFGHIJKLMN 33
Insert row EFGHIJKLMNO 34
Insert row FGHIJKLMNOP 35
Insert row GHIJKLMNOPQ 36
Insert row HIJKLMNOPQR 37
Insert row IJKLMNOPQRS

NameError: name 'error' is not defined