In [1]:
import pandas as pd
try:
    import cPickle as pickle
except ImportError:  # python 3.x
    import pickle
import logging

In [2]:
# Notebook-wide logging: every helper below logs through the root logger.
logger = logging.getLogger()
# INFO is needed so the migration summary emitted by the migrator is visible.
logger.setLevel(logging.INFO)
# Timestamp, thread id and level prefix each record. basicConfig only installs
# a handler when the root logger has none yet.
logging.basicConfig(format="%(asctime)s - %(thread)s - %(levelname)s - %(message)s")

In [3]:
def readHistoricalData(path='./dummyProductStat.csv',
                       delim = ',', storeId='SAMPLEHOTEL1234'):
    """Load the historical product statistics of one store.

    Parameters
    ----------
    path : str
        CSV file holding the statistics.
    delim : str
        Field delimiter of the CSV file.
    storeId : str
        Store whose rows are kept; compared after stripping and upper-casing.

    Returns
    -------
    pandas.DataFrame
        Rows of the requested store (original row labels preserved) with the
        columns PRODUCTIDNO, VENDORSTOREIDNO, CUSTOMERCATEGORY (stripped,
        upper-cased text) and CUSTOMERCATEGORYVER, ALPHA, BETA, HIT, MISS
        (floats).

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the file.
    ValueError
        If a numeric column of the selected store cannot be cast to float.
    """
    textColumns = ['PRODUCTIDNO', 'VENDORSTOREIDNO', 'CUSTOMERCATEGORY']
    numericColumns = ['CUSTOMERCATEGORYVER', 'ALPHA', 'BETA', 'HIT', 'MISS']
    wantedColumns = ['PRODUCTIDNO', 'VENDORSTOREIDNO', 'CUSTOMERCATEGORY',
                     'CUSTOMERCATEGORYVER', 'ALPHA', 'BETA', 'HIT', 'MISS']

    # Read identifier columns as text: purely numeric ids would otherwise be
    # inferred as integers and break the string normalisation below.
    rawStat = pd.read_csv(path, delimiter=delim,
                          dtype={col: str for col in textColumns})
    productStat = rawStat[wantedColumns].copy()

    # The .str accessor is vectorised and leaves missing cells as NaN instead
    # of raising on them.
    for col in textColumns:
        productStat[col] = productStat[col].str.strip().str.upper()

    wantedStore = storeId.strip().upper()
    productStat = productStat[productStat['VENDORSTOREIDNO'] == wantedStore].copy()

    # Cast only the rows that survived the store filter, so a malformed number
    # belonging to another store does not abort the load.
    productStat = productStat.astype({col: float for col in numericColumns})

    return productStat

In [4]:
def readPicklefile(path):
    """Return the object stored in the pickle file at ``path``.

    A missing file is logged at ERROR level and reported to the caller as a
    ``FileNotFoundError`` carrying the same message.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    try:
        with open(path, 'rb') as pickledFile:
            return pickle.load(pickledFile)
    except FileNotFoundError:
        message = "Could not read the file at {}".format(path)
        logger.error(message)
        raise FileNotFoundError(message)
        

In [5]:
class ClusterTreeNode:
    """One node of the binary tree used to assign a record to a cluster.

    An internal node holds an attribute name, a deciding value and an
    operation code describing how the two are compared; a leaf (no children)
    holds the cluster id.

    Operation codes set by ``setValue``:
        1 -> ``<=`` / ``>``   (any other deciding value)
        2 -> ``not in`` / ``in``  (deciding value is a list or tuple)
        3 -> ``!=`` / ``==``  (attribute type is ``'Categorical'``)
    """

    def __init__(self):
        self.operationCode = None
        self.attribute = None
        self.decidingValue = None
        self.clusterID = -1
        self.left = None
        self.right = None
        self.parent = None
        self.fractionInherited=0.
        self.attributeType = None

    def setAttribute(self, attribute, attributeType):
        """Store the attribute tested by this node together with its type."""
        self.attribute = attribute
        self._SetAttributeType(attributeType)

    def _SetAttributeType(self, attributeType):
        self.attributeType = attributeType

    def getAttributeType(self):
        return self.attributeType

    def getAttribute(self):
        return self.attribute

    def setValue(self, value):
        """Store the deciding value and derive the operation code from it.

        The code depends on ``attributeType``, so ``setAttribute`` has to be
        called first for categorical attributes to get code 3.
        """
        self.decidingValue = value
        self._setOperationCode(value)

    def getDecidingValue(self):
        return self.decidingValue

    def _setOperationCode(self, value):
        if isinstance(value, (list, tuple)):
            self.operationCode = 2  # membership test: not in / in
        elif self.attributeType == 'Categorical':
            self.operationCode = 3  # equality test: != / ==
        else:
            self.operationCode = 1  # ordering test: <= / >

    def getOperationCode(self):
        return self.operationCode

    def isLeaf(self):
        """Return True when the node has neither a left nor a right child."""
        # Identity comparison: a child object with an overloaded __eq__ must
        # not be mistaken for a missing child.
        return self.left is None and self.right is None

    def setRight(self, node):
        self.right = node

    def getRight(self):
        return self.right

    def getLeft(self):
        return self.left

    def setLeft(self, node):
        self.left = node

    def setParent(self, node):
        self.parent = node

    def getParent(self):
        return self.parent

    def setInheriatedFraction(self, value):
        """Store ``value`` when it lies in [0, 1); otherwise print a notice.

        NOTE(review): 0.0 is accepted and 1.0 is rejected, which does not
        match the wording of the printed message - confirm the intended
        bounds.
        """
        if 0.0 <= value < 1.0:
            self.fractionInherited = value
        else:
            print("The fraction should be greater than 0 and less than 1!")

    def getInheriatedFraction(self):
        return self.fractionInherited

    def setClusterId(self, clusterId):
        self.clusterID = clusterId

    def getClusterId(self):
        return self.clusterID
    
    

class ClusterTree:
    """Decision tree that maps a record (attribute -> value) to a cluster id.

    Attributes
    ----------
    root : ClusterTreeNode
        Entry node of the tree.
    defaultVals : dict
        Fallback value per attribute (keys stripped and lower-cased), used
        when a record lacks the attribute or carries an unusable value.
    versionOfClusterAlgo
        Version tag returned together with every cluster id.
    attributeToDataType : dict
        Attribute -> iterable of accepted Python types. It starts empty and
        is not filled by this class; ``getClusterID`` needs an entry for
        every attribute tested by an internal node.
    """

    def __init__(self, node, defaultVals, versionOfClusterAlgo):
        self.setRoot(node)
        self.defaultVals = dict((k.strip().lower(), v) for k, v in defaultVals.items())
        self.versionOfClusterAlgo = versionOfClusterAlgo
        self.attributeToDataType = dict()

    def setRoot(self, node):
        """Install ``node`` as root; it must be a parent-less ClusterTreeNode."""
        assert isinstance(node, ClusterTreeNode) and node.parent is None
        self.root = node
        return

    def isRoot(self, node):
        return self.root is node

    def getRoot(self):
        return self.root

    @staticmethod
    def _coerceNumeric(text):
        """Return ``text`` as a finite float when it parses as one, else ``text``.

        Unlike a digits-only check this accepts negative numbers and
        scientific notation (``str(0.00001)`` is ``'1e-05'``). NaN and
        infinities are left as text so the caller falls back to the default.
        """
        try:
            number = float(text)
        except ValueError:
            return text
        if number != number or number in (float('inf'), float('-inf')):
            return text
        return number

    def getClusterID(self, info):
        """Walk the tree iteratively and return ``(clusterID, version)``.

        Parameters
        ----------
        info : dict
            Record to classify; keys are matched after strip + lower-case.

        Returns
        -------
        tuple
            ``(clusterID, self.versionOfClusterAlgo)``. ``clusterID`` is None
            when the walk reaches a missing child before reaching a leaf.

        Raises
        ------
        AssertionError
            If ``info`` is not a dict.
        KeyError
            If an attribute is absent from both ``info`` and the defaults, or
            has no entry in ``attributeToDataType``.
        ValueError
            If an internal node carries an unknown operation code.
        """
        if not isinstance(info, dict):
            message = ("ClusterTree.getClusterID() expects information to be passed in "
                       "python dictionary(key-value pair) format!")
            logger.error(message)
            # AssertionError is kept for callers that relied on the former assert.
            raise AssertionError(message)

        info = dict((k.strip().lower(), v) for k, v in info.items())

        curNode = self.root
        clusterID = None
        while clusterID is None and curNode is not None:
            if curNode.isLeaf():
                clusterID = curNode.getClusterId()
                break

            val = curNode.getDecidingValue()
            if isinstance(val, str):
                val = val.strip().lower()

            opcode = curNode.getOperationCode()
            att = curNode.getAttribute()

            # Look the default up only when needed, so a record that supplies
            # the attribute does not require a default to exist.
            nodeVal = info[att] if att in info else self.defaultVals[att]
            if isinstance(nodeVal, str):
                nodeVal = nodeVal.strip().lower()
            else:
                nodeVal = str(nodeVal)

            # Values are text at this point; when text is not an accepted type
            # for the attribute, try to interpret it as a number.
            acceptedDtypes = self.attributeToDataType[att]
            if not any(isinstance(nodeVal, typ) for typ in acceptedDtypes):
                nodeVal = self._coerceNumeric(nodeVal)

            if opcode == 1:
                if isinstance(nodeVal, str):
                    # Remove the following print statement if the inference model is not going to be deployed as an AWS Lambda function.
                    print ("Unexpected data for Attribute: {}, Passed Value: {}, Numerical data expected!".format(att, nodeVal))
                    nodeVal = self.defaultVals[att]
                if nodeVal <= val:
                    curNode = curNode.getLeft()
                else:
                    curNode = curNode.getRight()
            elif opcode == 2:
                if nodeVal not in val:
                    curNode = curNode.getLeft()
                else:
                    curNode = curNode.getRight()
            elif opcode == 3:
                if nodeVal != val:
                    curNode = curNode.getLeft()
                else:
                    curNode = curNode.getRight()
            else:
                # Without this guard the loop would never advance and spin forever.
                raise ValueError("Unknown operation code {!r} for attribute {!r}".format(opcode, att))

        return clusterID, self.versionOfClusterAlgo

In [17]:
class MigrationOfClusteringMechanism:
    """Re-express per-segment product statistics under a new segmentation model.

    Every row of a representative data set is classified with both the old
    and the new model. The share of each old segment that lands in each new
    segment is then used as a weight to combine the historical ALPHA, BETA,
    HIT and MISS values of the old segments into values for the new ones.
    """

    def __init__(self, storeId, oldModelPath, newModelPath, representativeDataPath,
                 historicalDataPath='./dummyProductStat.csv'):
        """Load both pickled models, the representative data and the history.

        Parameters
        ----------
        storeId : str
            Store whose historical statistics are migrated.
        oldModelPath, newModelPath : str
            Pickle files of the old and new models; each must expose
            ``getClusterID(dict) -> (clusterId, version)``.
        representativeDataPath : str
            ';'-delimited CSV used to measure how segments map onto each other.
        historicalDataPath : str
            CSV with the historical product statistics. The default is the
            path that was previously implied by ``readHistoricalData``.
        """
        self.inferenceModelPath_1 = oldModelPath
        self.inferenceModelPath_2 = newModelPath
        self.modelOld = readPicklefile(oldModelPath)
        self.modelNew = readPicklefile(newModelPath)
        self.recentModelSourceDataPath = representativeDataPath
        self.representativeData = self._sourceDataReader(self.recentModelSourceDataPath)
        self.storeId = storeId
        self.historicalDataPath = historicalDataPath
        self.historicalProductStat = readHistoricalData(path=historicalDataPath,
                                                        storeId=self.storeId)

    @staticmethod
    def _normaliseText(value):
        """Strip and lower-case strings; return any other value unchanged."""
        return value.strip().lower() if isinstance(value, str) else value

    def _sourceDataReader(self, path, delim=';'):
        """Read the representative data set and normalise its text.

        Snapshot of the data on which the latest segmentation was performed.
        For a static rule-based segmentation, use a good representation of
        the data.

        The 'Unnamed: 0' column, if present, is dropped; string cells are
        stripped and lower-cased, and so are the column names.
        """
        df = pd.read_csv(path, delimiter=delim)
        df = df.drop(columns=[col for col in df.columns
                              if col in ('Unnamed: 0', 'unnamed: 0')])

        # Normalise cell by cell instead of judging a column by its first row:
        # a missing first value must not leave the column untouched, and a
        # missing value further down must not raise.
        for col in df.columns:
            if not pd.api.types.is_numeric_dtype(df[col]):
                df[col] = df[col].map(self._normaliseText)

        return df.rename(columns={col: col.strip().lower() for col in df.columns})

    def _getMappingWeightsNewFromOldSegments(self):
        """Classify the representative rows with both models and derive weights.

        Returns
        -------
        dict
            ``{(newSegment, newVer): {(oldSegment, oldVer): fraction}}`` where
            ``fraction`` is the share of the old segment's rows that the new
            model assigns to the new segment. Empty when there are no rows.
        """
        records = []
        for row in self.representativeData.to_dict(orient='records'):
            oldCluster, oldVer = self.modelOld.getClusterID(row)
            newCluster, newVer = self.modelNew.getClusterID(row)
            records.append({'SegmentByOld': oldCluster, 'OldVer': oldVer,
                            'SegmentByNew': newCluster, 'NewVer': newVer})

        if records:
            oldVsNewClusters = pd.DataFrame(records)
            migrationCount = (oldVsNewClusters
                              .groupby(by=['SegmentByOld', 'OldVer', 'SegmentByNew', 'NewVer'])
                              .size()
                              .reset_index(name='counts'))
            oldCount = (oldVsNewClusters
                        .groupby(by='SegmentByOld')
                        .size()
                        .reset_index(name='OldOccurances'))
            migrationCount = migrationCount.merge(oldCount, on=['SegmentByOld'])
            migrationCount['Fraction'] = migrationCount['counts'] / migrationCount['OldOccurances']
            migrationDict = self._calcMappingNewFromOldSegments(migrationCount)
        else:
            migrationDict = {}

        desc = (
            "Structure of the 'Migration Driver Information': {('NewCluster_ID-X', 'NewCluster_Ver'):\n"
            "{('OldCluster_ID-X', 'OldCluster_Ver-A'): #FractionOfTheOldCluster_ID-XDataWillBePartOfNewCluster_ID-X, \n"
            "(OldCluster_ID-Y', 'OldCluster_Ver-A'): #FractionOfTheOldCluster_ID-YDataWillBePartOfNewCluster_ID-X,\n"
            "('OldCluster_ID-Z', 'OldCluster_Ver-A'): #FractionOfTheOldCluster_ID-ZDataWillBePartOfNewCluster_ID-X},\n"
            "('NewCluster_ID-Y', 'NewCluster_Ver'): .............\n"
            "      .......}"
        )
        logger.info (desc)

        logger.info("Migration Driver Information: {}".format(migrationDict))

        return migrationDict

    def _calcMappingNewFromOldSegments(self, migrationCount):
        """Pivot the migration table into a nested dict keyed by new segment.

        ``migrationCount`` needs the columns SegmentByNew, NewVer,
        SegmentByOld, OldVer and Fraction; its row labels are irrelevant.
        """
        migrationDict = {}
        columns = ['SegmentByNew', 'NewVer', 'SegmentByOld', 'OldVer', 'Fraction']
        # Positional iteration: does not require a 0..n-1 label index.
        for newSeg, newVer, oldSeg, oldVer, fraction in migrationCount[columns].itertuples(index=False, name=None):
            migrationDict.setdefault((newSeg, newVer), {})[(oldSeg, oldVer)] = fraction

        return migrationDict

    def _calcResultant(self, mapping, column, custCat, custCatVer, storeId, productId):
        """Weighted sum of ``column`` over the old segments feeding a new segment.

        Parameters
        ----------
        mapping : dict
            Output of ``_getMappingWeightsNewFromOldSegments``.
        column : str
            Statistic to combine (a column of ``historicalProductStat``).
        custCat, custCatVer
            New segment id and version selecting the entry of ``mapping``.
        storeId, productId : str
            Store and product whose historical rows are used.

        Returns
        -------
        number
            Sum of ``fraction * historical value``; old segments without a
            matching historical row contribute 0.

        Raises
        ------
        KeyError
            If the segment is not in ``mapping`` or a column is missing.
        """
        # Prefer the key exactly as produced by the mapping; fall back to the
        # stripped / lower-cased form that callers could pass before.
        rawKey = (custCat, custCatVer)
        if rawKey in mapping:
            weights = mapping[rawKey]
        else:
            weights = mapping[(custCat.strip().lower(), float(custCatVer))]

        # Historical ids are stored upper-cased, so old segment ids are too.
        weights = {tuple(part.strip().upper() if isinstance(part, str) else part for part in oldKey): fraction
                   for oldKey, fraction in weights.items()}

        stats = self.historicalProductStat
        # The store/product filter is identical for every old segment.
        productRows = stats[(stats['VENDORSTOREIDNO'] == storeId.strip().upper()) &
                            (stats['PRODUCTIDNO'] == productId.strip().upper())]

        total = 0
        for (segmentByOld, oldVer), fraction in weights.items():
            matches = productRows[(productRows['CUSTOMERCATEGORY'] == segmentByOld) &
                                  (productRows['CUSTOMERCATEGORYVER'] == oldVer)][column].values
            # A missing historical row means "no evidence" and adds nothing;
            # any other problem (e.g. an unknown column) is allowed to surface.
            if len(matches) > 0:
                total += fraction * matches[0]

        return total

    def transformOldDataAccToNewSegmentation(self):
        """Build the product statistics table for the new segmentation.

        Returns
        -------
        pandas.DataFrame
            One row per (product, new segment) with the columns PRODUCTIDNO,
            VENDORSTOREIDNO, CUSTOMERCATEGORY (upper-cased),
            CUSTOMERCATEGORYVER, ALPHA, BETA, HIT and MISS. Products appear in
            the order of their first occurrence in the historical data.
        """
        migrationDict = self._getMappingWeightsNewFromOldSegments()
        statColumns = ['ALPHA', 'BETA', 'HIT', 'MISS']
        rows = []
        # drop_duplicates keeps first-seen order; iterating a set would make
        # the row order vary between runs.
        for pdId in self.historicalProductStat['PRODUCTIDNO'].drop_duplicates():
            for custCat, custCatVer in migrationDict:
                row = {'PRODUCTIDNO': pdId, 'VENDORSTOREIDNO': self.storeId,
                       'CUSTOMERCATEGORY': custCat.strip().upper(),
                       'CUSTOMERCATEGORYVER': custCatVer}
                for column in statColumns:
                    row[column] = self._calcResultant(mapping=migrationDict, column=column,
                                                      custCat=custCat, custCatVer=custCatVer,
                                                      storeId=self.storeId, productId=pdId)
                rows.append(row)

        newProductStat = pd.DataFrame(
            rows,
            columns=['PRODUCTIDNO', 'VENDORSTOREIDNO', 'CUSTOMERCATEGORY',
                     'CUSTOMERCATEGORYVER'] + statColumns)

        return newProductStat

In [18]:
# Run parameters for the segmentation migration.
# Store whose historical product statistics are migrated.
storeId = "SAMPLEHOTEL1234"
# Pickled models: the segmentation in use so far and its replacement.
oldModelPath = "./DynamicClusteringSampleHotelData-1.pickle"
newModelPath = "./DynamicClusteringSampleHotelData-2.pickle"
# Data set classified by both models to measure how the segments map.
representativeDataPath = "./SampleHotelDataForTest-2.csv"

In [19]:
# Constructing the migrator reads both pickled models, the representative
# data set and the historical statistics of the store.
migrationOfClusteringMechanism = MigrationOfClusteringMechanism(
    storeId=storeId,
    oldModelPath=oldModelPath,
    newModelPath=newModelPath,
    representativeDataPath=representativeDataPath,
)

In [20]:
# Re-express the store's historical product statistics under the new
# segmentation; the mapping weights are logged at INFO level as a side effect.
newData = migrationOfClusteringMechanism.transformOldDataAccToNewSegmentation()

2019-09-06 14:41:20,588 - 140735688344384 - INFO - Structure of the 'Migration Driver Information': {('NewCluster_ID-X', 'NewCluster_Ver'):
{('OldCluster_ID-X', 'OldCluster_Ver-A'): #FractionOfTheOldCluster_ID-XDataWillBePartOfNewCluster_ID-X, 
(OldCluster_ID-Y', 'OldCluster_Ver-A'): #FractionOfTheOldCluster_ID-YDataWillBePartOfNewCluster_ID-X,
('OldCluster_ID-Z', 'OldCluster_Ver-A'): #FractionOfTheOldCluster_ID-ZDataWillBePartOfNewCluster_ID-X},
('NewCluster_ID-Y', 'NewCluster_Ver'): .............
      .......}
2019-09-06 14:41:20,588 - 140735688344384 - INFO - Migration Driver Information: {('samplehoteldata__cluster_4', 3.0): {('samplehoteldata__cluster_0', 2.0): 1.0, ('samplehoteldata__cluster_2', 2.0): 0.2903225806451613, ('samplehoteldata__cluster_7', 2.0): 0.07142857142857142}, ('samplehoteldata__cluster_0', 3.0): {('samplehoteldata__cluster_1', 2.0): 0.14285714285714285, ('samplehoteldata__cluster_2', 2.0): 0.5376344086021505}, ('samplehoteldata__cluster_1', 3.0): {('samplehot

In [21]:
# Display the migrated statistics: one row per (product, new segment).
newData

Unnamed: 0,PRODUCTIDNO,VENDORSTOREIDNO,CUSTOMERCATEGORY,CUSTOMERCATEGORYVER,ALPHA,BETA,HIT,MISS
0,PRODUCT11111111,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_4,3.0,17.421659,55.550691,21.506912,69.168203
1,PRODUCT11111111,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_0,3.0,9.384025,28.437788,11.425499,35.242704
2,PRODUCT11111111,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_1,3.0,13.551459,42.368664,16.639017,52.660522
3,PRODUCT11111111,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_3,3.0,41.711823,108.263547,48.842365,132.03202
4,PRODUCT11111111,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_2,3.0,23.039468,60.078106,27.007894,73.30619
5,PRODUCT11111111,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_5,3.0,3.891566,10.301205,4.578313,12.590361
6,PRODUCT11111112,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_4,3.0,28.315668,66.4447,32.400922,80.062212
7,PRODUCT11111112,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_0,3.0,14.827957,33.88172,16.869432,40.686636
8,PRODUCT11111112,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_1,3.0,21.784946,50.602151,24.872504,60.894009
9,PRODUCT11111112,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_3,3.0,60.726601,127.278325,67.857143,151.046798


In [22]:
# Reload the untouched historical statistics for comparison. The call relies
# on the function defaults (./dummyProductStat.csv, store SAMPLEHOTEL1234).
historicalData = readHistoricalData() 

In [23]:
# Display the historical statistics keyed by the old segmentation.
historicalData

Unnamed: 0,PRODUCTIDNO,VENDORSTOREIDNO,CUSTOMERCATEGORY,CUSTOMERCATEGORYVER,ALPHA,BETA,HIT,MISS
0,PRODUCT11111111,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_0,2.0,12.0,40.0,15.0,50.0
1,PRODUCT11111111,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_1,2.0,13.0,41.0,16.0,51.0
2,PRODUCT11111111,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_2,2.0,14.0,42.0,17.0,52.0
3,PRODUCT11111111,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_3,2.0,15.0,43.0,18.0,53.0
4,PRODUCT11111111,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_4,2.0,16.0,44.0,19.0,54.0
5,PRODUCT11111111,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_5,2.0,17.0,45.0,20.0,55.0
6,PRODUCT11111111,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_6,2.0,18.0,46.0,21.0,56.0
7,PRODUCT11111111,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_7,2.0,19.0,47.0,22.0,57.0
8,PRODUCT11111112,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_0,2.0,20.0,48.0,23.0,58.0
9,PRODUCT11111112,SAMPLEHOTEL1234,SAMPLEHOTELDATA__CLUSTER_1,2.0,21.0,49.0,24.0,59.0
