In [1]:
import os
import sys
sys.path.append('..')

In [2]:
from BasicDataPrep_V1 import *
from CLTreeModules import *
from DataPrepForSeasonalityDetectionUsingBookingTrend_V1 import *
from SeasonalityDetection_Discrete import *
from SeasonalityDetection_Continuous import *
from ReadWritePickleFile import *
from PruneTreeByConqueringNodes import *
from PruneTreeByMergingCentroids import *
from ClusterTree_Utility import *
from Attribute import *
from ClusteringFromRules import *
from ClusterTree import *

In [3]:
def getSimpleAttributes(node, othersGroup):
    """Describe every attribute of a CLTree node's dataset as an ``Attribute`` object.

    Parameters
    ----------
    node : CLNode
        Node whose ``dataset`` supplies ``attr_names``, ``max_values``, ``min_values``
        and ``getInstances``.
    othersGroup : mapping or None
        Maps an original attribute name to the values grouped under its "_others"
        column. ``None`` is treated as an empty mapping.

    Returns
    -------
    dict
        ``{attribute name: Attribute}`` for every name in ``node.dataset.attr_names``.
        'Calculated' attributes whose name ends in ``_others``, ``_weekend``,
        ``_midweek`` or ``_earlyweek`` additionally get their original values set.
    """
    # Assert node is an instance of CLNode
    assert isinstance(node, CLNode)

    # The caller passes None when no categorical attributes were supplied; a membership
    # test against None would raise TypeError, so fall back to an empty mapping.
    if othersGroup is None:
        othersGroup = {}

    attributes = {}
    for i, attr_name in enumerate(node.dataset.attr_names):
        uniqueValues = len(np.unique(node.dataset.getInstances(attr_name)))
        # NOTE(review): min/max are read at offset i + 1 -- presumably index 0 of those
        # sequences belongs to an id column that is absent from attr_names; confirm.
        attributes[attr_name] = Attribute(name=attr_name,
                                          noOfUniqueValues=uniqueValues,
                                          maxVal=node.dataset.max_values[i + 1],
                                          minVal=node.dataset.min_values[i + 1])

    # Name suffix -> weekday labels covered by that calculated weekday bucket.
    weekdayBuckets = (
        ("_weekend$", ('friday', 'saturday', 'fri', 'sat')),
        ("_midweek$", ('tuesday', 'wednesday', 'thursday', 'tue', 'wed', 'thu')),
        ("_earlyweek$", ('sunday', 'monday', 'sun', 'mon')),
    )

    for attribute in attributes.values():
        if attribute.type != 'Calculated':
            continue

        if re.search("_others$", attribute.name) and attribute.originalAttribute in othersGroup:
            attribute.setOriginalAttributeVal(othersGroup[attribute.originalAttribute])
            continue

        for suffixPattern, weekdays in weekdayBuckets:
            if re.search(suffixPattern, attribute.name):
                # Hand every attribute its own list so they never share one object.
                attribute.setOriginalAttributeVal(list(weekdays))
                break

    return attributes

def buildSimpleTree(CLNode, node, attributes):
    """Recursively mirror a pruned CLTree as a tree of ``ClusterTreeNode`` objects.

    Parameters
    ----------
    CLNode : CLTree node to convert (note: this parameter shadows the ``CLNode`` class name).
    node : the already-built ``ClusterTreeNode`` parent, or ``None`` for the root.
    attributes : dict of ``Attribute`` objects keyed by attribute name.

    Returns
    -------
    ClusterTreeNode
        A leaf carrying a cluster id when ``CLNode.includedInCluster`` is truthy,
        otherwise a decision node with converted left/right children.
    """
    curNode = ClusterTreeNode()

    # --- Leaf: the node was absorbed into a cluster -------------------------------
    if CLNode.includedInCluster:
        curNode.setParent(node)
        if CLNode.parent is None:
            # A root that is itself a cluster owns all the data.
            curNode.setInheriatedFraction(1.0)
            rootClusterId = CLNode.clusterId
            curNode.setClusterId('DEFAULT' if rootClusterId is None else rootClusterId.strip().lower())
        else:
            curNode.setInheriatedFraction(CLNode.dataset.length()/CLNode.parent.dataset.length())
            curNode.setClusterId(CLNode.clusterId.strip().lower())
        return curNode

    # --- Decision node: copy the split and recurse into both children -------------
    splitAttribute = attributes[CLNode.attribute]
    usesOriginalValues = splitAttribute.type in ('Categorical', 'Calculated')

    curNode.setAttribute(attribute=splitAttribute.originalAttribute,
                         attributeType=splitAttribute.type)
    # Categorical/calculated splits test membership in the original values;
    # every other attribute type splits on the numeric cut value.
    curNode.setValue(splitAttribute.originalAttributeVal if usesOriginalValues else CLNode.cutValue)

    if CLNode.depth > 0:
        inheritedFraction = CLNode.dataset.length()/CLNode.parent.dataset.length()
    else:
        inheritedFraction = 0.
    curNode.setInheriatedFraction(inheritedFraction)
    curNode.setParent(node)

    children = CLNode.getChildNodes()
    curNode.setLeft(buildSimpleTree(children[0], curNode, attributes))
    curNode.setRight(buildSimpleTree(children[1], curNode, attributes))

    return curNode


def __validateMinMaxCriteria(min_y, min_split_fraction, data_length,
                             fractionOfTotalData, mergingCentroidsVsconqueringNodes, testMode):
    """Adjust ``min_y`` and ``min_split_fraction`` to the amount of data available.

    In test mode both values are returned untouched. Otherwise ``min_y`` is raised to
    at least an override (1000 when merging centroids, 750 when conquering nodes,
    scaled down when only a fraction of the data is represented) and
    ``min_split_fraction`` is capped by ``100/data_length`` (conquering nodes) or
    ``min_y/data_length`` (merging centroids).

    Returns
    -------
    tuple
        ``(min_y, min_split_fraction)`` after adjustment.
    """
    if testMode:
        return min_y, min_split_fraction

    assert 0. < fractionOfTotalData <= 1.0,\
        logger.error("The value of 'fractionOfTotalData' should be greater than 0 and less than or equal to 1.")

    initial_override_min_y = 1000.0 if mergingCentroidsVsconqueringNodes else 750.0

    # Scaling only applies when the data represents a strict fraction of all bookings.
    if fractionOfTotalData < 1.0:
        scaledOverride = initial_override_min_y * fractionOfTotalData
        if not (data_length > 5000.0):
            # Small datasets shrink the override further, in proportion to 5000 rows.
            scaledOverride = scaledOverride * (data_length/5000)
        initial_override_min_y = math.floor(scaledOverride)

    min_y = max(initial_override_min_y, min_y)

    if mergingCentroidsVsconqueringNodes:
        splitFractionCap = min_y/data_length
    else:
        splitFractionCap = 100/data_length
    min_split_fraction = min(min_split_fraction, splitFractionCap)

    logger.info("The min-split fraction has been set to {}".format(min_split_fraction))
    logger.info("The min_y (The minimum number of member required for a cluster) has been set to {}".format(min_y))

    return min_y, min_split_fraction


def __combine(monthsToTree, keys, attribute, defaultValues, curVer):
    """Join the per-season cluster trees into one tree that first splits on the month attribute.

    Parameters
    ----------
    monthsToTree : dict
        Maps a tuple of months to the ``ClusterTree`` built for those months.
    keys : list
        The month tuples (keys of ``monthsToTree``) to combine.
    attribute : str
        Name of the month attribute the new decision nodes split on.
    defaultValues, curVer :
        Forwarded unchanged to every ``ClusterTree`` created here.

    Returns
    -------
    ClusterTree
        The single tree stored under ``keys[0]`` when only one key is given,
        otherwise a new tree whose root value is the list of months of the right half.

    Raises
    ------
    ValueError
        If ``keys`` is empty (previously this recursed without end).
    """
    if not keys:
        raise ValueError("Cannot combine seasonal cluster trees: no month groups were provided.")

    if len(keys) == 1:
        return monthsToTree[keys[0]]

    # Split point is >= 1 for two or more keys, so both halves are non-empty.
    mid = math.ceil((len(keys)-1)/2)
    keys_l = keys[:mid]
    keys_r = keys[mid:]
    subtree_l = __combine(monthsToTree, keys_l, attribute, defaultValues, curVer)
    subtree_r = __combine(monthsToTree, keys_r, attribute, defaultValues, curVer)

    # The node's value lists every month handled by the right subtree.
    cutValue = [month for monthGroup in keys_r for month in monthGroup]

    curNode = ClusterTreeNode()
    curNode.setAttribute(attribute=attribute, attributeType='Calculated')
    curNode.setValue(cutValue)
    curNode.setLeft(subtree_l.getRoot())
    subtree_l.getRoot().setParent(curNode)
    curNode.setRight(subtree_r.getRoot())
    subtree_r.getRoot().setParent(curNode)
    return ClusterTree(curNode, defaultValues, curVer)


def __getClusterTreeFromData(data, categoricalAttributes, mergingCentroidsVsconqueringNodes, balancedPrune,
                             min_y, min_split_fraction, conquerDataColumns, prefixString, defaultValues,
                             useSilhouette, prevVer):
    """Build, prune and simplify a cluster tree for one DataFrame.

    Steps: group rare categorical values (``getOthersGoup``), optionally separate the
    "conquer" columns from the "divide" columns, order the columns by descending
    variance, build a ``CLTree``, prune it with the selected grid search, and convert
    the result into a ``ClusterTree``.

    Parameters
    ----------
    data : pandas.DataFrame
        Training data. Its index is reset in place.
    categoricalAttributes : list, str or None
        Categorical column name(s); ``None`` skips the "others" grouping.
    mergingCentroidsVsconqueringNodes : bool
        True prunes with ``pruneByGridSearch_Centroid``, False with ``pruneByGridSearch``.
    balancedPrune, useSilhouette :
        Forwarded to the pruning routine (``balancedPrune`` only to the centroid variant).
    min_y : number
        Minimum cluster size; raised to at least the computed minimum split size.
    min_split_fraction : float
        Fraction of the row count used as the minimum split size.
    conquerDataColumns : list or None
        Columns reserved for the conquer step; they are removed from the divide data.
    prefixString, defaultValues :
        Forwarded to the pruning routine and to ``ClusterTree`` respectively.
    prevVer : number
        Previous model version, used to decide whether a time-based suffix is added.

    Returns
    -------
    ClusterTree
    """
    if categoricalAttributes is not None:
        othersGroup, data = getOthersGoup(data, categoricalAttributes, min_split_fraction)
    else:
        othersGroup = None

    data.reset_index(inplace=True, drop=True)

    if not mergingCentroidsVsconqueringNodes:
        if conquerDataColumns is not None:
            conquerData = pd.DataFrame(data[conquerDataColumns].copy(deep=True))
            # Keep the remaining columns in their original order. A set difference would
            # order them by string hash, which changes from one interpreter run to the next.
            conquerColumnSet = set(conquerDataColumns)
            divideColumns = list(dict.fromkeys(
                col for col in data.columns if col not in conquerColumnSet))
            data = data[divideColumns].copy(deep=True)
        else:
            conquerData = data.copy(deep=True)

        divideData = data.copy(deep=True)

    # Present the highest-variance columns to the tree builder first.
    d_var = dict(data.var())
    d_cols = sorted(d_var, key=d_var.get, reverse=True)
    data = data[d_cols]

    r = DataFrameReader(data)
    data = r.read()
    min_split = np.ceil(data.length() * min_split_fraction)
    cltree = CLTree(data, min_split)
    cltree.buildTree()
    # A cluster can never be smaller than the smallest allowed split.
    min_y = max(min_y, min_split)
    data_length = data.length()

    if mergingCentroidsVsconqueringNodes:
        result, baseVer = pruneByGridSearch_Centroid(cltree, min_y, data_length, prefixString, balancedPrune, useSilhouette)
    else:
        gradientTolerance = 0.01 # Tested with different values, looks like 0.01 is a good candidate
        result, baseVer = pruneByGridSearch(cltree, min_y, prefixString, gradientTolerance, conquerData, divideData, useSilhouette)

    # Turn the current epoch seconds into a fractional suffix, e.g. 1588252124 -> 0.1588252124.
    # NOTE(review): datetime.now() is local time although the value is compared with the 1970 epoch.
    utc_now = datetime.datetime.now()
    deltaVer = math.ceil(float((utc_now - datetime.datetime(1970, 1, 1)).total_seconds()))
    deltaVer = float("." + str(deltaVer))

    # Same major version as before: add the time suffix so the new model is distinguishable.
    if math.floor(prevVer) == math.floor(baseVer):
        curVer = baseVer + deltaVer
    else:
        curVer = baseVer

    attributes = getSimpleAttributes(node=cltree.root, othersGroup=othersGroup)
    simpleTreeRoot = buildSimpleTree(CLNode=cltree.root, node=None, attributes=attributes)
    clusterTree = ClusterTree(simpleTreeRoot, defaultValues, curVer)
    return clusterTree

In [4]:
# Option can take values between (1,2,3)
# Option 1 (option = 1)==> Provide Static Rule based Clustering. Please provide the Clustering Rules in a
# Json|dictionary type Format. For reference consult DemoRuleBasedSegmentation.xlsx for clear and
# better understanding of the rules, then writePickleFile.ipynb to see how to write that rule into .json| python dict. 
# Finally you can read the staticClustering.pickle to see the end product which will be used in cluster model building. 
# You will be able to see the visualization of the end product in RuleBasedClusterTree.gv.pdf. 

# Option 2 (option = 2) ==> Dynamic Clustering WITHOUT taking the "Seasonality" effect into account. Please provide the
# historical data in a ".csv" file with the "delimiter" (another argument) specified. ";" is the preferred delimiter.

# Option 3 (option = 3) ==> Dynamic Clustering with "Seasonality". Please provide the historical
# data in a ".csv" file with the "delimiter" (another argument) specified. ";" is the preferred delimiter.
# Please consult the definition of "Seasonality" explained later in this section to better understand the task.
# If you choose this option, provide a viable value for "keyStringForSeasonality".

# destinationPath ==> Is the path where the final model would be stored and later referenced at the transaction time. 
# We should have read and write access to this path and this path should be accessible from "training" and "inference"
# location both. This is a path including the file name with extension(preferably .pickle).


# staticClusterFilePath ==> If you have chosen "option = 1". This is path-variable to specify the location where
# the static rules are stored in ".json|pickle etc." format. We should have read access from this location. 
# This is a path including the file name with extension(preferably .pickle).

# storeID ==> Preferably a string which identifies uniquely the entity for which we are running this algorithm. 
# This value will be used in naming the cluster ids at the end. 

# histrocalDataFilePath ==> If you have chosen option = 2 or option = 3, this is the path where the
# historical data file is stored. The historical data file holds the historical data on which
# the segmentations are based and which will later be used for inference. This file should be in ".csv" format and
# delimited by the symbol given in "delimiter". ";" is the preferred delimiter. We should have read access to this
# location. This is a path including the file name with extension.

# categoricalAttributes ==> This variable takes the names of the columns of the historical data which are "categorical"
# in nature. Categorical variables take on values that are names, labels, ids, codes etc.
# Ex: channel_id, channel_name, language_id, language_name etc.
# In case of the historical-data-based approach (option = 2 or option = 3), if you have some attributes which are
# categorical in nature, mention them using the "categoricalAttributes" variable. It can take None, one value or
# multiple values. Please assign its value based on your data at hand. Ex: Suppose the only categorical variable
# (column) in your data is "language_id". Then, categoricalAttributes = 'language_id'.
# Suppose there are two categorical variables (columns) in your data, lang_id and channel_id. Then,
# categoricalAttributes = ['lang_id', 'channel_id']. Now suppose there are no categorical variables in your data;
# then assign categoricalAttributes = None.
        
# Seasonality Related Variable Definitions:
# (manualSeasonality, seasonalityDataPath, continuousVsDiscrete, keyStringForSeasonality)
#    **Seasonality:** Grouping together a few "keyStringForSeasonality" months.
#     "keyStringForSeasonality" is something which is externally provided.
#     "keyStringForSeasonality" should take a value such as **"arrival", "booking", "stay" etc.**
#     Suppose you want to do seasonality based on "arrival_month": set "keyStringForSeasonality" = "arrival",
#     and the data should have "arrival_month"|"month_of_arrival"|"arrivalmonth" etc. among the data columns at the
#     time of training.
#     If you have specified a "keyStringForSeasonality" but there is no column in the data whose name
#     combines "keyStringForSeasonality" and "month" (irrespective of their position),
#     the algorithm will throw an error at the time of training.
   
#     The same holds for any other string you specify as "keyStringForSeasonality". How seasonality happens in a
#     hotel/property/entity depends on the perspective of the people in charge and how they look at their customer base.
#     Also, we have to make sure that we apply seasonality based on something which we can pass to the backend at the
#     time of transaction. Let's say "stay" is the key string and we do not have the ability to pass "stay_month"
#     during a transaction; then we should not apply seasonality for that property. But if you think about it,
#     "arrival" is close to "stay", so if it is okay with everyone, we can replace "stay" with "arrival";
#     and if we can pass "arrival_month" to the backend during a live transaction, then we should use
#     "arrival" as the key string at the time of both training and inference.
#     **In general as well, be sure that you are using the same variable names at training time and inference time,
#     otherwise the algorithm will not be able to recognise them and will assume the default value for an attribute
#     whenever necessary.**
      
#    **Continuous Seasonality:** Grouping together months which are similar by some criteria and adjacent to each other.
   
#    **Discrete Seasonality:** Grouping together months which are similar by some criteria, irrespective of adjacency.
   
# The seasonality detection algorithm is designed to be stateless. The only assumption it makes is that
# each row represents one month and that the rows are sorted by month. Ex.: January at row 0 and
# December at row 11 etc.
# **"option" == 3 means you are asking to detect seasonality from the data** and then create a customer segmentation
# for each season or group of months.
# **"continuousVsDiscrete" is a boolean (True == Continuous, False == Discrete) flag**,
# based on which "continuous" or "discrete" seasonality detection gets triggered.
# The default value for this flag is "True" (or Continuous).

# Alternatively, suppose you want to **detect seasonality based on a different dataset than the dataset used for the core segmentation.** Ex.: You have been provided with sales statistics from which seasonality has to be detected; once the months have been grouped together based on that separate data, we continue with the segmentation for each group of months based on the primary data. **"seasonalityDataPath" is the variable that provides the path of the secondary data source used to detect seasonality.**

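# In addition, one can choose to provide **"manualSeasonality"** as a dict that maps each season to the list of its month numbers (at most 3 seasons; the 12 months must each appear in exactly one season). **Ex: manualSeasonality = {'season_1': [1,2,3,4], 'season_2': [5,12,7], 'season_3': [6,8,9,10,11]}**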

# This customer segmentation solution is a 2-part solution. The 1st part is essentially dividing
# the whole data into small groups, and the 2nd part is bringing the small groups together to make manageably sized
# groups, which is essentially the conquering part of the algorithm. The dividing part happens on the data which is
# available from the session at the time of transaction, but the conquering part can happen on any data; it will
# depend on the problem statement, business logic, data availability etc.

# min_split_fraction ==> The value held by this variable gets applied at the 1st stage/operation ("dividing").
# This fraction determines how much granularity you want to maintain in the small groups.
# Let's say you have 100 records and you want full granularity, meaning: if there is a way to separate 2
# data points logically by some attributes, do it until there is only 1 member left in each node. In this case you
# should set min_split_fraction = 0.01. On the other hand, let's say you are happy if a small group contains
# less than (or equal to) 10 people. In that case, the "dividing" algorithm will stop splitting a dataset (subset)
# further if it has <= 10 data points in total. There are 2 things you have to keep in mind when choosing a good
# value for min_split_fraction.
# 1. This algorithm pipeline has a dedicated conquer part, so you should maintain a good level of granularity, so that
# the conquering algorithm can bring 2 similar small groups together easily.
# 2. Too much granularity is not always good; it might have 2 adverse effects: i. The depth of the tree (search space)
# would be larger than desired and it will take more time to traverse the tree to get the segmentation label/id;
# ii. Too much granularity can magnify noise and/or outliers in individual groups, and as a result the conquering
# algorithm might not perform as well as it could.

# What would be good choice? Suppose you have 10000 data rows, group of 10 people is a good goto choice.
# If you have 15000 data rows, group of 15 people is a good goto choice. So a potential good value for
# min_split_fraction could be 0.001. But if you have only 5000 data rows, you might still want to maintain 10 people group 
# in that case the value of min_split_fraction would be 0.002. 
 
# whatFractionOfLast12MonthBySDBookings ==> This fraction indicates what fraction of the last 12 months' booking data is
# being used to represent the whole booking data. It takes a default value of 1., which means we are passing
# the whole booking data. Now suppose we don't have PMS sync with one property and approx. 50% of their bookings
# come from PMS; in that case we are only using 50% of the total booking data to represent their whole booking data,
# so the value of this fraction should be 0.5.


# mergingCentroidsVsconqueringNodes ==> This is a flag to indicate which "conquering" algorithm will be spawned. If the
# value of this flag is True, the "conquering" will be done by the algorithm written in "PruneTreeByMergingCentroids".
# On the other hand, if the value of the flag is False, the "conquering" will be done by the algorithm written in
# "PruneTreeByConqueringNodes". See the 'inference' notebook to get a better understanding of the difference between
# the two mechanisms. Both are valid and have performed well on all the test cases. Machine Learning is an empirical
# process; it is not always easy to choose between 2 well-performing algorithms, especially in the case of
# unsupervised learning algorithms.
# Here is my recommendation:
#     1. If we are using the same dataset (same columns) for dividing and conquering, or you are at the beginning
#     of conducting an experimental study without prior knowledge, use "PruneTreeByMergingCentroids".
#     2. If we are using different datasets (different columns) for dividing and conquering, definitely use
#     "PruneTreeByConqueringNodes".

# balancedPrune ==> This is a boolean flag to indicate whether we want a balanced segmentation in terms of the number
# of members in each segment. Generally our goal when doing segmentation is to keep similar things together and
# different things far apart. If the value of this flag is False, we stick to our original motivation and
# pick the scenario which fits our original goal best; but if the value of this flag is True, that means
# we also care about the variation in the number of data members in each group. We still care about the "purity" of
# each group, but we have an additional constraint which tries to make the variation in the number of data members
# among all the clusters as small as possible.
# ** In both cases, we have a satisfying constraint: we want at least a certain number of members in each cluster.

# conquerDataColumns ==> If we are using "PruneTreeByConqueringNodes" to conquer the small groups, i.e. to bring them
# together and assign a label/id to them by some criteria, we have the choice to use the same columns as we have
# already used to divide the whole dataset, or we can use a different set of columns to do the same job
# [THIS IS WHAT MAKES THIS ALGORITHM UNIQUE AND SPECIAL]. When we are using a different set of data to bring the small
# groups together, please list the columns that will be used in "conquer". The columns used in "conquer" will not
# be used in "divide". If you are planning to use the same set of data columns for both tasks, please do not
# mention anything here and pass the value as null/None.

# min_y ==> This is the satisfying constraint of the conquering part. min_y takes a real value indicating the minimum
# number of data points a group needs in order to qualify as a cluster.
# useSilhouette ==> A binary flag variable whose default is False. If the value of this flag is True, the
# "Silhouette" indicator will be calculated to determine the optimal number of clusters for a dataset;
# otherwise avg. intra-cluster distance/avg. inter-cluster distance will be used to determine the optimal number
# of clusters for a dataset.

In [5]:
def buildSimpleClusterTree(option=1, destinationPath=None, staticClusterFilePath=None,
                           storeID=None, histrocalDataFilePath=None, categoricalAttributes=None,
                           manualSeasonality=None, seasonalityDataPath=None, continuousVsDiscrete=True,
                           keyStringForSeasonality='arrival', mergingCentroidsVsconqueringNodes = True, 
                           balancedPrune=False, conquerDataColumns=None,
                           min_y=None, min_split_fraction=None, whatFractionOfLast12MonthBySDBookings=None,
                           previousVersion=-1, missingDataTolerance=0.90, delimiter= ';',
                           useSilhouette=False, testMode=False):
    """Build a cluster tree, pickle it to ``destinationPath`` and report its contents.

    Parameters
    ----------
    option : int
        1 = static rules read from ``staticClusterFilePath``; 2 = clustering of the
        historical data without seasonality; 3 = one cluster tree per season
        (group of months), combined under a month split.
    destinationPath : str
        Where the resulting tree is pickled. Required.
    staticClusterFilePath : str
        Rule file, required for option 1.
    storeID :
        Used (via ``str``) as the prefix of the generated cluster ids.
    histrocalDataFilePath : str
        Historical data file, required for options 2 and 3.
    categoricalAttributes : list, str or None
        Categorical column name(s); lower-cased and stripped before use.
    manualSeasonality : dict or None
        Option 3 only: season -> list of month numbers. At most 3 seasons, and the
        months 1..12 must each appear exactly once.
    seasonalityDataPath : str or None
        Option 3 only: separate data source used to detect seasonality.
    continuousVsDiscrete : bool
        Option 3 only: truthy selects ``ClusterMonths_Continuous``, falsy
        ``ClusterMonths_Discrete``.
    keyStringForSeasonality : str or None
        Forwarded (lower-cased, stripped) to ``getData``.
    mergingCentroidsVsconqueringNodes, balancedPrune, conquerDataColumns, useSilhouette :
        Forwarded to the tree-building helpers; ``conquerDataColumns`` must be a list.
    min_y : number or None
        Minimum cluster size; defaults to 1000.0.
    min_split_fraction : float or None
        Defaults to 0.001 for option 2 and 0.010 for option 3.
    whatFractionOfLast12MonthBySDBookings : float or None
        Must lie in (0, 1]; defaults to 1.
    previousVersion : number or None
        Version of the previous model; ``None`` is treated as -1.
    missingDataTolerance, delimiter :
        Forwarded to ``getData``.
    testMode : bool
        When truthy the min_y / min_split_fraction adjustment is skipped.

    Returns
    -------
    tuple or None
        ``(attributes, clusters)`` from ``getAttributesAndClusters`` when the pickle
        was written, ``None`` when ``writePickleFile`` reported failure.

    Raises
    ------
    AssertionError
        When an argument fails validation.
    """
    if min_y is None:
        min_y = 1000.0
    if min_split_fraction is None and option == 2:
        min_split_fraction = 0.001
    elif min_split_fraction is None and option == 3:
        min_split_fraction = 0.010
        
    if whatFractionOfLast12MonthBySDBookings is None:
        whatFractionOfLast12MonthBySDBookings = 1.

    # Check `option` before the source paths so that an invalid option is reported as
    # such instead of triggering the unrelated "source file path" message below.
    assert option in (1, 2, 3), logger.error("Please provide correct value as option. Acceptable values are 1,2,3!")
    assert whatFractionOfLast12MonthBySDBookings>0. and whatFractionOfLast12MonthBySDBookings<=1.,\
    logger.error("The value of 'whatFractionOfLast12MonthBySDBookings' should be between 0(exclusive) and 1(inclusive)")
    assert destinationPath is not None, logger.error("Please provide a destination path to save the result!")
    assert (option == 1 and staticClusterFilePath is not None) or \
           (option in (2, 3) and histrocalDataFilePath is not None),\
        logger.error("Please provide the appropriate source file path and try again!")
    
    if previousVersion is None:
        previousVersion = -1
        
    if option == 1:
        
        clusterTree = getClusterTreeFromRules(staticClusterFilePath, previousVersion)
        
    elif option in (2, 3):
        # Column names are matched case-insensitively: normalise every user-supplied name.
        if categoricalAttributes is not None:
            if isinstance(categoricalAttributes, list):
                categoricalAttributes = [i.lower().strip() for i in categoricalAttributes]
            elif isinstance(categoricalAttributes, str):
                categoricalAttributes = categoricalAttributes.lower().strip()
        
        if conquerDataColumns is not None:
            assert isinstance(conquerDataColumns, list), "'conquerDataColumns' needed to be provided in a list format!"
            conquerDataColumns = [i.lower().strip() for i in conquerDataColumns]

        else:
            if not mergingCentroidsVsconqueringNodes:
                logger.warning("As 'conquerDataColumns' has not been provided,\
                                the algorithm will reuse the data used for creating the initial tree!")

        if keyStringForSeasonality is not None:
            keyStringForSeasonality = keyStringForSeasonality.strip().lower()
            
        df, keyStringDateCol, keyStringWeekdayCol, keyStringMonthCol, defaultValues = \
            getData(histrocalDataFilePath, categoricalAttributes, keyStringForSeasonality,
                            conquerDataColumns, missingDataTolerance, delimiter=delimiter)

        if categoricalAttributes is not None and isinstance(categoricalAttributes, list) \
                and len(categoricalAttributes) > 0:

            assert (set(categoricalAttributes).issubset(set(df.columns))),\
                logger.error("The name of categorical attributes don't match with data column names!")

        elif categoricalAttributes is not None and isinstance(categoricalAttributes, str):
            assert categoricalAttributes in list(df.columns), \
                logger.error("The name of categorical attribute doesn't match with data column names!")
        
        if option == 3:
            if manualSeasonality is not None:
                # Structure first: a dict of lists with at most three seasons.
                isValidSeasonality = isinstance(manualSeasonality, dict) and \
                    all(isinstance(i, list) for i in manualSeasonality.values()) and \
                    len(manualSeasonality) <= 3
                if isValidSeasonality:
                    # Content second: exactly the months 1..12, each in one season only.
                    # (The former max()==12 check also accepted values such as 0.)
                    manualMonths = [j for i in manualSeasonality.values() for j in i]
                    isValidSeasonality = len(manualMonths) == 12 and \
                        set(manualMonths) == set(range(1, 13))
                assert isValidSeasonality, \
                    logger.error("Please provide manual seasonality in correct format! The 'manualSeasonality'\
                                takes only 'dict'/key-value pair, where values are list of numerical value of months!\
                                One month can appear in only one season but one season can have any number of months!")

                # The per-season filtering below needs the month column just like the
                # detection paths do; fail with a clear message instead of a KeyError.
                assert keyStringMonthCol is not None, \
                    logger.error("To apply manual seasonality to data we would need to provide "
                                 "**Relevant Month related information!")
                
                clusters = manualSeasonality
                
            else:
                if seasonalityDataPath is not None:
                    assert keyStringMonthCol is not None, logger.error("To apply seasonality to data\
                                            we would need to provide **Relevant Month related information!")
                    record = readSeparateSeasonalityDetectionData(sourcePath = seasonalityDataPath, delim=',')

                else:
                    assert keyStringWeekdayCol is not None and keyStringMonthCol is not None,\
                        logger.error("To detect seasonality we would need to provide"
                                     " **Relevant Month and Weekday related information!")
                    seasonalityData  = df.copy(deep=True)
                    seasonalityData, _map, bucketColName = bucketizeLeadDays(seasonalityData,
                                                                             leadDaysColName='leaddays')
                    record = dataPrepForClusteringByBookingTrend(seasonalityData, _map, keyStringDateCol,
                                                                keyStringWeekdayCol, keyStringMonthCol,
                                                                bucketColName='LeadDays_Bucket',
                                                                groupByHowDissimilarToOthersTo=False)

                if continuousVsDiscrete:  # True means we have been asked to provide continuous seasonality.
                    if keyStringDateCol is not None:
                        # Number of distinct dates observed in each month. Column selection
                        # already yields a new frame, so df itself is left untouched.
                        distinctDates = df[[keyStringDateCol, keyStringMonthCol]].\
                                                                drop_duplicates().reset_index(drop=True)
                        instanceCountByMonth = dict(distinctDates[keyStringMonthCol].value_counts())
                        del distinctDates
                    else:
                        # No date column available: fall back to calendar days per month.
                        instanceCountByMonth = {1: 31, 2: 28, 3: 31, 4: 30, 5: 31, 6: 30, 7: 31, 8: 31, 9: 30, 10: 31,
                                                11: 30, 12: 31}

                    clusters = ClusterMonths_Continuous(ds_EachRowADataPoint=record,
                                                        instanceCountByEachDataPoint=instanceCountByMonth,
                                                        acceptableNumberOfClusters=[4, 5, 6])
                else:  # False means if is okay to have group of months which are not adjacent to each other.
                    clusters = ClusterMonths_Discrete(record, plot=False)

                del record

            if keyStringDateCol is not None:
                df.drop(columns=keyStringDateCol, inplace=True)
            logger.info("Cluster of months: {}".format(clusters))
            monthsToTree = {}
            curVer = previousVersion
            fractionOfTotalData = whatFractionOfLast12MonthBySDBookings
            for c in clusters:
                prefixString = ''.join([str(i) for i in clusters[c]])
                prefixString = str(storeID) + '_' + prefixString + '_'
                data = pd.DataFrame(df[df[keyStringMonthCol].isin(clusters[c])])
                data.drop(columns=keyStringMonthCol, inplace = True)
                
                dataLength = len(data)

                # Derive this season's thresholds from the caller-supplied values and keep
                # them in season-local names, so that one season's adjusted thresholds do
                # not leak into the seasons processed after it.
                seasonMinY, seasonMinSplitFraction = __validateMinMaxCriteria(min_y, min_split_fraction,
                                                                              dataLength, fractionOfTotalData,
                                                                              mergingCentroidsVsconqueringNodes,
                                                                              testMode)

                tree = __getClusterTreeFromData(data, categoricalAttributes, mergingCentroidsVsconqueringNodes,
                                                balancedPrune, seasonMinY, seasonMinSplitFraction,
                                                conquerDataColumns, prefixString, defaultValues,
                                                useSilhouette, previousVersion)

                monthsToTree[tuple(clusters[c])] = tree
                # The combined tree carries the highest version among its seasonal trees.
                curVer = max(curVer, tree.versionOfClusterAlgo)
                
            attribute = keyStringMonthCol.strip().lower()
            keys = list(monthsToTree.keys())
            clusterTree = __combine(monthsToTree, keys, attribute, defaultValues, curVer)
            
        else:
            
            prefixString = str(storeID) + '_'

            # Date and month columns are not clustering features when seasonality is off.
            if keyStringDateCol is not None:
                df.drop(columns=keyStringDateCol, inplace = True)
            if keyStringMonthCol is not None:
                df.drop(columns=keyStringMonthCol, inplace = True)
                
            dataLength = len(df)
            fractionOfTotalData = whatFractionOfLast12MonthBySDBookings
            min_y, min_split_fraction = __validateMinMaxCriteria(min_y, min_split_fraction, dataLength,
                                                                 fractionOfTotalData,
                                                                 mergingCentroidsVsconqueringNodes, testMode)

             
            clusterTree = __getClusterTreeFromData(df, categoricalAttributes, mergingCentroidsVsconqueringNodes,
                                                   balancedPrune, min_y, min_split_fraction,conquerDataColumns,
                                                   prefixString, defaultValues, useSilhouette, previousVersion)

    attributes, clusters = getAttributesAndClusters(clusterTree)
    flag = writePickleFile(path=destinationPath, data=clusterTree)
    if not flag:
        logger.error("The process has failed! Please Try Again!")
        return None
    else:
        logger.info("The process is successful!")
        return attributes, clusters


## Test - 1

In [6]:
# Test 1 configuration: values forwarded as keyword arguments to
# buildSimpleClusterTree in the next cell.

# Input data file, output pickle location and the store identifier.
histrocalDataFilePath = 'PPTData_WithProduct.csv'
destinationPath = './PPTData_WithProduct.pickle'
storeID = 'PPTData_Model_w/o_products'

# Column-related options.
categoricalAttributes = None
conquerDataColumns = ['Product-A', 'Product-B']

# Seasonality-related options.
# NOTE(review): the build call in the next cell passes the literal 'arrival' for
# keyStringForSeasonality, so the None assigned here is not what the call
# receives -- confirm which value is intended.
continuousVsDiscrete = None
keyStringForSeasonality = None

# Pruning / splitting options.
balancedPrune = False
mergingCentroidsVsconqueringNodes = False
min_y = 3.0
min_split_fraction = 0.1
whatFractionOfLast12MonthBySDBookings = 1.0

previousVersion = 2.0


In [7]:
# Build the Test 1 cluster tree from the configuration cell above.
# NOTE(review): keyStringForSeasonality is passed as the literal 'arrival', not
# the keyStringForSeasonality variable set in the configuration cell -- confirm
# which value is intended.
result = buildSimpleClusterTree(
    option=2,
    destinationPath=destinationPath,
    staticClusterFilePath=None,
    storeID=storeID,
    histrocalDataFilePath=histrocalDataFilePath,
    categoricalAttributes=categoricalAttributes,
    manualSeasonality=None,
    seasonalityDataPath=None,
    continuousVsDiscrete=continuousVsDiscrete,
    keyStringForSeasonality='arrival',
    mergingCentroidsVsconqueringNodes=mergingCentroidsVsconqueringNodes,
    balancedPrune=balancedPrune,
    conquerDataColumns=conquerDataColumns,
    min_y=min_y,
    min_split_fraction=min_split_fraction,
    whatFractionOfLast12MonthBySDBookings=whatFractionOfLast12MonthBySDBookings,
    previousVersion=previousVersion,
    missingDataTolerance=0.90,
    delimiter=';',
    testMode=True,
)

# The builder returns None (after logging an error) when writing the pickle
# fails, and an (attributes, clusters) pair otherwise. Check explicitly so a
# failure surfaces with a clear message instead of an opaque unpacking TypeError.
if result is None:
    raise RuntimeError(
        "buildSimpleClusterTree failed for storeID=%r; see the log output above." % storeID
    )
attributes, clusters = result

2020-04-30 13:08:44,627 - 4533695936 - INFO - All columns after feature engineering and before dropping any: ['product-a', 'languageenglish', 'product-b', 'languagespanish', 'leaddays', 'numberofpeople']
2020-04-30 13:08:44,632 - 4533695936 - INFO - read 9
2020-04-30 13:08:44,633 - 4533695936 - INFO - attribute names: [('id', <class 'float'>), ('leaddays', <class 'float'>), ('numberofpeople', <class 'float'>), ('languagespanish', <class 'float'>), ('languageenglish', <class 'float'>)]
2020-04-30 13:08:44,634 - 4533695936 - INFO - This is the starting point!
2020-04-30 13:08:44,637 - 4533695936 - INFO - At this level the best cut is found on Attribute: leaddays, at: 25.0
2020-04-30 13:08:44,638 - 4533695936 - INFO - At previous level(0), the cut happened on Attribute:leaddays, Value: 25.0
2020-04-30 13:08:44,641 - 4533695936 - INFO - At this level the best cut is found on Attribute: leaddays, at: 14.0
2020-04-30 13:08:44,641 - 4533695936 - INFO - At previous level(1), the cut happened o

2020-04-30 13:08:44,726 - 4533695936 - INFO - Printing MergeList: []
2020-04-30 13:08:44,726 - 4533695936 - INFO - Printing Touching Nodes: {}
2020-04-30 13:08:44,735 - 4533695936 - INFO - Min_Y: 3.0, Scenario: dict_values([3, 4, 2]),  #Clusters: 2.99, Inv-Purity: 0.0
2020-04-30 13:08:44,736 - 4533695936 - INFO - Min_Y: 3.0, Scenario: dict_values([4, 5]),  #Clusters: 2, Inv-Purity: 0.36550402849608316
2020-04-30 13:08:44,737 - 4533695936 - INFO - Searching for 'min_y'. Current 'min_y' = 3.0, 'result' = 
{'intra-cluster-distance': 0.0, 'inter-cluster-distance': 0.981648190353195, 'purity': 0.0, 'data-points': {'PPTData_Model_w/o_products__cluster_0': 3, 'PPTData_Model_w/o_products__cluster_1': 4, 'PPTData_Model_w/o_products__cluster_DEFAULT': 2}}
2020-04-30 13:08:44,738 - 4533695936 - INFO - Printing MergeList: []
2020-04-30 13:08:44,738 - 4533695936 - INFO - Printing Touching Nodes: {}
2020-04-30 13:08:44,747 - 4533695936 - INFO - Min_Y: 3.0, Scenario: dict_values([3, 4, 2]),  #Cluster

In [8]:
# Show the cluster identifiers returned by the Test 1 build.
clusters

{'pptdata_model_w/o_products__cluster_0',
 'pptdata_model_w/o_products__cluster_1',
 'pptdata_model_w/o_products__cluster_default'}

In [9]:
# Reload the cluster tree pickled by the Test 1 build and assign every row of
# PPTData.csv to a cluster.
model1 = readPicklefile('./PPTData_WithProduct.pickle')
df = pd.read_csv('PPTData.csv', delimiter=";")

# getClusterID receives one row as a {column name: value} dict and returns a
# pair; only the first element (the predicted cluster id) is kept.
# Indexing with [0] instead of unpacking into `_` avoids rebinding `_` at the
# notebook top level (which stops IPython from updating its last-output
# variables), and the comprehension keeps loop temporaries out of the
# notebook namespace. Rows are accessed by position, matching range(len(df)).
preds = [model1.getClusterID(dict(df.iloc[pos]))[0] for pos in range(len(df))]
df['Predicted_Cluster'] = preds

In [10]:
# Show the PPTData.csv rows together with the Predicted_Cluster column computed from model1.
df

Unnamed: 0,LeadDays,NumberOfPeople,LanguageEnglish,LanguageSpanish,Predicted_Cluster
0,10,1,0,1,pptdata_model_w/o_products__cluster_default
1,12,2,0,1,pptdata_model_w/o_products__cluster_default
2,35,1,1,0,pptdata_model_w/o_products__cluster_0
3,25,2,1,0,pptdata_model_w/o_products__cluster_1
4,1,1,1,0,pptdata_model_w/o_products__cluster_0
5,7,2,1,0,pptdata_model_w/o_products__cluster_1
6,3,2,1,0,pptdata_model_w/o_products__cluster_1
7,14,3,0,1,pptdata_model_w/o_products__cluster_1
8,5,1,1,0,pptdata_model_w/o_products__cluster_0


In [11]:
# Show the Test 1 training data, including the Product-A / Product-B columns.
pd.read_csv('PPTData_WithProduct.csv', sep=";")

Unnamed: 0,LeadDays,NumberOfPeople,LanguageEnglish,LanguageSpanish,Product-A,Product-B
0,10,1,0,1,0,1
1,12,2,0,1,0,1
2,35,1,1,0,1,1
3,25,2,1,0,1,0
4,1,1,1,0,1,1
5,7,2,1,0,1,0
6,3,2,1,0,1,0
7,14,3,0,1,1,0
8,5,1,1,0,1,1


## Test - 2

In [12]:
# Test 2 configuration: values forwarded as keyword arguments to
# buildSimpleClusterTree in the next cell.

# Input data file, output pickle location and the store identifier.
histrocalDataFilePath = 'PPTData.csv'
destinationPath = './PPTData_Model.pickle'
storeID = 'PPTData_Model'

# Column-related options.
categoricalAttributes = None
conquerDataColumns = None

# Seasonality-related options.
# NOTE(review): the build call in the next cell passes the literal 'arrival' for
# keyStringForSeasonality, so the None assigned here is not what the call
# receives -- confirm which value is intended.
continuousVsDiscrete = None
keyStringForSeasonality = None

# Pruning / splitting options.
balancedPrune = False
mergingCentroidsVsconqueringNodes = True
min_y = 3.0
min_split_fraction = 0.1
whatFractionOfLast12MonthBySDBookings = 1.0

previousVersion = 1.0

In [13]:
# Build the Test 2 cluster tree from the configuration cell above.
# NOTE(review): keyStringForSeasonality is passed as the literal 'arrival', not
# the keyStringForSeasonality variable set in the configuration cell -- confirm
# which value is intended.
result = buildSimpleClusterTree(
    option=2,
    destinationPath=destinationPath,
    staticClusterFilePath=None,
    storeID=storeID,
    histrocalDataFilePath=histrocalDataFilePath,
    categoricalAttributes=categoricalAttributes,
    manualSeasonality=None,
    seasonalityDataPath=None,
    continuousVsDiscrete=continuousVsDiscrete,
    keyStringForSeasonality='arrival',
    mergingCentroidsVsconqueringNodes=mergingCentroidsVsconqueringNodes,
    balancedPrune=balancedPrune,
    conquerDataColumns=conquerDataColumns,
    min_y=min_y,
    min_split_fraction=min_split_fraction,
    whatFractionOfLast12MonthBySDBookings=whatFractionOfLast12MonthBySDBookings,
    previousVersion=previousVersion,
    missingDataTolerance=0.90,
    delimiter=';',
    testMode=True,
)

# The builder returns None (after logging an error) when writing the pickle
# fails, and an (attributes, clusters) pair otherwise. Check explicitly so a
# failure surfaces with a clear message instead of an opaque unpacking TypeError.
if result is None:
    raise RuntimeError(
        "buildSimpleClusterTree failed for storeID=%r; see the log output above." % storeID
    )
attributes, clusters = result

2020-04-30 13:08:48,321 - 4533695936 - INFO - All columns after feature engineering and before dropping any: ['languagespanish', 'leaddays', 'languageenglish', 'numberofpeople']
2020-04-30 13:08:48,326 - 4533695936 - INFO - read 9
2020-04-30 13:08:48,327 - 4533695936 - INFO - attribute names: [('id', <class 'float'>), ('leaddays', <class 'float'>), ('numberofpeople', <class 'float'>), ('languagespanish', <class 'float'>), ('languageenglish', <class 'float'>)]
2020-04-30 13:08:48,328 - 4533695936 - INFO - This is the starting point!
2020-04-30 13:08:48,331 - 4533695936 - INFO - At this level the best cut is found on Attribute: leaddays, at: 25.0
2020-04-30 13:08:48,332 - 4533695936 - INFO - At previous level(0), the cut happened on Attribute:leaddays, Value: 25.0
2020-04-30 13:08:48,335 - 4533695936 - INFO - At this level the best cut is found on Attribute: leaddays, at: 14.0
2020-04-30 13:08:48,336 - 4533695936 - INFO - At previous level(1), the cut happened on Attribute:leaddays, Valu

2020-04-30 13:08:48,391 - 4533695936 - INFO - Printing MergeList: []
2020-04-30 13:08:48,392 - 4533695936 - INFO - Printing Touching Nodes: {}
2020-04-30 13:08:48,394 - 4533695936 - INFO - Min_Y: 3.0, Scenario: dict_values([4, 3, 2]),  #Clusters: 2.99, argminMetric: 0.22297134511261876
2020-04-30 13:08:48,395 - 4533695936 - INFO - Min_Y: 3.0, Scenario: dict_values([3, 6]),  #Clusters: 2, argminMetric: 0.47542782009509
2020-04-30 13:08:48,396 - 4533695936 - INFO - Searching for 'min_y'. Current 'min_y' = 3.0, 'result' = 
{'intra-cluster-distance': 0.23309748999337407, 'inter-cluster-distance': 1.0454145570841884, 'purity': 0.22297134511261876, 'varPurity': 0.9558029080856842, 'mean-members': 3.0, 'data-points': {'PPTData_Model__cluster_0': 4, 'PPTData_Model__cluster_1': 3, 'PPTData_Model__cluster_DEFAULT': 2}, 'argminMetric': 0.22297134511261876}
2020-04-30 13:08:48,397 - 4533695936 - INFO - Printing MergeList: []
2020-04-30 13:08:48,397 - 4533695936 - INFO - Printing Touching Nodes: {}

In [14]:
# Show the cluster identifiers returned by the Test 2 build.
clusters

{'pptdata_model__cluster_0',
 'pptdata_model__cluster_1',
 'pptdata_model__cluster_default'}

In [15]:
# Reload the cluster tree pickled by the Test 2 build and assign every row of
# PPTData.csv to a cluster.
model2 = readPicklefile('./PPTData_Model.pickle')
df = pd.read_csv('PPTData.csv', delimiter=";")

# getClusterID receives one row as a {column name: value} dict and returns a
# pair; only the first element (the predicted cluster id) is kept.
# Indexing with [0] instead of unpacking into `_` avoids rebinding `_` at the
# notebook top level (which stops IPython from updating its last-output
# variables), and the comprehension keeps loop temporaries out of the
# notebook namespace. Rows are accessed by position, matching range(len(df)).
preds = [model2.getClusterID(dict(df.iloc[pos]))[0] for pos in range(len(df))]
df['Predicted_Cluster'] = preds

In [16]:
# Show the PPTData.csv rows together with the Predicted_Cluster column computed from model2.
df

Unnamed: 0,LeadDays,NumberOfPeople,LanguageEnglish,LanguageSpanish,Predicted_Cluster
0,10,1,0,1,pptdata_model__cluster_1
1,12,2,0,1,pptdata_model__cluster_1
2,35,1,1,0,pptdata_model__cluster_default
3,25,2,1,0,pptdata_model__cluster_default
4,1,1,1,0,pptdata_model__cluster_0
5,7,2,1,0,pptdata_model__cluster_0
6,3,2,1,0,pptdata_model__cluster_0
7,14,3,0,1,pptdata_model__cluster_1
8,5,1,1,0,pptdata_model__cluster_0
