In [1]:
import pandas as pd
import numpy as np
import statistics
from scipy import optimize
from scipy.stats import logistic

from sklearn.linear_model import LinearRegression

data = pd.read_csv('../dataset/carSales1.csv')
data['Year'] = data['Year'].apply(lambda year_string: year_string.split('/')[0])

In [2]:
def point_power_law(phi):
    """

    :param phi:
    :return: [breakdown_value, observation_value, predict_value]
    """
    ordered_phi = {k: v for k, v in sorted(phi.items(), key=lambda item: item[1], reverse=True)}
    keys = list(ordered_phi.keys())
    values = list(ordered_phi.values())
    max_value = max(values)
    ydata = []
    for i in values:
        if i != max_value and i not in ydata:
            ydata.append(i)
    xdata = range(2, len(ydata) + 2)
    logx = np.log10(xdata)
    logy = np.log10(ydata)
    pinit = [1.0, -1.0]

    fitfunc = lambda p, x: p[0] + p[1] * x
    errfunc = lambda p, x, y: (y - fitfunc(p, x))
    powerLawFunc = lambda amp, index, x: amp * (x ** index)

    try:
        out = optimize.leastsq(errfunc, pinit, args=(logx, logy), full_output=1)

        pfinal = out[0]
        covar = out[1]

        index = pfinal[1]
        amp = 10.0 ** pfinal[0]

        indexErr = np.sqrt(covar[1][1])
        ampErr = np.sqrt(covar[0][0]) * amp

        return [keys[0], values[0], powerLawFunc(amp, index, 1)]

    except TypeError:
        return [keys[0], values[0], -1]

In [3]:
def calc_top1_insight(subspace_df, breakdown, measure):
    breakdown_measure_dict = dict(zip(list(subspace_df[breakdown]),list(subspace_df[measure])))
    absolute_impact = subspace_df[measure].sum()
    res = point_power_law(breakdown_measure_dict)
    result_dict = dict(zip(['breakdown','breakdown_value', 'measure', 'predict', 'abs_impact', 'insight', 'insight_type'], [breakdown] + res + [absolute_impact, 'top1', 'point']))
    return result_dict

def get_subspace_df(subspace_condition_dict, df):
    condition = pd.Series(True, index=df.index)
    for feature in subspace_condition_dict:
        if subspace_condition_dict[feature] == '*':
            continue
        condition = condition & (df[feature] == subspace_condition_dict[feature])
    return df[condition]

def calc_top1_insights(subspace_condition_dict, breakdown, measure, df):
    subspace_df = get_subspace_df(subspace_condition_dict, df)
    if subspace_df.shape[0] == 0:
        return None
    top1_insight = dict(subspace_condition_dict, **calc_top1_insight(subspace_df, breakdown, measure))
    return top1_insight

def calc_trend_insights(subspace_condition_dict, breakdown, measure, df):
    subspace_df = get_subspace_df(subspace_condition_dict, df)
    if subspace_df.shape[0] == 0 or breakdown != 'Year' or subspace_condition_dict['Year'] != '*':
        return None
    absolute_impact = subspace_df[measure].sum()
    #  calcuate trend insight    
    x = subspace_df[breakdown].values #
    y = subspace_df[measure].values #
    x = x.reshape(-1, 1) #
    y = y.reshape(-1, 1) #
    reg = LinearRegression().fit(x, y) #
    slope = reg.coef_[0] #
    r_sq = reg.score(x, y) #
    # TODO: sig, impact     
    result_dict = dict(zip(['breakdown','breakdown_value', 'measure', 'predict', 'abs_impact', 'insight', 'insight_type'], 
                           [breakdown, -1] + ['-1', '-1'] + [absolute_impact, 'trend', 'shape']))
 
    trend_insight = dict(subspace_condition_dict, **result_dict)
    return trend_insight


In [4]:

def generate_array(arr, index, val):
    new_arr = arr[:]
    new_arr[index] = val
    return new_arr

def generate_process_node(feature_names, measure, output_array, df, calc_insights = lambda: "Need to define", breakdowns = []):
    def process_node(node):
        current_breakdowns = feature_names if len(breakdowns) == 0 else breakdowns
        for breakdown in current_breakdowns:
            insight = calc_insights(subspace_condition_dict=dict(zip(feature_names, node)),
                         breakdown=breakdown, measure=measure, df = df)
            if insight:
                output_array.append(insight)
        if node.count('*') == 1:
            return False
        return True
    return process_node

def BFS_tranverse_and_process(df, feature_names, process_node = lambda x: True):
    feature_unique_value_matrix = [df[feature].unique().tolist() for feature in feature_names]
    traverse_root = ['*' for i in feature_names] + [[]]
    result_stack = [traverse_root] 
    
    while len(result_stack) > 0:
        root_with_ban = result_stack.pop(0)
        root, ban = root_with_ban[:-1], root_with_ban[-1]
        result = process_node(root)
        if result == False:
            continue
        banned_index = ban[:]
        for col_index, val_index in enumerate(root):
            if col_index in banned_index:
                continue
            banned_index.append(col_index)
            banned_index = list(set(banned_index))
            if val_index == '*':
                result_stack += ([generate_array(root, col_index, val)+[banned_index[:]]  for val in feature_unique_value_matrix[col_index]])

In [5]:
output= []
process_node_trend = generate_process_node(feature_names = ['Year', 'Brand', 'Category'], 
                                     output_array=output,
                                     df = data,
                                     measure = 'Sales',
                                     calc_insights=calc_trend_insights)


BFS_tranverse_and_process(data,['Year', 'Brand', 'Category'], process_node=process_node_trend)

process_node_top1 = generate_process_node(feature_names = ['Year', 'Brand', 'Category'], 
                                     output_array=output,
                                     df = data,
                                     measure = 'Sales',
                                     calc_insights=calc_top1_insights)

BFS_tranverse_and_process(data,['Year', 'Brand', 'Category'], process_node=process_node_top1)

df = pd.DataFrame(output)

In [6]:
df

Unnamed: 0,Year,Brand,Category,breakdown,breakdown_value,measure,predict,abs_impact,insight,insight_type
0,*,*,*,Year,-1,-1,-1,21921768,trend,shape
1,*,BMW,*,Year,-1,-1,-1,1220996,trend,shape
2,*,Ford,*,Year,-1,-1,-1,5489641,trend,shape
3,*,GMC,*,Year,-1,-1,-1,1690029,trend,shape
4,*,Honda,*,Year,-1,-1,-1,4431426,trend,shape
...,...,...,...,...,...,...,...,...,...,...
458,*,Volkswagen,Subcompact,Brand,Volkswagen,6468,-1,94588,top1,point
459,*,Volkswagen,Subcompact,Category,Subcompact,6468,-1,94588,top1,point
460,*,Volkswagen,SUV,Year,2007,8812,12267.1,110774,top1,point
461,*,Volkswagen,SUV,Brand,Volkswagen,7535,-1,110774,top1,point
