In [2]:
import numpy as np
import time 
import causaltensor as ct
from causaltensor import low_rank_M0_Gamma
from causaltensor.matlib import generate_Z
from causaltensor.cauest import std_debiased_convex
from causaltensor.cauest import projection_T_orthogonal
from causaltensor.cauest import DC_PR_with_suggested_rank
from causaltensor.cauest import non_convex_algorithm
from causaltensor.cauest import DC_PR_auto_rank
import pandas as pd
import pickle

In [3]:
def read_data(scenario='beer'):
    """Load the outcome matrix for one of the supported scenarios.

    Parameters
    ----------
    scenario : str
        'beer'    -- reads 'dataset/beer_filter.csv' and drops its 'ID' column.
        'tobacco' -- reads 'prop99.csv', keeps the per-capita cigarette
                     consumption rows, pivots them to a state-by-year matrix,
                     removes the states in ``remove_list`` and keeps the first
                     ``2001 - 1970`` year columns.
        'sales'   -- unpickles 'sales.p' and returns the stored object as is.

    Returns
    -------
    numpy.ndarray for 'beer' and 'tobacco'; for 'sales', whatever object is
    stored in the pickle (the rest of the notebook treats it as a 2-D array).

    Raises
    ------
    ValueError
        If ``scenario`` is not one of the names above.
    """
    if scenario == 'beer':
        # Close the handle deterministically instead of leaking it.
        with open('dataset/beer_filter.csv') as f:
            df = pd.read_csv(f)
        return np.array(df.drop(['ID'], axis = 1))

    if scenario == 'tobacco':
        df = pd.read_csv('prop99.csv')  ## input csv file
        ## extract the metric that we want
        df = df[df['SubMeasureDesc'] == 'Cigarette Consumption (Pack Sales Per Capita)']
        ## pivot table: index = state name, columns = year, value = per capita consumption
        pivot = df.pivot_table(values='Data_Value', index='LocationDesc', columns=['Year'])

        remove_list = ['Massachusetts', 'Arizona', 'Oregon', 'Florida', 'Alaska', 'Hawaii', 'Maryland', 'Michigan', 'New Jersey', 'New York', 'Washington', 'District of Columbia', 'California']
        # Mask rows by the pivot's own index so row/state alignment is explicit.
        keep = ~pivot.index.isin(remove_list)
        O = pivot.values[keep, :]

        # Keep the first 31 year columns.
        # NOTE(review): presumably 1970..2000, assuming the first year in the
        # file is 1970 -- confirm against prop99.csv.
        end_index = 2001 - 1970
        return O[:, :end_index]

    if scenario == 'sales':
        # NOTE(security): pickle.load can execute arbitrary code; only use
        # this with a trusted 'sales.p'.
        with open('sales.p', 'rb') as f:
            return pickle.load(f)

    raise ValueError(
        "unknown scenario {!r}; expected 'beer', 'tobacco' or 'sales'".format(scenario)
    )

## Semi-synthetic experiments on Sales data

In [7]:
# Load the sales matrix into the module-level M0 (read by later cells) and
# compute only its singular values (compute_uv=False skips the U/V factors).
M0 = read_data('sales')
#M0 = read_data('tobacco')
s = np.linalg.svd(M0, full_matrices=False, compute_uv=False)

In [8]:
# Count the leading singular values whose cumulative share of the total
# squared spectrum is still <= 99.8%.
print(np.sum(np.cumsum(s**2) / np.sum(s**2) <= 0.998))

34


In [None]:
# Reload the sales matrix and its singular values, then set the globals
# passed to the experiment run below.
M0 = read_data('sales')
s = np.linalg.svd(M0, full_matrices=False, compute_uv=False)
#print(s)
# Standard deviation of the additive Gaussian noise (0 -> noiseless).
sigma = 0
# Rank hint forwarded to the experiment.
# NOTE(review): presumably derived from the count printed above (34) -- confirm.
suggest_r = 35

def sales_experiment_performance_run_results(num_experiment=1, sigma = 0.1, pattern = 'block', suggest_r = 10):
    """Run repeated semi-synthetic treatment-effect experiments on ``M0``.

    Reads the module-level matrix ``M0`` as the baseline. For each experiment
    it draws a treatment pattern ``Z`` with ``generate_Z``, Gaussian noise
    ``E`` and a per-row heterogeneous effect ``delta`` (one draw per row,
    repeated across columns). ``delta`` is re-centred so that it averages to
    zero over the treated entries, and the removed mean is folded into
    ``tau_star``. The observation ``O = M0 + Z*delta + tau_star*Z + E`` is
    passed to ``run_algo`` and each algorithm's ``'tau_diff'`` from
    ``metric_compute`` is recorded.

    Parameters
    ----------
    num_experiment : int
        Number of independent experiments (rows of the result).
    sigma : float
        Standard deviation of the additive noise ``E``.
    pattern : {'stagger', 'block', 'adaptive'}
        Treatment pattern requested from ``generate_Z``. With 'block' the
        algorithm 'robust_synthetic_control' is evaluated as well.
    suggest_r : int
        If not -1, ``suggest_l`` is set to 1.1 times the singular value of
        ``M0`` at 0-based index ``suggest_r``; both are forwarded to
        ``run_algo``. With -1, ``suggest_l`` stays -1.

    Returns
    -------
    pandas.DataFrame
        Shape ``(num_experiment, number of algorithms)``; one column per
        algorithm holding its 'tau_diff' value.

    Raises
    ------
    ValueError
        If ``pattern`` is not one of the supported names.

    Notes
    -----
    Randomness comes from the global ``np.random`` state; the order of the
    random draws is kept stable so a fixed seed reproduces the same runs.
    """
    if pattern not in ('stagger', 'block', 'adaptive'):
        # Fail early instead of hitting an unbound ``Z`` inside the loop.
        raise ValueError(
            "unknown pattern {!r}; expected 'stagger', 'block' or 'adaptive'".format(pattern)
        )

    t1 = time.time()

    algo_list = ['convex_debias', 'missing', 'OLS', 'PCA']
    if pattern == 'block':
        algo_list.append('robust_synthetic_control')

    datas = np.zeros((num_experiment, len(algo_list)))

    (n1, n2) = M0.shape

    suggest_l = -1
    if suggest_r != -1:
        s = np.linalg.svd(M0, full_matrices=False, compute_uv=False)
        # NOTE(review): index is 0-based, so this is the (suggest_r + 1)-th
        # singular value -- confirm that is the intended threshold.
        suggest_l = s[suggest_r]*1.1

    for T in range(num_experiment):
        if T % 100 == 0:
            print(time.time() - t1)
            print('experiment ', T)

        ## generating the treatment pattern Z
        if pattern == 'stagger':
            m1 = np.random.randint(low=1, high=n1)
            m2 = np.random.randint(low=int(n2/5), high=n2)
            # NOTE(review): the other patterns unpack a 2-tuple from
            # generate_Z; here the whole return value is used as Z -- confirm.
            Z = generate_Z(pattern_tuple=['stagger', (m1, m2)], M0=M0)
        elif pattern == 'block':
            m1 = np.random.randint(low=1, high=int(n1/3))
            m2 = np.random.randint(low=int(n2/2), high=n2)
            Z, _treat_units = generate_Z(pattern_tuple=['block', (m1, m2)], M0=M0)
        else:  # 'adaptive'
            # Redraw the parameters until generate_Z stops reporting 'fail'.
            while True:
                a = np.random.randint(20)+5
                b = np.random.randint(20)+5
                Z, info = generate_Z(pattern_tuple = ['adaptive', (a, b)], M0=M0)
                if info != 'fail':
                    break
        print('***sparsity****', np.sum(Z) / np.size(Z))

        tau_star_o = np.mean(M0)/5

        E = np.random.normal(loc=0, scale=sigma, size=M0.shape)

        # One effect draw per row, repeated across all columns.
        delta = np.random.normal(loc = 0, scale = tau_star_o, size = (n1, 1)) * np.ones((n1, n2))
        # Re-centre delta over the treated entries; move its mean into tau_star.
        d1 = np.sum(Z * delta) / np.sum(Z)
        delta = delta - d1
        tau_star = tau_star_o + d1

        O = M0 + Z*delta + tau_star * Z + E

        # NOTE(review): treat_units and m2 are passed as fixed placeholders
        # ([] and 0) even for the 'block' pattern -- confirm this is intended.
        results = run_algo(algo_list, O, Z, suggest_r = suggest_r, suggest_l = suggest_l, eps = 1e-1, de_mean_O=False, treat_units=[], tau_star = tau_star, m2 = 0, M0 = M0, real_data = True)

        error_metric = {}
        for algo in algo_list:
            (M, tau) = results[algo]
            error_metric[algo] = metric_compute(M, tau, M0, tau_star, Z, ['tau_diff'])['tau_diff']

        print(error_metric)
        for index, algo in enumerate(algo_list):
            datas[T, index] = error_metric[algo]
        print('experiment {}, time elapses '.format(T), time.time() - t1)

    datas = pd.DataFrame(datas, columns = algo_list)
    return datas


# Seed NumPy's global RNG, then run 1000 experiments with the 'adaptive'
# pattern using the sigma and suggest_r globals set earlier in this cell.
np.random.seed(1)
datas = sales_experiment_performance_run_results(sigma = sigma, num_experiment = 1000, pattern = 'adaptive', suggest_r = suggest_r)