In [1]:
import numpy as np
import pandas as pd

import sys
sys.path.append("..")

from experiments.utils import compute_significance

from hypothesis_exploration.user_data_model import Dataset, Group, coverage, diversity
from hypothesis_exploration.hypothesis_testing import HypothesisTest
from hypothesis_exploration.alpha_investing import covdiv_alpha

from datasets.MovieLens import params as movie_params

In [2]:
# Load the MovieLens table from its CSV file (relative path).
dataframe = pd.read_csv('../datasets/MovieLens/MovieLens.csv')

# Wrap the frame in a Dataset. Every remaining constructor argument is read
# from the attribute of the same name on `movie_params`, in the order listed.
dataset = Dataset(
    dataframe=dataframe,
    **{
        field: getattr(movie_params, field)
        for field in (
            'multi_value_attribute_names',
            'attributes',
            'action_dimension',
            'action_dimension_min',
            'action_dimension_max',
        )
    },
)

In [29]:
def run_covdiv(predicates, wealth, h):
    """Issue one ``covdiv_alpha`` request for the group described by ``predicates``.

    A ``Group`` is built over the notebook-level ``dataset`` and forwarded,
    together with ``h`` and ``wealth``, to ``covdiv_alpha``.

    In addition to its arguments this function reads the notebook globals
    ``dataset``, ``alpha``, ``n``, ``gamma``, ``lambd`` and
    ``request_history`` at call time, so the cells that define them must have
    been executed before the first call.

    Args:
        predicates: Passed to ``Group`` as ``predicates``. The calls in this
            notebook pass dicts mapping an attribute name to a value.
        wealth: Forwarded to ``covdiv_alpha`` as ``wealth``.
        h: Forwarded to ``covdiv_alpha`` as ``h``; the calls in this notebook
            pass ``HypothesisTest`` instances.

    Returns:
        A 2-tuple holding the two values returned by ``covdiv_alpha``
        (presumably the selected groups and the updated wealth -- confirm
        against ``covdiv_alpha``).
    """
    input_group = Group(dataset=dataset, predicates=predicates)
    request = dict(
        D=dataset,
        g_in=input_group,
        h=h,
        alpha=alpha,
        n=n,
        wealth=wealth,
        gamma=gamma,
        lambd=lambd,
        request_history=request_history,
    )
    selected_groups, updated_wealth = covdiv_alpha(**request)
    return selected_groups, updated_wealth

In [4]:
# Row count per runtime_minutes category.
dataframe['runtime_minutes'].value_counts()

runtime_minutes
Long         989970
Short          3371
Very Long      3315
Name: count, dtype: int64

In [5]:
# Parameters read as globals by run_covdiv and forwarded to covdiv_alpha.
alpha = 0.05   # presumably the significance level -- confirm against covdiv_alpha
n = 3          # presumably the number of groups requested per call -- confirm
gamma = 500    # NOTE(review): meaning not visible here; see covdiv_alpha
lambd = 1      # NOTE(review): meaning not visible here; see covdiv_alpha
request_history = dict()  # starts empty; the same dict object is passed to every run_covdiv call

# Starting wealth is alpha scaled by eta.
eta = 1
wealth = alpha * eta

In [38]:
# One-sample hypothesis tests passed to run_covdiv in the cells below.

# h1: aggregation 'mean', null value 4.0, alternative 'greater'.
h1 = HypothesisTest(
    aggregation='mean',
    null_value=4.0,
    alternative='greater',
    n_sample=HypothesisTest.ONE_SAMPLE,
)

# h2: aggregation 'mean', null value 3.5, alternative 'less'.
h2 = HypothesisTest(
    aggregation='mean',
    null_value=3.5,
    alternative='less',
    n_sample=HypothesisTest.ONE_SAMPLE,
)

# h3: aggregation 'variance', null value 1, alternative 'greater'.
h3 = HypothesisTest(
    aggregation='variance',
    null_value=1,
    alternative='greater',
    n_sample=HypothesisTest.ONE_SAMPLE,
)

In [30]:
# Test h3 starting from the runtime_minutes == 'Long' group.
# `wealth` is overwritten with the value run_covdiv returns, so this cell is
# not idempotent: re-running it starts from the already-updated wealth.
G_out, wealth = run_covdiv(
    predicates={'runtime_minutes': 'Long'},
    wealth=wealth,
    h=h3,
)
for g in G_out:
    print(g)

genre:Drama|runtime_minutes:Long
location:HI|runtime_minutes:Long
location:NE|runtime_minutes:Long


In [31]:
# Test h1 starting from the runtime_minutes == 'Long' group, continuing with
# the wealth left by the previously executed request.
G_out, wealth = run_covdiv(
    predicates={'runtime_minutes': 'Long'},
    wealth=wealth,
    h=h1,
)
for g in G_out:
    print(g)

genre:Film-Noir|runtime_minutes:Long
runtime_minutes:Long|year:40s


In [32]:
# Test h2 starting from the runtime_minutes == 'Long' group, continuing with
# the wealth left by the previously executed request.
G_out, wealth = run_covdiv(
    predicates={'runtime_minutes': 'Long'},
    wealth=wealth,
    h=h2,
)
for g in G_out:
    print(g)

runtime_minutes:Long|year:2000s
genre:Horror|runtime_minutes:Long


In [33]:
# Drill into the Long + Drama group with h1, continuing with the wealth left
# by the previously executed request.
G_out, wealth = run_covdiv(
    predicates={'runtime_minutes': 'Long', 'genre': 'Drama'},
    wealth=wealth,
    h=h1,
)
for g in G_out:
    print(g)

genre:Drama|runtime_minutes:Long|year:30s
genre:Drama|runtime_minutes:Long|year:60s
genre:Drama|runtime_minutes:Long|year:50s


In [43]:
# Drill into the Long + location NE group with h2, continuing with the wealth
# left by the previously executed request.
G_out, wealth = run_covdiv(
    predicates={'runtime_minutes': 'Long', 'location': 'NE'},
    wealth=wealth,
    h=h2,
)
for g in G_out:
    print(g)

location:NE|runtime_minutes:Long|year:90s
genre:Comedy|location:NE|runtime_minutes:Long


In [50]:
# Drill into the Long + location HI group with h1, continuing with the wealth
# left by the previously executed request.
G_out, wealth = run_covdiv(
    predicates={'runtime_minutes': 'Long', 'location': 'HI'},
    wealth=wealth,
    h=h1,
)
for g in G_out:
    print(g)