In [1]:
import random
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# import warnings
# warnings.filterwarnings("error")

import sys
sys.path.append("..")

from statsmodels.stats.multitest import multipletests

from hypothesis_exploration.user_data_model import Dataset, Group, coverage, diversity, jaccard_distance
from hypothesis_exploration.hypothesis_testing import HypothesisTest
from hypothesis_exploration.alpha_investing import covdiv_alpha, cover_alpha

In [2]:
from datasets.MovieLens import params as movie_params
from datasets.BookCrossing import params as book_params
from datasets.Yelp import params as yelp_params

In [3]:
def compute_significance(request_history, alpha):
    """Compare the recorded reject decisions against a Bonferroni baseline.

    Parameters
    ----------
    request_history : dict
        Every value must be indexable; element 0 is read as the p-value of a
        test and element 1 as the reject decision recorded for it (truthy
        means "rejected").
    alpha : float
        Significance level handed to ``multipletests``.

    Returns
    -------
    tuple
        ``(power, fdr)`` where ``power`` is the share of Bonferroni rejections
        that were also rejected in ``request_history`` and ``fdr`` is the
        share of recorded rejections that Bonferroni does not confirm. Either
        value is ``np.nan`` when its denominator is zero, and both are
        ``np.nan`` when ``request_history`` is empty.
    """
    # Nothing was tested: there is no baseline to compare with. Returning
    # early also avoids calling multipletests with zero p-values, which it
    # cannot handle (it divides by the number of tests).
    if not request_history:
        return np.nan, np.nan

    outcomes = list(request_history.values())
    pvals = [outcome[0] for outcome in outcomes]
    rejects = [bool(outcome[1]) for outcome in outcomes]

    # Only the boolean reject mask of the Bonferroni correction is needed.
    ground_truth_reject = multipletests(pvals, alpha=alpha, method='bonferroni')[0]

    num_rejects_ground_truth = sum(1 for truth in ground_truth_reject if truth)
    num_rejects = sum(rejects)
    true_positives = sum(
        1 for reject, truth in zip(rejects, ground_truth_reject) if reject and truth
    )
    false_positives = sum(
        1 for reject, truth in zip(rejects, ground_truth_reject) if reject and not truth
    )

    power = true_positives / num_rejects_ground_truth if num_rejects_ground_truth > 0 else np.nan
    fdr = false_positives / num_rejects if num_rejects > 0 else np.nan

    return power, fdr


def run_alpha_investing(algorithm, has_lambd, **kwargs):
    """Run one alpha-investing algorithm and collect its quality metrics.

    Parameters
    ----------
    algorithm : callable
        Called as ``algorithm(**kwargs, request_history=...)``; must return a
        ``(G_out, wealth)`` pair and is expected to fill the supplied
        ``request_history`` dict.
    has_lambd : bool
        When false, a ``lambd`` keyword (if present) is dropped before the
        call so the same argument set can be reused for every algorithm.
    **kwargs
        Forwarded to ``algorithm``. ``g_in`` and ``alpha`` are required
        because they are also used to compute the metrics.

    Returns
    -------
    tuple
        ``(G_out, wealth, cov, div, power, fdr, execution_time)`` where
        ``execution_time`` is the duration of the ``algorithm`` call in
        seconds.

    Raises
    ------
    KeyError
        If ``g_in`` or ``alpha`` is missing from ``kwargs``.
    """
    if not has_lambd:
        kwargs.pop('lambd', None)

    request_history = {}

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump when the system clock is adjusted.
    started = time.perf_counter()
    G_out, wealth = algorithm(**kwargs, request_history=request_history)
    execution_time = time.perf_counter() - started

    cov = coverage(G_out, kwargs['g_in'])
    div = diversity(G_out)

    power, fdr = compute_significance(request_history, kwargs['alpha'])

    return G_out, wealth, cov, div, power, fdr, execution_time

In [4]:
# Read the three CSV exports that the Dataset objects are built from.
movie_dataframe, book_dataframe, yelp_dataframe = (
    pd.read_csv('../datasets/MovieLens/MovieLens.csv'),
    pd.read_csv('../datasets/BookCrossing/BookCrossing.csv'),
    pd.read_csv('../datasets/Yelp/Yelp.csv'),
)

In [5]:
def _build_dataset(dataframe, params):
    """Create a Dataset from a dataframe and the matching ``params`` module."""
    return Dataset(
        dataframe=dataframe,
        multi_value_attribute_names=params.multi_value_attribute_names,
        attributes=params.attributes,
        action_dimension=params.action_dimension,
        action_dimension_min=params.action_dimension_min,
        action_dimension_max=params.action_dimension_max,
    )


# One Dataset per source, each configured from its own params module.
movie_dataset = _build_dataset(movie_dataframe, movie_params)
book_dataset = _build_dataset(book_dataframe, book_params)
yelp_dataset = _build_dataset(yelp_dataframe, yelp_params)

In [28]:
# For every attribute of the MovieLens dataset, keep the (at most) five values
# whose single-predicate Group contains the most user_ids, largest first.
# NOTE(review): `vals_and_predicates` is left in the kernel namespace holding
# the pairs of the last attribute iterated, and other cells read it — keep
# that in mind before restructuring this loop.
possible_attributes = {att: [] for att in movie_dataset.attributes.keys()}
for att in movie_dataset.attributes:
    # (number of users, attribute value) pairs for the current attribute.
    vals_and_predicates = []
    for val in movie_dataset.attributes[att]:
        g = Group(dataset=movie_dataset, predicates={att: val})
        vals_and_predicates.append((len(g.user_ids), val))
    # Sort by user count (descending) and keep only the values of the top 5.
    possible_attributes[att] = [v_p[1] for v_p in sorted(vals_and_predicates, key = lambda x: x[0], reverse=True)[:5]]

In [26]:
# Values of the five largest (user count, value) pairs currently held in vals_and_predicates.
[value for _count, value in sorted(vals_and_predicates, key=lambda pair: pair[0], reverse=True)[:5]]

['Drama', 'Comedy', 'Action', 'Thriller', 'Romance']

In [29]:
# Display the values kept for each attribute.
possible_attributes

{'genre': ['Drama', 'Comedy', 'Action', 'Thriller', 'Romance'],
 'runtime_minutes': ['Long', 'Very Long', 'Short'],
 'year': ['90s', '80s', '70s', '60s', '2000s'],
 'gender': ['M', 'F'],
 'age': ['25-34', '35-44', '18-24', '45-49', '50-55'],
 'occupation': ['college-grad student',
  'other',
  'executive-managerial',
  'academic-educator',
  'technician-engineer'],
 'location': ['CA', 'NY', 'MN', 'TX', 'MA']}

In [30]:
# Row counts per location in the MovieLens dataframe (counts rows, not distinct users).
movie_dataframe['location'].value_counts()

location
CA    179806
NY     69514
MN     63583
TX     51812
IL     51510
MA     44506
OH     39665
MI     37233
WA     37087
FL     28646
VA     27095
PA     26464
OR     21765
WI     21685
NJ     20823
CO     18361
GA     17467
CT     15336
NC     15169
AZ     15036
MD     14716
MO     13667
TN     12277
IN     11033
KS      8845
OK      8424
ME      8410
IA      7798
KY      6768
SC      6001
NH      5961
LA      5853
UT      5235
DC      5177
NE      4871
NM      4813
AR      4482
NV      3994
DE      3778
HI      3761
WV      3680
AL      3486
ID      2539
RI      2523
VT      2424
MT      2292
SD      1932
WY      1147
ND      1022
AK       910
PR       751
GU       679
AE       604
MS       555
AP       211
Name: count, dtype: int64

In [27]:
# Full ranking of the (user count, value) pairs currently held in vals_and_predicates, largest first.
sorted(vals_and_predicates, key=lambda pair: pair[0], reverse=True)

[(6037, 'Drama'),
 (6031, 'Comedy'),
 (6012, 'Action'),
 (5988, 'Thriller'),
 (5961, 'Romance'),
 (5909, 'Sci-Fi'),
 (5893, 'Adventure'),
 (5761, 'War'),
 (5662, 'Crime'),
 (5297, 'Horror'),
 (5278, "Children's"),
 (5133, 'Mystery'),
 (4850, 'Fantasy'),
 (4787, 'Animation'),
 (4753, 'Musical'),
 (4150, 'Film-Noir'),
 (4100, 'Western'),
 (2239, 'Documentary')]

In [17]:
# Display the values kept for each attribute.
# NOTE(review): the output saved under this cell lists pipe-joined genres
# (e.g. 'Comedy|Romance') and disagrees with the other saved display of this
# variable; it appears to come from an earlier run — re-run to refresh.
possible_attributes

{'genre': ['Comedy',
  'Drama',
  'Comedy|Romance',
  'Comedy|Drama',
  'Drama|Romance'],
 'runtime_minutes': ['Long', 'Short', 'Very Long'],
 'year': ['90s', '80s', '70s', '60s', '2000s'],
 'gender': ['M', 'F'],
 'age': ['25-34', '35-44', '18-24', '45-49', '50-55'],
 'occupation': ['college-grad student',
  'other',
  'executive-managerial',
  'academic-educator',
  'technician-engineer'],
 'location': ['CA', 'NY', 'MN', 'TX', 'IL']}

In [9]:
# Settings shared by both algorithms in this experiment.
eta = 1
alpha = 0.05
gamma = 500
lambd = 1
n = 10
initial_wealth = eta * alpha
# Input group: Yelp users matching the city predicate.
g_in = Group(dataset=yelp_dataset, predicates={'city': 'Montréal'})
h = HypothesisTest(aggregation='mean', null_value=3, alternative='greater', n_sample=HypothesisTest.ONE_SAMPLE)


for name, algorithm in {'cover': cover_alpha, 'covdiv': covdiv_alpha}.items():
    print(name)
    print('\t', g_in)
    G_out, wealth, cov, div, power, fdr, execution_time = run_alpha_investing(
        algorithm,
        (name == 'covdiv'),  # only covdiv_alpha receives the lambd argument
        D=yelp_dataset,
        g_in=g_in,
        h=h,
        alpha=alpha,
        n=n,
        wealth=initial_wealth,
        gamma=gamma,
        lambd=lambd,
    )

    # Report each returned group and accumulate the total sample size.
    sample_size = 0

    for g in G_out:
        print(f'\t\t{g!s} | User count: {len(g.user_ids)} | Sample size: {len(g.sample)}')
        sample_size += len(g.sample)
    print('\t\t\tSample size:', sample_size)
    # cov and div were already computed by run_alpha_investing for the same
    # G_out / g_in, so they are reported directly instead of being recomputed.
    print('\t\t\tCoverage:', cov)
    print('\t\t\tDiversity:', div)
    print('\t\t\tPower:', power)
    print('\t\t\tFDR:', fdr)
    print('\t\t\tTime:', execution_time)

cover
	 city:Montréal
		category:Restaurant|city:Montréal | User count: 37536 | Sample size: 106517
		category:Home Services|city:Montréal | User count: 672 | Sample size: 763
		city:Montréal|fans:unpopular | User count: 27454 | Sample size: 47054
		city:Montréal|fans:popular | User count: 3347 | Sample size: 33742
		category:Mass Media|city:Montréal | User count: 34 | Sample size: 37
		category:Local Services|city:Montréal | User count: 757 | Sample size: 923
		city:Montréal|fans:semipopular | User count: 12659 | Sample size: 56126
		category:Event Planning & Services|city:Montréal | User count: 4573 | Sample size: 5617
		category:Education|city:Montréal | User count: 379 | Sample size: 439
		category:Restaurants|city:Montréal | User count: 37523 | Sample size: 106380
			Sample size: 357598
			Coverage: 1.0
			Diversity: 41.045549378976695
			Power: 1.0
			FDR: 0.0
			Time: 2.4212870597839355
covdiv
	 city:Montréal
		category:Beauty & Spas|city:Montréal | User count: 2170 | Sample siz