# Introduction

In our setting, users make a binary decision of whether or not to consume an item. Each item is represented by a vector $x\in\mathbb{R}^n$, and the outcome is represented by a binary variable $y\in\left\{-1,1\right\}$, such that $y=1$ when the item was consumed and $y=-1$ otherwise.

For the implementation of behavioral bias models, we define the following abstract classes:

In [None]:
import abc

class UserModel(abc.ABC):
  '''
  Abstract interface for the behavioral model of a single user.

  Subclasses must implement __call__ (valuations); predict raises
  NotImplementedError unless a subclass overrides it.
  '''

  # abc.abstractclassmethod is deprecated and would wrap __call__ as a
  # classmethod; valuations are computed per instance, so a plain
  # abstract method is the correct declaration.
  @abc.abstractmethod
  def __call__(self, X, *args):
    '''
    Given items X, calculate the user's valuation v(x) for each item x
    '''
    raise NotImplementedError()

  def predict(self, X, *args):
    '''
    Given items X, calculate the user's preference (-1/1) for each item x
    '''
    raise NotImplementedError()


class PopulationModel(abc.ABC):
  '''
  Abstract interface for the behavioral model of a population of users.

  Subclasses must implement __call__ (valuations); predict raises
  NotImplementedError unless a subclass overrides it.
  '''

  # abc.abstractclassmethod is deprecated and would wrap __call__ as a
  # classmethod; valuations are computed per instance, so a plain
  # abstract method is the correct declaration.
  @abc.abstractmethod
  def __call__(self, X, *args):
    '''
    Given items X, calculate each user's valuation v(x) for each item x
    '''
    raise NotImplementedError()

  def predict(self, X, *args):
    '''
    Given items X, calculate each user's preference y for each item x
    '''
    raise NotImplementedError()


## **A simple example**

**Create a user model**

In [None]:
import numpy as np

class NoiseUserModel(UserModel):
  """
  Linear user with additive Gaussian noise: v(x) = u*x + noise,
  where the utility vector u is drawn at random on construction.
  """

  def __init__(self, num_features):
    # u holds one standard-normal entry per item feature
    utility = np.random.randn(num_features)
    self.num_features = num_features
    self.utility_vec = utility

  def __call__(self, X, noise_std):
    """
    Return the noisy valuation of every row of X. A fresh noise sample,
    scaled by noise_std, is drawn for each row on every call.
    """
    num_rows = X.shape[0]
    perturbation = noise_std * np.random.randn(num_rows)
    clean_values = X @ self.utility_vec
    return clean_values + perturbation

  def predict(self, X, noise_std):
    """
    Return the sign of the noisy valuation of every row of X.
    """
    valuations = self(X, noise_std)
    return np.sign(valuations)

**Create a population model**

In [None]:
class NoisePopulationModel(PopulationModel):
  """
  A population made of independently initialised NoiseUserModel users.
  """

  def __init__(self, num_users, num_features):
    members = []
    for _ in range(num_users):
      members.append(NoiseUserModel(num_features))
    self.user_models = members

  def __call__(self, X, noise_std):
    """
    Evaluate every user on X and return the valuations as a matrix of
    size (number of items X number of users).
    """
    per_user = np.array([user(X, noise_std) for user in self.user_models])
    # users are stacked as rows above; transpose so that items index rows
    return per_user.T

  def predict(self, X, noise_std):
    """
    Return the elementwise sign of the valuation matrix computed for X.
    """
    valuation_matrix = self(X, noise_std)
    return np.sign(valuation_matrix)

**Generate a synthetic dataset**

In [None]:
def generate_synth_data(num_features, num_items):
  """
  Draw a synthetic item matrix with standard-normal entries.

  Returns an array of size (num_items X num_features), one row per item.
  """
  shape = (num_items, num_features)
  return np.random.randn(*shape)

# Dimensions of the synthetic experiment
num_features = 3
num_items = 100
num_users = 10

# Create a population model (each user draws its own random utility vector)
population_model = NoisePopulationModel(num_users, num_features)

# Generate items: a (num_items X num_features) matrix
X = generate_synth_data(num_features, num_items)
# Keyword arguments forwarded to the population model on every call
args = {"noise_std":0.5}

# calculate users preferences according to the model
# (y holds the sign of each user's noisy valuation of each item)
y = population_model.predict(X, **args)

**Create a training pipeline**

In [None]:
from sklearn.model_selection import train_test_split

class TrainingPipeline():
  """
  Splits a dataset, fits a model on the train part and scores the fitted
  model on the test part with a set of named metrics.
  """

  def __init__(self, training_model, metrics, test_size=0.33,
               seed=0):
    """
    training_model: a scikit training model
    metrics: a dictionary of metric functions (values) and their names (keys).
    each one takes (training_model, population_model, model_args, X) as arguments.
    test_size: forwarded to train_test_split as test_size
    seed: forwarded to train_test_split as random_state
    """
    self.training_model = training_model
    self.metrics = metrics
    self.test_size = test_size
    self.seed = seed

  # Splits the data into train and test sets
  def split(self, X, y):
    """
    Returns (X_train, X_test, y_train, y_test) as produced by train_test_split.
    """
    return train_test_split(X, y, test_size=self.test_size, random_state=self.seed)

  # train the model on the train dataset
  def fit(self, X_train, y_train):
    """
    Fits self.training_model in place on (X_train, y_train).
    """
    self.training_model.fit(X_train, y_train)

  # activate each metric on the test set
  def evaluate(self, population_model, model_args, X_test):
    """
    Returns a dictionary mapping each metric name to its value on X_test.
    model_args may be None, in which case an empty dictionary is passed on.
    """
    if model_args is None:
      model_args = {}
    # Use the metrics given to this pipeline, not a module-level global
    # that merely happens to share the name.
    return {
        name: metric(self.training_model, population_model, model_args, X_test)
        for name, metric in self.metrics.items()
    }

  # Split, fit and evaluate
  def __call__(self, X, y, population_model, model_args=None):
    """
    Calls split, then trains the model and then evaluates it according to each metric
    """
    # The test labels are not needed: every metric receives the population
    # model and X_test rather than y_test.
    X_train, X_test, y_train, _ = self.split(X, y)
    self.fit(X_train, y_train)
    return self.evaluate(population_model, model_args, X_test)

**Create some evaluation metrics**

In [None]:
from sklearn.metrics import accuracy_score

# calculates the accuracy of the training_model
def accuracy(training_model, population_model, model_args, X):
  """
  Accuracy of the trained model on X: the signs of its predictions are
  compared, entry by entry, with the signs of the valuations produced by
  population_model(X, **model_args).
  """
  target = population_model(X, **model_args)
  estimate = training_model.predict(X)

  # flatten both sign matrices so that every entry counts as one sample
  target_signs = np.sign(target).flatten()
  estimate_signs = np.sign(estimate).flatten()
  return accuracy_score(target_signs, estimate_signs)

**Train!**

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

# Create linear regression object; the pipeline fits it on the sign labels y
training_model = LinearRegression()

# Map each metric name to its metric function
metrics = {"accuracy" : accuracy}

# Create pipeline (uses the default test_size and seed)
t = TrainingPipeline(training_model, metrics)

# Train and evaluate: split the data, fit on the train part and
# compute every metric on the test part
args = {"noise_std": 0.5}
results = t(X, y, population_model, args)
# Bare expression so that the notebook displays the results dictionary
results

{'accuracy': 0.8848484848484849}