In [1]:
import importlib
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# dir_r3 = 'C:/Users/Sten Stokroos/Desktop/NEW/zelf/Data/out'
# Directory that holds the rating CSV files read by choose_data().
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
dir_ml = 'C:/Users/Sten Stokroos/Desktop/NEW/zelf/Data/out'
# Seed for NumPy's global RNG. The train_test_split calls in choose_data() pass
# their own literal random_state=42 and do not read this variable.
randseed = 42
print("random seed: ", randseed)
# Only NumPy is seeded in this cell; no TensorFlow seed is set here.
np.random.seed(randseed)

def choose_data(dat, test_size, val_size):
    """Load a rating dataset and return its train/validation/test partitions.

    Parameters
    ----------
    dat : str
        ``'ml2'`` reads the pre-split files ``ml_train2.csv`` and
        ``ml_test2.csv`` from ``dir_ml`` (there is no validation part).
        ``'ml'`` reads ``ml-1m_full.csv`` from ``dir_ml`` and splits it.
    test_size : float
        Fraction of all ratings placed in the test split (``'ml'`` only).
    val_size : float
        Fraction of all ratings placed in the validation split (``'ml'`` only).

    Returns
    -------
    tuple
        ``(train, val, test, n_users, n_items)``. ``val`` is ``None`` for
        ``'ml2'``. For an unrecognised ``dat`` a message is printed and all
        five entries are ``None``, so callers can always unpack five values.
    """
    read_opts = dict(sep="\t", header=None, names=['userId', 'songId', 'rating'], usecols=[0, 1, 2], engine="python")

    if dat == 'ml2':
        train = pd.read_csv(os.path.join(dir_ml, 'ml_train2.csv'), **read_opts)
        test = pd.read_csv(os.path.join(dir_ml, 'ml_test2.csv'), **read_opts)
        val = None  # no validation set for 'ml2'
        # The pre-split files are combined only to count users/items and rows.
        full = pd.concat([train, test], ignore_index=True)
    elif dat == 'ml':
        full = pd.read_csv(os.path.join(dir_ml, 'ml-1m_full.csv'), **read_opts)
        # Rating rows (not users) are split at random. The second split is
        # rescaled so the validation part is val_size of the *full* data.
        train, test = train_test_split(full, test_size=test_size, random_state=42)
        train, val = train_test_split(train, test_size=val_size / (1 - test_size), random_state=42)
    else:
        print('Wrong data input')
        return None, None, None, None, None

    # Counts of distinct ids; load_data_rating uses them as matrix dimensions.
    # NOTE(review): that only works if ids are contiguous and zero-based --
    # confirm this also holds for the 'ml2' files.
    n_users = len(full['userId'].unique())
    n_items = len(full['songId'].unique())

    n_val = val.shape[0] if val is not None else 0

    # Print the sizes of the datasets, then the total before and after
    # splitting as a sanity check that no rating was lost.
    print(f"Train set size: {train.shape[0]} ratings")
    print(f"Validation set size: {n_val} ratings")
    print(f"Test set size: {test.shape[0]} ratings")
    print(full.shape[0])
    print(train.shape[0] + n_val + test.shape[0])

    return train, val, test, n_users, n_items

def load_confounders(dat, k):
    """Read the ``{dat}_exp_k_{k}.csv`` exposure output and return it transposed.

    The file is read without a header row and converted to a NumPy array;
    the transpose of that array is returned.
    """
    csv_path = f'C:/Users/Sten Stokroos/Desktop/NEW/zelf/Data/exposure_output/{dat}_exp_k_{k}.csv'
    return pd.read_csv(csv_path, header=None).to_numpy().T

def load_data_rating(dat, columns=[0, 1, 2], sep="\t", include_validation=False, test_size=0.1, val_size=0.1):
    """Load a dataset through ``choose_data`` and convert each split to a sparse matrix.

    Parameters
    ----------
    dat : str
        Dataset key forwarded to ``choose_data``.
    columns, sep :
        Accepted for call compatibility; not used by this function (and
        ``columns`` is never mutated).
    include_validation : bool
        If True the validation split is returned as its own matrix. If False
        the validation ratings (when present) are appended to the training
        ratings and no validation matrix is returned.
    test_size, val_size : float
        Split fractions forwarded to ``choose_data``.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix, vad_matrix, n_users, n_items)``.
        Matrices are DOK sparse matrices of shape ``(n_users, n_items)``;
        ``vad_matrix`` is ``None`` when there is no separate validation set.
        All five entries are ``None`` when the dataset could not be loaded.
    """
    loaded = choose_data(dat, test_size, val_size)

    # Inspect the result before unpacking: a failure tuple may be shorter than
    # five entries, and unpacking it directly would raise ValueError before
    # the failure could be reported to the caller.
    if loaded is None or loaded[0] is None or loaded[2] is None:
        return None, None, None, None, None
    train, val, test, n_users, n_items = loaded

    if include_validation:
        # Use the provided validation set
        vad = val
    else:
        # Fold the validation ratings back into training and sort by userId
        if val is not None:
            train = pd.concat([train, val])
        train = train.sort_values(by='userId').reset_index(drop=True)
        vad = None

    def build_matrix(df):
        """Turn a (user, item, rating) frame into an n_users x n_items DOK matrix."""
        # Columns are taken by position (user, item, rating) and used directly
        # as row index, column index and value.
        user_idx = df.iloc[:, 0].to_numpy()
        item_idx = df.iloc[:, 1].to_numpy()
        values = df.iloc[:, 2].to_numpy()
        return csr_matrix((values, (user_idx, item_idx)), shape=(n_users, n_items)).todok()

    train_matrix = build_matrix(train)
    test_matrix = build_matrix(test)
    vad_matrix = build_matrix(vad) if vad is not None else None

    print("Load data finished. Number of users:", n_users, "Number of items:", n_items)
    return train_matrix, test_matrix, vad_matrix, n_users, n_items






random seed:  42


In [7]:
def run_model(module_name, class_name, k, dat='ml', include_validation=False, use_confounder=False, use_exposure=False, test_size=0.1, val_size=0.1, hidden_neuron=500, learning_rate=0.001, reg_rate=0.1, epoch=20, batch_size=200, verbose=False, T=1, display_step=1000, save_path=None):
    """Train one model and score it on the validation split.

    Parameters
    ----------
    module_name, class_name : str
        Module to import and the model class to take from it.
    k : int
        Passed to ``load_confounders`` to select the confounder file.
    dat, include_validation, test_size, val_size :
        Forwarded to ``load_data_rating``.
    use_confounder : bool
        Load the confounder array and pass it to the model's ``execute`` and
        ``test`` methods.
    use_exposure : bool
        Also pass a dense 0/1 matrix marking the non-zero training entries
        (transposed). It is only passed together with the confounder;
        ``use_exposure`` without ``use_confounder`` has no effect.
    hidden_neuron : int
        Passed to the model's ``build_network``.
    learning_rate, reg_rate, epoch, batch_size, verbose, T, display_step :
        Passed to the model constructor.
    save_path :
        Accepted but not used.

    Returns
    -------
    tuple
        ``(rmse, mae)`` as returned by the model's ``test`` on the validation
        matrix, or ``(None, None)`` when there is no validation matrix.

    Raises
    ------
    ValueError
        If ``load_data_rating`` could not load the dataset.
    """
    train, test, vad, user, item = load_data_rating(dat, columns=[0, 1, 2], sep="\t", include_validation=include_validation, test_size=test_size, val_size=val_size)
    if train is None:
        raise ValueError(f"Could not load dataset {dat!r}")

    # Optional extra positional inputs for execute()/test(), in the order
    # (confounder, exposure).
    model_inputs = ()
    if use_confounder:
        # Load confounder data
        model_inputs = (load_confounders(dat, k),)
        if use_exposure:
            # Create exposure matrix
            model_inputs += ((train > 0).astype(np.float32).todense().T,)

    # Start from an empty default graph so repeated calls (for example one per
    # hyperopt trial) do not keep adding networks to the same process-wide graph.
    tf.compat.v1.reset_default_graph()

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.compat.v1.Session(config=config) as sess:
        module = importlib.import_module(module_name)
        model_class = getattr(module, class_name)
        final_model = model_class(sess, user, item, learning_rate=learning_rate, reg_rate=reg_rate, epoch=epoch, batch_size=batch_size, verbose=verbose, T=T, display_step=display_step)

        final_model.build_network(hidden_neuron=hidden_neuron)

        final_model.execute(train, vad, *model_inputs)

        if vad is not None:
            rmse, mae = final_model.test(vad, *model_inputs)
        else:
            rmse, mae = None, None

    return rmse, mae

def objective_urec1conf(params):
    """Hyperopt objective for ``UAutoRec1conf`` (confounder only, no exposure).

    Reads ``learning_rate``, ``reg_rate``, ``hidden_neuron`` and ``k`` from
    ``params``, runs ``run_model`` on the ``'ml'`` data with
    ``include_validation=True``, and reports the RMSE it returns as the loss.
    """
    learning_rate, reg_rate, hidden_neuron, k = (
        params[key] for key in ('learning_rate', 'reg_rate', 'hidden_neuron', 'k')
    )

    score_rmse, _ = run_model(
        'urec_1_conf',
        'UAutoRec1conf',
        k,
        dat='ml',
        include_validation=True,
        use_confounder=True,
        use_exposure=False,
        hidden_neuron=hidden_neuron,
        learning_rate=learning_rate,
        reg_rate=reg_rate,
        batch_size=500,
        epoch=20,
    )

    return {'loss': score_rmse, 'status': STATUS_OK}


def objective_urec2confexp(params):
    """Hyperopt objective for ``UAutoRec2confexp`` (confounder plus exposure).

    Reads ``learning_rate``, ``reg_rate``, ``hidden_neuron`` and ``k`` from
    ``params``, runs ``run_model`` on the ``'ml'`` data with
    ``include_validation=True``, and reports the RMSE it returns as the loss.
    """
    learning_rate, reg_rate, hidden_neuron, k = (
        params[key] for key in ('learning_rate', 'reg_rate', 'hidden_neuron', 'k')
    )

    score_rmse, _ = run_model(
        'urec_2_confexp',
        'UAutoRec2confexp',
        k,
        dat='ml',
        include_validation=True,
        use_confounder=True,
        use_exposure=True,
        hidden_neuron=hidden_neuron,
        learning_rate=learning_rate,
        reg_rate=reg_rate,
        batch_size=500,
        epoch=20,
    )

    return {'loss': score_rmse, 'status': STATUS_OK}


In [5]:
# Search space shared by the objective functions: a continuous learning rate
# plus three categorical parameters.
space = {
    'learning_rate': hp.uniform('learning_rate', 0.0001, 0.01),
    'reg_rate': hp.choice('reg_rate', [0.001, 0.01, 0.1, 1, 100, 1000]),
    'hidden_neuron': hp.choice('hidden_neuron', [10, 20, 40, 80, 100, 200, 300, 400, 500]),
    'k': hp.choice('k', [1, 2, 5, 10, 20, 32, 50, 100]),
}

trials_urec1conf = Trials()
best_urec1conf = fmin(fn=objective_urec1conf, space=space, algo=tpe.suggest, max_evals=5, trials=trials_urec1conf)

print("Best parameters for UAutoRec1conf:", best_urec1conf)

# fmin reports hp.choice parameters as indices into their option lists.
# space_eval maps them back through `space` itself, so the option lists do not
# have to be repeated here (and cannot drift out of sync with the search space).
decoded_urec1conf = space_eval(space, best_urec1conf)
best_params_urec1conf = {
    name: decoded_urec1conf[name]
    for name in ('learning_rate', 'reg_rate', 'hidden_neuron', 'k')
}

print("Best parameters for UAutoRec1conf in detailed form:", best_params_urec1conf)



  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]

Train set size: 800169 ratings                       
Validation set size: 100022 ratings                  
Test set size: 100022 ratings                        
1000213                                              
1000213                                              
Load data finished. Number of users:                 
6040                                                 
Number of items:                                     
3706                                                 
UAutoRec with Confounder.                            
Train data processed shape: (3706, 6040)             
Confounder data shape: (3706, 6040)                  
  0%|          | 0/5 [02:02<?, ?trial/s, best loss=?]

Training:   0%|          | 0/20 [00:00<?, ?epoch/s]
Training:   0%|          | 0/20 [00:05<?, ?epoch/s, Loss=2.36e+5, RMSE=1.29, MAE=1.07]
Training:   5%|5         | 1/20 [00:05<01:40,  5.29s/epoch, Loss=2.36e+5, RMSE=1.29, MAE=1.07]
Training:   5%|5         | 1/20 [00:10<01:40,  5.29s/epoch, Loss=8.6e+4, RMSE=1.04, MAE=0.813]
Training:  10%|#         | 2/20 [00:10<01:39,  5.54s/epoch, Loss=8.6e+4, RMSE=1.04, MAE=0.813]
Training:  10%|#         | 2/20 [00:16<01:39,  5.54s/epoch, Loss=7.07e+4, RMSE=1.02, MAE=0.803]
Training:  15%|#5        | 3/20 [00:16<01:31,  5.36s/epoch, Loss=7.07e+4, RMSE=1.02, MAE=0.803]
Training:  15%|#5        | 3/20 [00:21<01:31,  5.36s/epoch, Loss=6.77e+4, RMSE=1.01, MAE=0.813]
Training:  20%|##        | 4/20 [00:21<01:24,  5.26s/epoch, Loss=6.77e+4, RMSE=1.01, MAE=0.813]
Training:  20%|##        | 4/20 [00:26<01:24,  5.26s/epoch, Loss=6.62e+4, RMSE=0.998, MAE=0.799]
Training:  25%|##5       | 5/20 [00:26<01:18,  5.20s/epoch, Loss=6.62e+4, RMSE=0.998, MAE=0.799

Train set size: 800169 ratings                                                  
Validation set size: 100022 ratings                                             
Test set size: 100022 ratings                                                   
1000213                                                                         
1000213                                                                         
Load data finished. Number of users:                                            
6040                                                                            
Number of items:                                                                
3706                                                                            
UAutoRec with Confounder.                                                       
Train data processed shape: (3706, 6040)                                        
Confounder data shape: (3706, 6040)                                             
 20%|██        | 1/5 [05:51<

Training:   0%|          | 0/20 [00:00<?, ?epoch/s]
Training:   0%|          | 0/20 [00:05<?, ?epoch/s, Loss=3.09e+5, RMSE=1.16, MAE=0.921]
Training:   5%|5         | 1/20 [00:05<01:35,  5.04s/epoch, Loss=3.09e+5, RMSE=1.16, MAE=0.921]
Training:   5%|5         | 1/20 [00:09<01:35,  5.04s/epoch, Loss=1.31e+5, RMSE=1.11, MAE=0.872]
Training:  10%|#         | 2/20 [00:09<01:21,  4.53s/epoch, Loss=1.31e+5, RMSE=1.11, MAE=0.872]
Training:  10%|#         | 2/20 [00:13<01:21,  4.53s/epoch, Loss=1.08e+5, RMSE=1.06, MAE=0.865]
Training:  15%|#5        | 3/20 [00:13<01:13,  4.34s/epoch, Loss=1.08e+5, RMSE=1.06, MAE=0.865]
Training:  15%|#5        | 3/20 [00:17<01:13,  4.34s/epoch, Loss=9.64e+4, RMSE=1.04, MAE=0.832]
Training:  20%|##        | 4/20 [00:17<01:08,  4.29s/epoch, Loss=9.64e+4, RMSE=1.04, MAE=0.832]
Training:  20%|##        | 4/20 [00:21<01:08,  4.29s/epoch, Loss=9.11e+4, RMSE=1.01, MAE=0.812]
Training:  25%|##5       | 5/20 [00:21<01:03,  4.26s/epoch, Loss=9.11e+4, RMSE=1.01, MAE=0.8

Train set size: 800169 ratings                                                  
Validation set size: 100022 ratings                                             
Test set size: 100022 ratings                                                   
1000213                                                                         
1000213                                                                         
Load data finished. Number of users:                                            
6040                                                                            
Number of items:                                                                
3706                                                                            
UAutoRec with Confounder.                                                       
Train data processed shape: (3706, 6040)                                        
Confounder data shape: (3706, 6040)                                             
 40%|████      | 2/5 [09:23<

Training:   0%|          | 0/20 [00:00<?, ?epoch/s]
Training:   0%|          | 0/20 [00:05<?, ?epoch/s, Loss=4.78e+5, RMSE=1.24, MAE=1.01]
Training:   5%|5         | 1/20 [00:05<01:46,  5.61s/epoch, Loss=4.78e+5, RMSE=1.24, MAE=1.01]
Training:   5%|5         | 1/20 [00:10<01:46,  5.61s/epoch, Loss=1.97e+5, RMSE=1.12, MAE=0.911]
Training:  10%|#         | 2/20 [00:10<01:38,  5.47s/epoch, Loss=1.97e+5, RMSE=1.12, MAE=0.911]
Training:  10%|#         | 2/20 [00:16<01:38,  5.47s/epoch, Loss=1.38e+5, RMSE=1.07, MAE=0.856]
Training:  15%|#5        | 3/20 [00:16<01:31,  5.38s/epoch, Loss=1.38e+5, RMSE=1.07, MAE=0.856]
Training:  15%|#5        | 3/20 [00:21<01:31,  5.38s/epoch, Loss=1.2e+5, RMSE=1.08, MAE=0.867] 
Training:  20%|##        | 4/20 [00:21<01:26,  5.38s/epoch, Loss=1.2e+5, RMSE=1.08, MAE=0.867]
Training:  20%|##        | 4/20 [00:27<01:26,  5.38s/epoch, Loss=1.14e+5, RMSE=1.08, MAE=0.857]
Training:  25%|##5       | 5/20 [00:27<01:23,  5.56s/epoch, Loss=1.14e+5, RMSE=1.08, MAE=0.857]

Train set size: 800169 ratings                                                  
Validation set size: 100022 ratings                                             
Test set size: 100022 ratings                                                   
1000213                                                                         
1000213                                                                         
Load data finished. Number of users:                                            
6040                                                                            
Number of items:                                                                
3706                                                                            
UAutoRec with Confounder.                                                       
Train data processed shape: (3706, 6040)                                        
Confounder data shape: (3706, 6040)                                             
 60%|██████    | 3/5 [13:17<

Training:   0%|          | 0/20 [00:00<?, ?epoch/s]
Training:   0%|          | 0/20 [00:05<?, ?epoch/s, Loss=2.75e+5, RMSE=1.31, MAE=1.08]
Training:   5%|5         | 1/20 [00:05<01:49,  5.76s/epoch, Loss=2.75e+5, RMSE=1.31, MAE=1.08]
Training:   5%|5         | 1/20 [00:11<01:49,  5.76s/epoch, Loss=1.25e+5, RMSE=1.03, MAE=0.808]
Training:  10%|#         | 2/20 [00:11<01:39,  5.54s/epoch, Loss=1.25e+5, RMSE=1.03, MAE=0.808]
Training:  10%|#         | 2/20 [00:16<01:39,  5.54s/epoch, Loss=9.95e+4, RMSE=1.02, MAE=0.803]
Training:  15%|#5        | 3/20 [00:16<01:32,  5.43s/epoch, Loss=9.95e+4, RMSE=1.02, MAE=0.803]
Training:  15%|#5        | 3/20 [00:21<01:32,  5.43s/epoch, Loss=8.74e+4, RMSE=1.01, MAE=0.799]
Training:  20%|##        | 4/20 [00:21<01:24,  5.29s/epoch, Loss=8.74e+4, RMSE=1.01, MAE=0.799]
Training:  20%|##        | 4/20 [00:26<01:24,  5.29s/epoch, Loss=8.14e+4, RMSE=1, MAE=0.797]   
Training:  25%|##5       | 5/20 [00:26<01:19,  5.27s/epoch, Loss=8.14e+4, RMSE=1, MAE=0.797]
T

Train set size: 800169 ratings                                                  
Validation set size: 100022 ratings                                             
Test set size: 100022 ratings                                                   
1000213                                                                         
1000213                                                                         
Load data finished. Number of users:                                            
6040                                                                            
Number of items:                                                                
3706                                                                            
UAutoRec with Confounder.                                                       
Train data processed shape: (3706, 6040)                                        
Confounder data shape: (3706, 6040)                                             
 80%|████████  | 4/5 [17:25<

Training:   0%|          | 0/20 [00:00<?, ?epoch/s]
Training:   0%|          | 0/20 [00:05<?, ?epoch/s, Loss=2.41e+5, RMSE=1.24, MAE=1.03]
Training:   5%|5         | 1/20 [00:05<01:52,  5.94s/epoch, Loss=2.41e+5, RMSE=1.24, MAE=1.03]
Training:   5%|5         | 1/20 [00:11<01:52,  5.94s/epoch, Loss=7.87e+4, RMSE=1.04, MAE=0.84]
Training:  10%|#         | 2/20 [00:11<01:46,  5.92s/epoch, Loss=7.87e+4, RMSE=1.04, MAE=0.84]
Training:  10%|#         | 2/20 [00:17<01:46,  5.92s/epoch, Loss=6.99e+4, RMSE=1.01, MAE=0.797]
Training:  15%|#5        | 3/20 [00:17<01:38,  5.81s/epoch, Loss=6.99e+4, RMSE=1.01, MAE=0.797]
Training:  15%|#5        | 3/20 [00:23<01:38,  5.81s/epoch, Loss=6.74e+4, RMSE=1.01, MAE=0.796]
Training:  20%|##        | 4/20 [00:23<01:33,  5.84s/epoch, Loss=6.74e+4, RMSE=1.01, MAE=0.796]
Training:  20%|##        | 4/20 [00:29<01:33,  5.84s/epoch, Loss=6.65e+4, RMSE=1, MAE=0.796]   
Training:  25%|##5       | 5/20 [00:29<01:29,  5.98s/epoch, Loss=6.65e+4, RMSE=1, MAE=0.796]
Tra

100%|██████████| 5/5 [19:14<00:00, 230.93s/trial, best loss: 0.9571537004565192]
Best parameters for UAutoRec1conf: {'hidden_neuron': 7, 'k': 7, 'learning_rate': 0.0026072943828274455, 'reg_rate': 0}


NameError: name 'best' is not defined

In [10]:
def run_model(module_name, class_name, k, dat='ml', include_validation=False, use_confounder=False, use_exposure=False, test_size=0.1, val_size=0.1, hidden_neuron=500, learning_rate=0.001, reg_rate=0.1, epoch=20, batch_size=200, verbose=False, T=1, display_step=1000, save_path=None):
    """Train one model and score it on the held-out test split.

    NOTE(review): this redefinition shadows the ``run_model`` defined earlier
    in the notebook. Once this cell has run, the hyperopt objective functions
    (which look up ``run_model`` by name) use this test-set version as well.

    Parameters
    ----------
    module_name, class_name : str
        Module to import and the model class to take from it.
    k : int
        Passed to ``load_confounders`` to select the confounder file.
    dat, include_validation, test_size, val_size :
        Forwarded to ``load_data_rating``. ``include_validation`` only decides
        whether the validation ratings are kept apart or merged into the
        training matrix; evaluation always uses the test matrix.
    use_confounder : bool
        Load the confounder array and pass it to the model's ``execute`` and
        ``test`` methods.
    use_exposure : bool
        Also pass a dense 0/1 matrix marking the non-zero training entries
        (transposed). It is only passed together with the confounder;
        ``use_exposure`` without ``use_confounder`` has no effect.
    hidden_neuron : int
        Passed to the model's ``build_network``.
    learning_rate, reg_rate, epoch, batch_size, verbose, T, display_step :
        Passed to the model constructor.
    save_path :
        Accepted but not used.

    Returns
    -------
    tuple
        ``(rmse, mae)`` as returned by the model's ``test`` on the test matrix.

    Raises
    ------
    ValueError
        If ``load_data_rating`` could not load the dataset.
    """
    train, test, vad, user, item = load_data_rating(dat, columns=[0, 1, 2], sep="\t", include_validation=include_validation, test_size=test_size, val_size=val_size)
    if train is None:
        raise ValueError(f"Could not load dataset {dat!r}")

    # Optional extra positional inputs for execute()/test(), in the order
    # (confounder, exposure).
    model_inputs = ()
    if use_confounder:
        # Load confounder data
        model_inputs = (load_confounders(dat, k),)
        if use_exposure:
            # Create exposure matrix
            model_inputs += ((train > 0).astype(np.float32).todense().T,)

    # Start from an empty default graph so networks built by earlier calls
    # (for example the hyperopt trials) are not carried along.
    tf.compat.v1.reset_default_graph()

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.compat.v1.Session(config=config) as sess:
        module = importlib.import_module(module_name)
        model_class = getattr(module, class_name)
        final_model = model_class(sess, user, item, learning_rate=learning_rate, reg_rate=reg_rate, epoch=epoch, batch_size=batch_size, verbose=verbose, T=T, display_step=display_step)

        final_model.build_network(hidden_neuron=hidden_neuron)

        # The test matrix is also what execute() receives as its second argument.
        final_model.execute(train, test, *model_inputs)

        # Always evaluate on the test matrix. Gating this on the validation
        # matrix would return (None, None) whenever include_validation=False,
        # which is exactly how the final evaluation run is configured.
        rmse, mae = final_model.test(test, *model_inputs)

    return rmse, mae

# Decode the fmin result: hp.choice parameters come back as indices into their
# option lists. space_eval maps them back through `space` itself, so the
# option lists do not have to be copied here by hand.
decoded_urec1conf = space_eval(space, best_urec1conf)
best_params_urec1conf = {
    name: decoded_urec1conf[name]
    for name in ('learning_rate', 'reg_rate', 'hidden_neuron', 'k')
}

print("Best parameters for UAutoRec1conf in detailed form:", best_params_urec1conf)

# Final run with the selected hyperparameters: validation ratings are merged
# into the training data (include_validation=False) and training runs for 80
# epochs instead of the 20 used during the search.
final_rmse, final_mae = run_model('urec_1_conf', 'UAutoRec1conf', best_params_urec1conf['k'], dat='ml', include_validation=False, use_confounder=True, use_exposure=False, hidden_neuron=best_params_urec1conf['hidden_neuron'], learning_rate=best_params_urec1conf['learning_rate'], reg_rate=best_params_urec1conf['reg_rate'], batch_size=500, epoch=80)

print(f"Final RMSE for UAutoRec1conf: {final_rmse}, Final MAE for UAutoRec1conf: {final_mae}")


Best parameters for UAutoRec1conf in detailed form: {'learning_rate': 0.0026072943828274455, 'reg_rate': 0.001, 'hidden_neuron': 400, 'k': 100}
Train set size: 800169 ratings
Validation set size: 100022 ratings
Test set size: 100022 ratings
1000213
1000213
Load data finished. Number of users: 6040 Number of items: 3706
UAutoRec with Confounder.
Train data processed shape: (3706, 6040)
Confounder data shape: (3706, 6040)


Training:  84%|████████▍ | 67/80 [06:10<01:11,  5.53s/epoch, Loss=2.91e+4, RMSE=0.885, MAE=0.698]


KeyboardInterrupt: 

In [None]:
# Scratch notes (presumably abbreviated per-epoch RMSE readings -- TODO confirm): 0.949, 0.946, 0.94, 0.937, 33, 28, 25, 18, 14, 13, 07, 03, 0, 97, 94, 93, 9, 87, 88, 886, 883, 882, 88, 79, 78, 77, 76, 75, 74, 76, 78, 79