In [1]:
import tensorflow as tf
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from tqdm import tqdm
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import os

# Directories that choose_data() joins with the rating CSV file names.
# NOTE(review): both entries are the same machine-specific absolute path.
dir_r3 = 'C:/Users/Sten Stokroos/Desktop/NEW/zelf/Data/out'
dir_ml = 'C:/Users/Sten Stokroos/Desktop/NEW/zelf/Data/out'
# Seed passed as random_state to the 'ml' train/test split in choose_data().
randseed = 42

def choose_data(dat, test_size=0.1):
    """Load one of the two rating datasets and return (full, train, test).

    Args:
        dat: 'r3' reads the pre-split r3_train.csv / r3_test.csv files;
            'ml' reads ml-1m_full.csv and splits it randomly.
        test_size: Fraction handed to train_test_split for the 'ml' dataset.
            It is not used for 'r3', whose split comes from the files.

    Returns:
        A (full, train, test) tuple of DataFrames with the columns
        userId, songId and rating, or (None, None, None) after printing a
        message when ``dat`` is not a known dataset key.
    """

    def read_ratings(directory, filename):
        # All rating files are header-less, tab-separated triples.
        return pd.read_csv(os.path.join(directory, filename), sep="\t", header=None,
                           names=['userId', 'songId', 'rating'], usecols=[0, 1, 2], engine="python")

    if dat == 'r3':
        train = read_ratings(dir_r3, 'r3_train.csv')
        test = read_ratings(dir_r3, 'r3_test.csv')

        # The full dataset is the union of both splits, ordered by user then item.
        combined = pd.concat([train, test])
        ordered = combined.sort_values(by=['userId', 'songId'])
        r3_full = ordered.reset_index(drop=True)
        return r3_full, train, test

    if dat == 'ml':
        ml_full = read_ratings(dir_ml, 'ml-1m_full.csv')
        train, test = train_test_split(ml_full, test_size=test_size, random_state=randseed)
        return ml_full, train, test

    print('Wrong data input')
    return None, None, None

class UAutoRec():
    """Autoencoder recommender with a second, confounder-driven input path.

    The network reconstructs an (items x users) rating matrix from two inputs
    of the same shape: the observed ratings and a confounder matrix. Each
    input has its own sigmoid hidden layer; the two hidden activations are
    summed before the linear output layer. Built on the TF1 graph/session API
    (``tf.compat.v1``).
    """

    def __init__(self, sess, num_user, num_item, learning_rate=0.001, reg_rate=0.1, epoch=20, batch_size=200,
                 verbose=False, T=3, display_step=1000):
        """Store hyperparameters; the graph itself is built by build_network().

        Args:
            sess: Session used for every ``run`` call.
            num_user: Number of users (columns of the internal matrices).
            num_item: Number of items (rows of the internal matrices).
            learning_rate: Learning rate of the Adam optimizer.
            reg_rate: Weight of the L2 penalty on W, V_R and V_C.
            epoch: Number of training epochs run by execute().
            batch_size: Number of users per training batch.
            verbose: Stored only; not read by the methods of this class.
            T: execute() evaluates on the test data every T epochs.
            display_step: Stored only; not read by the methods of this class.
        """
        self.learning_rate = learning_rate
        self.epochs = epoch
        self.batch_size = batch_size
        self.reg_rate = reg_rate
        self.sess = sess
        self.num_user = num_user
        self.num_item = num_item
        self.verbose = verbose
        self.T = T
        self.display_step = display_step
        print("UAutoRec with Confounder.")

    def build_network(self, hidden_neuron=500):
        """Create placeholders, both encoder paths, the decoder, loss and optimizer.

        Args:
            hidden_neuron: Width of each hidden layer (rating and confounder path).
        """
        # All inputs are (num_item, batch) so a batch is a set of user columns.
        self.rating_matrix = tf.compat.v1.placeholder(dtype=tf.float32, shape=[self.num_item, None])
        self.rating_matrix_mask = tf.compat.v1.placeholder(dtype=tf.float32, shape=[self.num_item, None])
        self.confounder_matrix = tf.compat.v1.placeholder(dtype=tf.float32, shape=[self.num_item, None])

        # Rating path
        V_R = tf.Variable(tf.random.normal([hidden_neuron, self.num_item], stddev=0.01))
        mu_R = tf.Variable(tf.random.normal([hidden_neuron], stddev=0.01))
        layer_1_R = tf.sigmoid(tf.expand_dims(mu_R, 1) + tf.matmul(V_R, self.rating_matrix))

        # Confounder path
        V_C = tf.Variable(tf.random.normal([hidden_neuron, self.num_item], stddev=0.01))
        mu_C = tf.Variable(tf.random.normal([hidden_neuron], stddev=0.01))
        layer_1_C = tf.sigmoid(tf.expand_dims(mu_C, 1) + tf.matmul(V_C, self.confounder_matrix))

        # Combine paths
        layer_1 = layer_1_R + layer_1_C

        # Output layer
        W = tf.Variable(tf.random.normal([self.num_item, hidden_neuron], stddev=0.01))
        b = tf.Variable(tf.random.normal([self.num_item], stddev=0.01))
        self.layer_2 = tf.matmul(W, layer_1) + tf.expand_dims(b, 1)
        # Squared norm of the reconstruction error restricted to the masked
        # entries, plus an L2 penalty on the three weight matrices (the bias
        # vectors are not penalized). tf.norm already yields a scalar here,
        # so reduce_mean leaves the value unchanged.
        self.loss = tf.reduce_mean(tf.square(
            tf.norm(tf.multiply((self.rating_matrix - self.layer_2), self.rating_matrix_mask)))) + self.reg_rate * (
        tf.square(tf.norm(W)) + tf.square(tf.norm(V_R)) + tf.square(tf.norm(V_C)))
        self.optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)

    def _batch_indices(self, idxs):
        """Split ``idxs`` into consecutive batches of ``self.batch_size`` users.

        The last batch absorbs the remainder, so every index is used exactly
        once. At least one batch is always returned, which keeps train()
        working when there are fewer users than ``batch_size``.
        """
        size = self.batch_size
        n_batches = max(1, int(len(idxs) / size))
        batches = [idxs[k * size: (k + 1) * size] for k in range(n_batches - 1)]
        batches.append(idxs[(n_batches - 1) * size:])
        return batches

    def train(self, train_data, confounder_data):
        """Run one epoch over all users in random order and return the mean batch loss.

        Args:
            train_data: Not read here; the matrices prepared by execute()
                (``self.train_data`` / ``self.train_data_mask``) are used.
            confounder_data: Array indexed as [item, user], sliced per batch.

        Returns:
            Sum of the batch losses divided by the number of batches.

        Raises:
            IndexError: Re-raised after printing shape diagnostics when a
                batch index does not fit one of the input matrices.
        """
        self.num_training = self.num_user
        idxs = np.random.permutation(self.num_training)  # shuffled ordering
        batches = self._batch_indices(idxs)

        total_loss = 0
        for batch_set_idx in batches:
            try:
                _, loss = self.sess.run([self.optimizer, self.loss],
                                        feed_dict={self.rating_matrix: self.train_data[:, batch_set_idx],
                                                   self.rating_matrix_mask: self.train_data_mask[:, batch_set_idx],
                                                   self.confounder_matrix: confounder_data[:, batch_set_idx]})
                total_loss += loss
            except IndexError as e:
                print(f"IndexError: {e}")
                print(f"Max index in batch_set_idx: {max(batch_set_idx)}")
                print(f"Train data shape: {self.train_data.shape}")
                print(f"Confounder data shape: {confounder_data.shape}")
                raise

        return total_loss / len(batches)

    def test(self, test_data, confounder_data):
        """Reconstruct all ratings and score them against ``test_data``.

        The forward pass is fed the full training matrix; its output is kept
        in ``self.reconstruction`` for predict().

        Args:
            test_data: Mapping with (user, item) keys and rating values,
                accessed through ``keys()`` and ``get()``.
            confounder_data: Array fed to the confounder placeholder.

        Returns:
            (rmse, mae) over the entries of ``test_data``.
        """
        self.reconstruction = self.sess.run(self.layer_2, feed_dict={self.rating_matrix: self.train_data,
                                                                     self.rating_matrix_mask: self.train_data_mask,
                                                                     self.confounder_matrix: confounder_data})
        error = 0
        error_mae = 0
        test_set = list(test_data.keys())
        for (u, i) in test_set:
            actual = float(test_data.get((u, i)))
            pred_rating_test = self.predict(u, i)
            error += (actual - pred_rating_test) ** 2
            error_mae += (np.abs(actual - pred_rating_test))
        rmse = RMSE(error, len(test_set))
        mae = MAE(error_mae, len(test_set))
        return rmse, mae

    def execute(self, train_data, test_data, confounder_data):
        """Prepare the training matrices, initialize variables and train.

        Args:
            train_data: Sparse (users x items) ratings; it is transposed and
                densified into ``self.train_data``.
            test_data: Mapping passed to test() every ``self.T`` epochs.
            confounder_data: Array indexed as [item, user].
        """
        self.train_data = self._data_process(train_data.transpose())
        # np.sign turns every positive rating into 1, giving the observed-entry mask.
        self.train_data_mask = np.sign(self.train_data)
        print(f"Train data processed shape: {self.train_data.shape}")
        print(f"Confounder data shape: {confounder_data.shape}")
        init = tf.compat.v1.global_variables_initializer()
        self.sess.run(init)

        with tqdm(total=self.epochs, desc="Training", unit="epoch") as pbar:
            for epoch in range(self.epochs):
                avg_loss = self.train(train_data, confounder_data)
                if (epoch) % self.T == 0:
                    rmse, mae = self.test(test_data, confounder_data)
                    pbar.set_postfix({"Loss": avg_loss, "RMSE": rmse, "MAE": mae})
                pbar.update(1)

    def save(self, path):
        """Write the session's variables to ``path`` with a TF1 Saver."""
        saver = tf.compat.v1.train.Saver()
        saver.save(self.sess, path)

    def predict(self, user_id, item_id):
        """Return the reconstructed rating for (user_id, item_id).

        Requires a prior call to test(), which fills ``self.reconstruction``.

        Raises:
            IndexError: If either id lies outside [0, num_user) / [0, num_item).
                Negative ids are rejected too, because NumPy would otherwise
                wrap them around to another user or item.
        """
        if not (0 <= user_id < self.num_user and 0 <= item_id < self.num_item):
            raise IndexError("user_id or item_id out of bounds")
        return self.reconstruction[item_id, user_id]

    def _data_process(self, data):
        """Densify ``data`` into a float (num_item, num_user) array.

        Args:
            data: Either a sparse matrix of shape (num_item, num_user) or any
                mapping with (item, user) keys that supports ``get``.

        Returns:
            A dense float64 array with 0 for entries missing from ``data``.
        """
        target_shape = (self.num_item, self.num_user)
        to_dense = getattr(data, "toarray", None)
        if to_dense is not None and getattr(data, "shape", None) == target_shape:
            # One vectorized conversion instead of a Python-level lookup per cell.
            return np.asarray(to_dense(), dtype=np.float64)

        # Generic fallback (plain dicts, or sparse input of another shape).
        output = np.zeros(target_shape)
        for u in range(self.num_user):
            for i in range(self.num_item):
                output[i, u] = data.get((i, u), 0)  # Use .get() with a default value of 0
        return output

def RMSE(error, num):
    """Return the root mean squared error from a summed squared error and a count."""
    mean_squared = error / num
    return np.sqrt(mean_squared)

def MAE(error_mae, num):
    """Return the mean absolute error from a summed absolute error and a count."""
    mean_absolute = error_mae / num
    return mean_absolute

def load_data_rating(dat, columns=[0, 1, 2], sep="\t"):
    """Load a dataset and return train/test/validation ratings as DOK matrices.

    The training split returned by choose_data() is split once more (90/10,
    random_state=42) into the final training set and a validation set.

    Args:
        dat: Dataset key forwarded to choose_data().
        columns: Unused; kept so existing callers keep working.
        sep: Unused; kept so existing callers keep working.

    Returns:
        (train, test, validation, n_users, n_items), where the first three
        are (n_users x n_items) DOK sparse matrices of ratings.
    """
    full, train, test = choose_data(dat, test_size= 0.1)

    train, vad = train_test_split(train, test_size=0.1, random_state=42)

    # Size the matrices from every split. The validation rows were carved out
    # of the training data, so the largest id may only occur there; leaving
    # them out would make the validation matrix construction fail.
    splits = (train, vad, test)
    n_users = max(part['userId'].max() for part in splits) + 1
    n_items = max(part['songId'].max() for part in splits) + 1

    def to_dok(frame):
        # Build straight from the columns instead of iterating row by row.
        coords = (frame['userId'].to_numpy(), frame['songId'].to_numpy())
        return csr_matrix((frame['rating'].to_numpy(), coords), shape=(n_users, n_items)).todok()

    train_matrix = to_dok(train)
    test_matrix = to_dok(test)
    vd_matrix = to_dok(vad)

    print("Load data finished. Number of users:", n_users, "Number of items:", n_items)
    return train_matrix, test_matrix, vd_matrix, n_users, n_items


# NOTE(review): this loader call is commented out, yet objective() in the next
# cell reads `train`, `vad`, `user` and `item` as globals. They must come from
# an earlier interactive run; re-enable a call like this (with the dataset that
# matches the confounder file) before running the search from a fresh kernel.
# train, test, vad, user, item = load_data_rating('r3', columns=[0, 1, 2], sep="\t")

# Despite its name, this is the path of a single CSV file, not a directory.
CAUSEFIT_DIR = 'C:/Users/Sten Stokroos/Desktop/NEW/zelf/Data/exposure_output/ml_exp_k_30_050.csv'

# The file has no header row; every line is read as data.
conf_df = pd.read_csv(CAUSEFIT_DIR, header=None)

# Convert the DataFrame to a NumPy array
confounder_data = conf_df.to_numpy()
# Transposed so it is indexed [item, user] like the model's input matrices
# (presumably the file is stored users x items -- confirm against the producer).
confounder_data = confounder_data.T



In [2]:
# Session options shared by every hyperopt trial.
config = tf.compat.v1.ConfigProto()
# Let TensorFlow grow its GPU memory allocation on demand instead of reserving it all up front.
config.gpu_options.allow_growth = True

def objective(params):
    """Hyperopt objective: train one UAutoRec model and return its validation RMSE.

    Reads the module-level ``user``, ``item``, ``train``, ``vad``,
    ``confounder_data`` and ``config``.

    Args:
        params: Sampled point with the keys 'learning_rate', 'reg_rate',
            'hidden_neuron' and 'batch_size'.

    Returns:
        A hyperopt result dict whose 'loss' is the RMSE on the validation set.
    """
    learning_rate = params['learning_rate']
    reg_rate = params['reg_rate']
    hidden_neuron = params['hidden_neuron']
    batch_size = params['batch_size']

    # Every trial builds a new network. Without a reset all of them land in
    # the same default graph, so variables and ops from earlier trials pile
    # up and are re-initialized again on every later trial.
    tf.compat.v1.reset_default_graph()

    with tf.compat.v1.Session(config=config) as sess:
        model = UAutoRec(sess, user, item, learning_rate=learning_rate, reg_rate=reg_rate, epoch=20, batch_size=batch_size, verbose=True)
        model.build_network(hidden_neuron=hidden_neuron)
        # The validation set is also what execute() monitors during training.
        model.execute(train, vad, confounder_data)

        # Evaluate the model on the validation set
        rmse, mae = model.test(vad, confounder_data)
        return {'loss': rmse, 'status': STATUS_OK}

# Option lists for the categorical hyperparameters. fmin() reports the INDEX
# of the chosen hp.choice option, so the same lists must be used to decode it.
HIDDEN_NEURON_CHOICES = [100, 250, 500]
BATCH_SIZE_CHOICES = [100, 200, 400]

space = {
    'learning_rate': hp.uniform('learning_rate', 0.0001, 0.01),
    'reg_rate': hp.uniform('reg_rate', 0.0001, 0.01),
    'hidden_neuron': hp.choice('hidden_neuron', HIDDEN_NEURON_CHOICES),
    'batch_size': hp.choice('batch_size', BATCH_SIZE_CHOICES)
}

# Run Hyperopt
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=5, trials=trials)

print("Best parameters:", best)

# Translate the hp.choice indices back into the actual values that were tried.
best_params = {
    'learning_rate': best['learning_rate'],
    'reg_rate': best['reg_rate'],
    'hidden_neuron': HIDDEN_NEURON_CHOICES[best['hidden_neuron']],
    'batch_size': BATCH_SIZE_CHOICES[best['batch_size']]
}




UAutoRec with Confounder.                            
Train data processed shape: (3706, 6040)             
Confounder data shape: (3706, 6040)                  
  0%|          | 0/5 [01:59<?, ?trial/s, best loss=?]

Training:   0%|          | 0/20 [00:00<?, ?epoch/s]
Training:   0%|          | 0/20 [00:06<?, ?epoch/s, Loss=1.8e+5, RMSE=1.37, MAE=1.07]
Training:   5%|5         | 1/20 [00:06<02:10,  6.89s/epoch, Loss=1.8e+5, RMSE=1.37, MAE=1.07]
Training:  10%|#         | 2/20 [00:10<01:24,  4.72s/epoch, Loss=1.8e+5, RMSE=1.37, MAE=1.07]
Training:  15%|#5        | 3/20 [00:13<01:08,  4.02s/epoch, Loss=1.8e+5, RMSE=1.37, MAE=1.07]
Training:  15%|#5        | 3/20 [00:19<01:08,  4.02s/epoch, Loss=5.68e+4, RMSE=1.02, MAE=0.817]
Training:  20%|##        | 4/20 [00:19<01:19,  4.98s/epoch, Loss=5.68e+4, RMSE=1.02, MAE=0.817]
Training:  25%|##5       | 5/20 [00:22<01:05,  4.34s/epoch, Loss=5.68e+4, RMSE=1.02, MAE=0.817]
Training:  30%|###       | 6/20 [00:26<00:55,  3.96s/epoch, Loss=5.68e+4, RMSE=1.02, MAE=0.817]
Training:  30%|###       | 6/20 [00:33<00:55,  3.96s/epoch, Loss=5.39e+4, RMSE=1.01, MAE=0.799]
Training:  35%|###5      | 7/20 [00:33<01:06,  5.14s/epoch, Loss=5.39e+4, RMSE=1.01, MAE=0.799]
Trai

UAutoRec with Confounder.                                                       
Train data processed shape: (3706, 6040)                                        
Confounder data shape: (3706, 6040)                                             
 20%|██        | 1/5 [05:29<13:57, 209.43s/trial, best loss: 0.8983660924581586]

Training:   0%|          | 0/20 [00:00<?, ?epoch/s]
Training:   0%|          | 0/20 [00:06<?, ?epoch/s, Loss=2.51e+4, RMSE=1.07, MAE=0.842]
Training:   5%|5         | 1/20 [00:06<01:54,  6.04s/epoch, Loss=2.51e+4, RMSE=1.07, MAE=0.842]
Training:  10%|#         | 2/20 [00:09<01:23,  4.66s/epoch, Loss=2.51e+4, RMSE=1.07, MAE=0.842]
Training:  15%|#5        | 3/20 [00:13<01:12,  4.26s/epoch, Loss=2.51e+4, RMSE=1.07, MAE=0.842]
Training:  15%|#5        | 3/20 [00:19<01:12,  4.26s/epoch, Loss=1.39e+4, RMSE=1.01, MAE=0.801]
Training:  20%|##        | 4/20 [00:19<01:18,  4.93s/epoch, Loss=1.39e+4, RMSE=1.01, MAE=0.801]
Training:  25%|##5       | 5/20 [00:23<01:06,  4.42s/epoch, Loss=1.39e+4, RMSE=1.01, MAE=0.801]
Training:  30%|###       | 6/20 [00:26<00:57,  4.11s/epoch, Loss=1.39e+4, RMSE=1.01, MAE=0.801]
Training:  30%|###       | 6/20 [00:32<00:57,  4.11s/epoch, Loss=1.22e+4, RMSE=0.962, MAE=0.766]
Training:  35%|###5      | 7/20 [00:32<01:00,  4.66s/epoch, Loss=1.22e+4, RMSE=0.962, MAE=0

UAutoRec with Confounder.                                                       
Train data processed shape: (3706, 6040)                                        
Confounder data shape: (3706, 6040)                                             
 40%|████      | 2/5 [09:07<10:36, 212.16s/trial, best loss: 0.8983660924581586]

Training:   0%|          | 0/20 [00:00<?, ?epoch/s]
Training:   0%|          | 0/20 [00:04<?, ?epoch/s, Loss=6.93e+4, RMSE=1.06, MAE=0.829]
Training:   5%|5         | 1/20 [00:04<01:31,  4.82s/epoch, Loss=6.93e+4, RMSE=1.06, MAE=0.829]
Training:  10%|#         | 2/20 [00:07<00:59,  3.31s/epoch, Loss=6.93e+4, RMSE=1.06, MAE=0.829]
Training:  15%|#5        | 3/20 [00:09<00:48,  2.83s/epoch, Loss=6.93e+4, RMSE=1.06, MAE=0.829]
Training:  15%|#5        | 3/20 [00:14<00:48,  2.83s/epoch, Loss=2.74e+4, RMSE=1.01, MAE=0.804]
Training:  20%|##        | 4/20 [00:14<01:00,  3.76s/epoch, Loss=2.74e+4, RMSE=1.01, MAE=0.804]
Training:  25%|##5       | 5/20 [00:16<00:48,  3.25s/epoch, Loss=2.74e+4, RMSE=1.01, MAE=0.804]
Training:  30%|###       | 6/20 [00:19<00:40,  2.92s/epoch, Loss=2.74e+4, RMSE=1.01, MAE=0.804]
Training:  30%|###       | 6/20 [00:23<00:40,  2.92s/epoch, Loss=2.71e+4, RMSE=1.01, MAE=0.806]
Training:  35%|###5      | 7/20 [00:23<00:45,  3.48s/epoch, Loss=2.71e+4, RMSE=1.01, MAE=0.8

UAutoRec with Confounder.                                                       
Train data processed shape: (3706, 6040)                                        
Confounder data shape: (3706, 6040)                                             
 60%|██████    | 3/5 [12:25<06:45, 202.54s/trial, best loss: 0.8983660924581586]

Training:   0%|          | 0/20 [00:00<?, ?epoch/s]
Training:   0%|          | 0/20 [00:08<?, ?epoch/s, Loss=4.13e+4, RMSE=1.05, MAE=0.845]
Training:   5%|5         | 1/20 [00:08<02:44,  8.66s/epoch, Loss=4.13e+4, RMSE=1.05, MAE=0.845]
Training:  10%|#         | 2/20 [00:14<02:03,  6.84s/epoch, Loss=4.13e+4, RMSE=1.05, MAE=0.845]
Training:  15%|#5        | 3/20 [00:19<01:46,  6.25s/epoch, Loss=4.13e+4, RMSE=1.05, MAE=0.845]
Training:  15%|#5        | 3/20 [00:28<01:46,  6.25s/epoch, Loss=1.36e+4, RMSE=1.01, MAE=0.806]
Training:  20%|##        | 4/20 [00:28<01:56,  7.28s/epoch, Loss=1.36e+4, RMSE=1.01, MAE=0.806]
Training:  25%|##5       | 5/20 [00:34<01:42,  6.85s/epoch, Loss=1.36e+4, RMSE=1.01, MAE=0.806]
Training:  30%|###       | 6/20 [00:40<01:30,  6.45s/epoch, Loss=1.36e+4, RMSE=1.01, MAE=0.806]
Training:  30%|###       | 6/20 [00:49<01:30,  6.45s/epoch, Loss=1.33e+4, RMSE=0.993, MAE=0.794]
Training:  35%|###5      | 7/20 [00:49<01:36,  7.44s/epoch, Loss=1.33e+4, RMSE=0.993, MAE=0

UAutoRec with Confounder.                                                       
Train data processed shape: (3706, 6040)                                        
Confounder data shape: (3706, 6040)                                             
 80%|████████  | 4/5 [17:00<03:51, 231.63s/trial, best loss: 0.8983660924581586]

Training:   0%|          | 0/20 [00:00<?, ?epoch/s]
Training:   0%|          | 0/20 [00:08<?, ?epoch/s, Loss=2.41e+4, RMSE=1.09, MAE=0.852]
Training:   5%|5         | 1/20 [00:08<02:32,  8.00s/epoch, Loss=2.41e+4, RMSE=1.09, MAE=0.852]
Training:  10%|#         | 2/20 [00:13<01:57,  6.52s/epoch, Loss=2.41e+4, RMSE=1.09, MAE=0.852]
Training:  15%|#5        | 3/20 [00:18<01:43,  6.06s/epoch, Loss=2.41e+4, RMSE=1.09, MAE=0.852]
Training:  15%|#5        | 3/20 [00:27<01:43,  6.06s/epoch, Loss=1.32e+4, RMSE=0.979, MAE=0.783]
Training:  20%|##        | 4/20 [00:27<01:50,  6.91s/epoch, Loss=1.32e+4, RMSE=0.979, MAE=0.783]
Training:  25%|##5       | 5/20 [00:32<01:36,  6.40s/epoch, Loss=1.32e+4, RMSE=0.979, MAE=0.783]
Training:  30%|###       | 6/20 [00:38<01:27,  6.27s/epoch, Loss=1.32e+4, RMSE=0.979, MAE=0.783]
Training:  30%|###       | 6/20 [00:47<01:27,  6.27s/epoch, Loss=1.08e+4, RMSE=0.923, MAE=0.736]
Training:  35%|###5      | 7/20 [00:47<01:33,  7.17s/epoch, Loss=1.08e+4, RMSE=0.923, M

100%|██████████| 5/5 [19:08<00:00, 229.71s/trial, best loss: 0.8983660924581586]
Best parameters: {'batch_size': 2, 'hidden_neuron': 2, 'learning_rate': 0.005181234156732431, 'reg_rate': 0.0033614508223386813}


In [4]:
def load_data_rating(dat, columns=[0, 1, 2], sep="\t"):
    """Load a dataset and return its train/test ratings as DOK matrices.

    Unlike the earlier variant, no validation split is carved out of the
    training data.

    Args:
        dat: Dataset key forwarded to choose_data().
        columns: Unused; kept so existing callers keep working.
        sep: Unused; kept so existing callers keep working.

    Returns:
        (train, test, n_users, n_items), where the first two are
        (n_users x n_items) DOK sparse matrices of ratings.
    """
    full, train, test = choose_data(dat, test_size= 0.1)

    # Ids are used directly as matrix indices, hence max id + 1.
    n_users = max(train['userId'].max(), test['userId'].max()) + 1
    n_items = max(train['songId'].max(), test['songId'].max()) + 1

    def to_dok(frame):
        # Build straight from the columns instead of iterating row by row.
        coords = (frame['userId'].to_numpy(), frame['songId'].to_numpy())
        return csr_matrix((frame['rating'].to_numpy(), coords), shape=(n_users, n_items)).todok()

    train_matrix = to_dok(train)
    test_matrix = to_dok(test)

    print("Load data finished. Number of users:", n_users, "Number of items:", n_items)
    return train_matrix, test_matrix, n_users, n_items


# Load the 'ml' dataset; this rebinds the global `train`/`test`/`user`/`item`
# names (no validation split in this cell's loader).
train, test, user, item = load_data_rating('ml', columns=[0, 1, 2], sep="\t")

# Session options: grow the GPU memory allocation on demand.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True

# NOTE(review): this is a different file from the one loaded for the
# hyperparameter search (ml_exp_k_30_050.csv) -- confirm that is intended.
CAUSEFIT_DIR = 'C:/Users/Sten Stokroos/Desktop/NEW/zelf/Data/exposure_output/ml_exp_k_30.csv'

conf_df = pd.read_csv(CAUSEFIT_DIR, header=None)

# Convert the DataFrame to a NumPy array
confounder_data = conf_df.to_numpy()
# Transposed so it is indexed [item, user] like the model's input matrices.
confounder_data = confounder_data.T

# Train the final model on the entire training data
with tf.compat.v1.Session(config=config) as sess:
    # NOTE(review): hyperparameters are hard-coded here rather than taken from
    # the search result. The tuned alternative was:
    #   learning_rate=best_params['learning_rate'], reg_rate=best_params['reg_rate'],
    #   epoch=50, batch_size=best_params['batch_size'], verbose=True
    #   and build_network(hidden_neuron=best_params['hidden_neuron'])
    final_model = UAutoRec(sess, user, item,  learning_rate=0.001, reg_rate=0.1, epoch=80, batch_size=500, verbose=True)
    final_model.build_network(hidden_neuron= 500)
    # The test set is also passed to execute(), which evaluates on it every T epochs.
    final_model.execute(train, test, confounder_data)

    # Evaluate the final model on the test set
    final_rmse, final_mae = final_model.test(test, confounder_data)
    print(f"Final RMSE: {final_rmse}, Final MAE: {final_mae}")

Load data finished. Number of users: 6040 Number of items: 3706
UAutoRec with Confounder.
Train data processed shape: (3706, 6040)
Confounder data shape: (3706, 6040)


Training: 100%|██████████| 80/80 [05:42<00:00,  4.28s/epoch, Loss=3.96e+4, RMSE=0.871, MAE=0.689]


Final RMSE: 0.8713429436450515, Final MAE: 0.6891787481360501
