## Packages

In [None]:
import warnings
# Install a blanket "ignore" filter so that warnings raised by any module
# are not printed in the notebook output.
warnings.filterwarnings(action="ignore")

In [None]:
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise import SVD
import time
import datetime
from utils import train_test_split, df_to_matrix, matrix_to_df_2, set_diff
from surprise.model_selection.split import train_test_split as tr_te_split
from DecisionTreeRecGenreModel import DecisionTreeRecGenreModel
from DecisionTreeRecArtistModel import DecisionTreeRecArtistModel
from DecisionTreeRecModel import DecisionTreeRecModel
from scipy import sparse
import pickle

In [None]:
def _read_pickle(path):
    """Return the object deserialized from the pickle file at *path*."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Load the two pickled collections and report how many entries each holds.
top_items_list = _read_pickle("data/top_items.pkl")
top_genres_list = _read_pickle("data/top_genres.pkl")
print(len(top_items_list))
print(len(top_genres_list))

In [None]:
# Load the three filtered CSV tables (genre, artist and track level).
filtered_data_genre = pd.read_csv('data/filtered_data_genre.csv')
filtered_data_artist = pd.read_csv('data/filtered_data_artist.csv')
filtered_data_track = pd.read_csv('data/filtered_data_track.csv')

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line
# after every summary.
filtered_data_genre.info()
filtered_data_artist.info()
filtered_data_track.info()

In [None]:
# Build the genre-level matrix with the project's df_to_matrix helper.
# NOTE(review): the arguments are presumably (frame, row-id column,
# column-id column, value column) and the four extra return values look like
# id<->index lookup tables -- confirm against utils.df_to_matrix.
(
    genre_matrix,
    grid_to_idx,
    gidx_to_rid,
    gcid_to_idx,
    gidx_to_cid,
) = df_to_matrix(filtered_data_genre, "user_id", "genre_id", "rating")
print(genre_matrix.shape)

In [None]:
# Build the artist-level matrix with the project's df_to_matrix helper.
# NOTE(review): the arguments are presumably (frame, row-id column,
# column-id column, value column) and the four extra return values look like
# id<->index lookup tables -- confirm against utils.df_to_matrix.
(
    artist_matrix,
    arid_to_idx,
    aidx_to_rid,
    acid_to_idx,
    aidx_to_cid,
) = df_to_matrix(filtered_data_artist, "user_id", "artist_id", "rating")
print(artist_matrix.shape)

In [None]:
# Build the track-level matrix; idx_to_rid / idx_to_cid are reused later when
# matrices are converted back to data frames.
(
    matrix,
    rid_to_idx,
    idx_to_rid,
    cid_to_idx,
    idx_to_cid,
) = df_to_matrix(filtered_data_track, "user_id", "item_id", "rating")

# Two successive splits with the project's train_test_split (from utils) yield
# the training matrix, the pool X and the evaluation matrix `test`.
# NOTE(review): the meaning of the second argument (1, then 10) and of the
# discarded third return value is defined in utils -- confirm there.
al, train, _ = train_test_split(matrix, 1)
X, test, _ = train_test_split(al, 10)

In [None]:
# Metric histories; the first entry of each list is the baseline measured
# here, before any elicitation has taken place.
predictions_iter, rmse, mae = [], [], []

# Ratings are declared to Surprise as lying on a 0-100 scale.
reader = Reader(rating_scale=(0, 100))
algo = SVD()

# Fit SVD on the initial training matrix.
train_df = matrix_to_df_2(train, idx_to_rid, idx_to_cid)
data_r = Dataset.load_from_df(
    train_df[['user_id', 'item_id', 'rating']],
    reader,
)
data_rr = data_r.build_full_trainset()
algo.fit(data_rr)

# Turn the evaluation matrix into a Surprise testset. test_rrr is kept as a
# global because every later evaluation scores against the same testset.
test_df = matrix_to_df_2(test, idx_to_rid, idx_to_cid)
test_r = Dataset.load_from_df(
    test_df[['user_id', 'item_id', 'rating']],
    reader,
)
test_rr = test_r.build_full_trainset()
test_rrr = test_rr.build_testset()

# Score the baseline model and record both error metrics.
predictions = algo.test(test_rrr)
rmse.append(accuracy.rmse(predictions))
mae.append(accuracy.mae(predictions))

In [None]:
# Per-user candidate item pools; populated by the active-learning loop.
c_u_dict = dict()
# Every column index of the training matrix.
all_items = list(range(train.shape[1]))
# Independent LIL-format copies of the pool and training matrices, so the
# loop can update single entries without touching X or train themselves.
X_matrix = X.tolil().copy()
train_copy = train.tolil().copy()
print("Data built. Start active learning iterations")

In [None]:
# Active-learning loop.
#
# Each round:
#   1. builds a decision-tree model on the current training matrix,
#   2. for every user in the model, scores that user's remaining candidate
#      items, takes the k best-scored ones and moves those that have a rating
#      in the pool X_matrix into train_copy,
#   3. refits SVD on the enlarged training data, scores it on the fixed
#      testset and writes the RMSE/MAE history to CSV.
#
# Relies on names created by earlier cells: train_copy, X, X_matrix,
# c_u_dict, all_items, algo, test_rrr, rmse, mae, idx_to_rid, idx_to_cid.
iteration = 14  # number of elicitation rounds
k = 20  # number of top-scored candidates taken per user per round
recom_count = 0  # not updated by this loop; kept in case later cells read it

# The rating scale never changes, so the Reader is built once, not per round.
reader = Reader(rating_scale=(0, 100))

for count in range(iteration):
    tstart_time = time.time()
    print("Build the decision tree with training data for iteration:", count+1)
    print("Tree Build Start time: ", datetime.datetime.now().strftime('%H:%M:%S'))
    # Alternative model that also uses genre information:
    # DecisionTreeRecGenreModel(train_copy.tocsc(), genre_matrix.tocsc(),
    #                           top_items_list, top_genres_list, tree_depth_threshold=7)
    dtmodel = DecisionTreeRecModel(train_copy.tocsc(), tree_depth_threshold=7)
    dtmodel.build_model()
    tend_time = time.time()
    print("Time take to build tree: ", tend_time - tstart_time, "seconds")

    print("Start Active learning")
    astart_time = time.time()
    for u in sorted(dtmodel.rU.keys()):
        if count == 0:
            # First round: the candidates are all items the user has no
            # entry for in the training matrix.
            p_u = train_copy.getrow(u).nonzero()[1]
            c_u = set_diff(all_items, p_u)
            c_u_dict[u] = c_u.copy()
        else:
            c_u = c_u_dict[u].copy()

        if len(c_u) == 0:
            # Only this user has run out of candidates. Skip them and carry
            # on with the others: stopping the whole user loop here would
            # deprive every later user of this round and, in round 0, leave
            # them without a c_u_dict entry (KeyError in the next round).
            print("Prediction complete as datalist empty")
            continue

        scores = dtmodel.predict_fast(u, c_u)
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        topk_u = [item for item, _ in ranked[:k]]

        # Items that were asked about are never offered to this user again.
        c_u_dict[u] = list(set(c_u) - set(topk_u))

        # Build the set of pool items once so each membership test is O(1).
        px_u = set(X_matrix.getrow(u).nonzero()[1])
        for item in topk_u:
            if item in px_u:
                # Move the rating from the pool into the training data.
                train_copy[u, item] = X[u, item]
                X_matrix[u, item] = 0
    aend_time = time.time()
    print("Time take to do Active learning: ", aend_time - astart_time, "seconds")

    print("Retrain the SVD model and re-evaluate the model for iteration:", count+1)
    train_df = matrix_to_df_2(train_copy, idx_to_rid, idx_to_cid)
    data_r = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)
    data_rr = data_r.build_full_trainset()
    algo.fit(data_rr)
    # The testset built before the loop is reused unchanged.
    predictions = algo.test(test_rrr)
    rmse.append(accuracy.rmse(predictions))
    mae.append(accuracy.mae(predictions))

    # Rewrite the full metric history after every round so partial results
    # are on disk if a later round fails.
    data = pd.DataFrame({"RMSE": rmse, "MAE": mae})
    file_name = "output/rmse_mae_data.csv"
    data.to_csv(file_name, index=False)
    print("Data saved to", file_name)