## import MIND dataset
https://msnews.github.io/
#### download training set
#### use the first 1000 user-click records to build the initial matrix
## BEGIN HERE

In [1]:
import gzip
from collections import defaultdict
import scipy
import scipy.optimize
import numpy as np
import random
import pandas as pd

In [2]:
def data_preprocess(pretrain_data, n_rows=1000):
    """Load the head of a MIND ``behaviors.tsv`` file as a long click table.

    Each input line is expected to hold five tab-separated fields:
    impression id, user id, time, a space-separated click history, and
    space-separated impressions of the form ``<news_id>-<label>``.
    Every news id in the history is recorded as a click (label 1); every
    impression keeps its own integer label.

    Parameters
    ----------
    pretrain_data : str
        Path of the behaviors file.
    n_rows : int or None, optional
        Number of leading lines to use (default 1000, only a subset of the
        data is needed). ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``time``, ``news_id`` and ``click``; one row per
        (input line, news id) pair, history entries first. ``click`` is the
        label shifted by one so that 2 means the user clicked the news,
        1 means the news was shown but NOT clicked, and 0 stays free to mean
        "user did not see the news". An input without any click entries
        yields an empty frame with the same columns.

    Raises
    ------
    ValueError
        If an impression entry has no ``-<label>`` suffix or the label is
        not an integer.
    """
    from itertools import islice

    header = ['impression_id', 'user_id', 'time', 'history', 'impressions']
    columns = ['user_id', 'time', 'news_id', 'click']

    output = []
    with open(pretrain_data, encoding='utf-8') as file:
        # Stop after n_rows lines instead of loading the whole (large) file.
        for line in islice(file, n_rows):
            record = dict(zip(header, line.rstrip('\r\n').split('\t')))
            user_id = record.get('user_id', '')
            timestamp = record.get('time', '')

            # History holds only news the user clicked, so label them 1.
            clicks = [(news_id, 1) for news_id in record.get('history', '').split()]

            # Impressions look like 'N123-0' / 'N123-1'; split on the last '-'.
            for entry in record.get('impressions', '').split():
                news_id, label = entry.rsplit('-', 1)
                clicks.append((news_id, int(label)))

            # +1 shift: 2 = clicked, 1 = shown but not clicked, 0 = not seen.
            output.extend(
                {'user_id': user_id, 'time': timestamp,
                 'news_id': news_id, 'click': label + 1}
                for news_id, label in clicks
            )

    # Build the frame in one go; passing columns keeps the schema when empty.
    return pd.DataFrame(output, columns=columns)


In [3]:
# Build the long-format click table (user_id, time, news_id, click) from the MIND training behaviors file.
data = data_preprocess("D:/backup/wpi/IR/final_project/MINDlarge_train/behaviors.tsv")

In [4]:
data  # display the preprocessed click table

Unnamed: 0,user_id,time,news_id,click
0,U87243,11/10/2019 11:30:54 AM,N8668,2
1,U87243,11/10/2019 11:30:54 AM,N39081,2
2,U87243,11/10/2019 11:30:54 AM,N65259,2
3,U87243,11/10/2019 11:30:54 AM,N79529,2
4,U87243,11/10/2019 11:30:54 AM,N73408,2
...,...,...,...,...
72971,U150193,11/11/2019 2:43:58 PM,N3075,1
72972,U150193,11/11/2019 2:43:58 PM,N15847,1
72973,U150193,11/11/2019 2:43:58 PM,N38304,1
72974,U150193,11/11/2019 2:43:58 PM,N62017,1


In [13]:
# Some news items have empty text: keep only the rows whose news id is
# present in the base news title file.
import json
# Use a context manager so the file handle is closed once the JSON is loaded.
with open('word2vec_template/base_data/base_news_title.json', 'r', encoding='utf-8') as json_file:
    aval_news = json.load(json_file)
aval_news_id = list(aval_news.keys())
data = data[data['news_id'].isin(aval_news_id)]

create utility matrix

In [15]:
def create_utility_matrix(df):
    """Pivot a long click table into a dense user x news utility matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``user_id``, ``news_id`` and ``click``.

    Returns
    -------
    utility_matrix : numpy.ndarray
        Float array of shape ``(n_users, n_news)``. Cell ``[u, n]`` holds the
        ``click`` value of the matching row and 0 where the pair never
        appears. If a pair appears several times, the last row wins.
    user_list : numpy.ndarray
        Unique user ids in order of first appearance (row labels).
    news_list : numpy.ndarray
        Unique news ids in order of first appearance (column labels).
    """
    user_list = df['user_id'].unique()
    news_list = df['news_id'].unique()

    # id -> row/column position, so each record is placed with an O(1) lookup
    # instead of scanning the id arrays for every row of df.
    user_pos = {user: row for row, user in enumerate(user_list)}
    news_pos = {news: col for col, news in enumerate(news_list)}

    utility_matrix = np.zeros((len(user_list), len(news_list)))

    # Explicit loop (rather than fancy-index assignment) keeps the
    # "last occurrence wins" order well defined for duplicate pairs.
    for user, news, click in zip(df['user_id'], df['news_id'], df['click']):
        utility_matrix[user_pos[user], news_pos[news]] = click

    return utility_matrix,user_list,news_list

In [16]:
# users_index / items_index hold the user ids and news ids that label the rows / columns of utilMat.
utilMat, users_index, items_index = create_utility_matrix(data) # utilMat stores the utility matrix

In [17]:
utilMat  # display the dense utility matrix (rows: users, columns: news)

array([[2., 2., 2., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 2., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [18]:
utilMat.shape  # (number of users, number of news)

(1000, 11917)

train-test split

In [19]:
def train_test_split(ut_matrix, min_clicks=35, val_size=15):
    """Hold out part of each active user's entries as a validation set.

    Used to tune the hyperparameter k (rank of the factorization).

    For every row (user) with at least ``min_clicks`` non-zero entries,
    ``val_size`` of those entries are drawn at random without replacement,
    moved into the validation matrix and zeroed in the training matrix.
    Rows with fewer non-zero entries stay entirely in the training matrix.
    Sampling uses NumPy's global random state.

    Parameters
    ----------
    ut_matrix : numpy.ndarray
        2-D utility matrix; 0 marks an empty cell. It is not modified.
    min_clicks : int, optional
        Minimum number of non-zero entries a row needs to be split
        (default 35; the best value depends on the sparsity of the matrix).
    val_size : int, optional
        Number of entries held out per eligible row (default 15).

    Returns
    -------
    train, validation : numpy.ndarray
        Two arrays shaped like ``ut_matrix`` whose element-wise sum equals
        ``ut_matrix``.

    Raises
    ------
    ValueError
        If ``val_size`` is larger than ``min_clicks`` (an eligible row might
        then not have enough entries to sample from).
    """
    if val_size > min_clicks:
        raise ValueError("val_size must not exceed min_clicks")

    validation = np.zeros(ut_matrix.shape)
    # Work on a copy: assigning train = ut_matrix would zero out the
    # caller's matrix as entries are moved to the validation set.
    train = ut_matrix.copy()

    for user in np.arange(ut_matrix.shape[0]):
        seen = ut_matrix[user, :].nonzero()[0]
        if len(seen) >= min_clicks:
            val_click = np.random.choice(seen, size=val_size, replace=False)
            train[user, val_click] = 0
            validation[user, val_click] = ut_matrix[user, val_click]
    print(validation.shape)
    return train, validation

In [20]:
# Split the utility matrix into a training matrix and a held-out validation matrix of the same shape.
train, val = train_test_split(utilMat) 

(1000, 11917)


In [21]:
train  # display the training matrix (held-out entries are set to 0)

array([[0., 2., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

SVD begin: used to compute the initial values for matrix factorization

In [22]:
from sklearn.metrics import mean_squared_error
def rmse(prediction, ground_truth):
    """Return the root-mean-square error on the non-zero cells of ground_truth.

    Cells where ``ground_truth`` is 0 are ignored; ``prediction`` is read at
    exactly the positions where ``ground_truth`` is non-zero.
    """
    observed = ground_truth.nonzero()
    predicted_values = prediction[observed].flatten()
    true_values = ground_truth[observed].flatten()
    mse = mean_squared_error(predicted_values, true_values)
    return np.sqrt(mse)


In [23]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

def svd(train_data,val_data,k,util_mat):
    """Choose the truncated-SVD rank by validation RMSE and factorize util_mat.

    For each candidate rank in ``k`` a truncated SVD of ``train_data`` is
    computed, the matrix is reconstructed and scored with ``rmse`` against
    ``val_data``. The rank with the lowest score (first one on ties) is then
    used for a truncated SVD of ``util_mat``, whose singular values are split
    evenly between the two returned factors.

    Parameters
    ----------
    train_data : array_like or sparse matrix
        Training utility matrix.
    val_data : numpy.ndarray
        Validation matrix; only its non-zero cells are scored.
    k : sequence of int
        Candidate ranks.
    util_mat : array_like or sparse matrix
        Full utility matrix to factorize with the selected rank.

    Returns
    -------
    k_select : int
        The selected rank.
    U : numpy.ndarray
        ``u * s**0.5`` (user factors, one row per row of ``util_mat``).
    VT : numpy.ndarray
        ``s**0.5 * vt`` (news factors, one column per column of ``util_mat``).

    Raises
    ------
    ValueError
        If ``k`` is empty.
    """
    if len(k) == 0:
        raise ValueError("k must contain at least one candidate rank")

    rmse_list = []
    # Score every candidate rank on the train/validation split.
    for ki in k:
        # Re-seed so every candidate starts from the same vector v0.
        np.random.seed(0)
        v0 = np.random.rand(min(train_data.shape))
        u, s, vt = svds(train_data, ki,v0=v0)
        # u * s scales column j of u by s[j], i.e. u @ diag(s) without
        # materialising the diagonal matrix.
        X_pred = (u * s) @ vt
        rmse_list.append(rmse(X_pred, val_data))
    index_min = min(range(len(rmse_list)), key=rmse_list.__getitem__)
    k_select = k[index_min]

    # u is the user matrix, s the singular values, vt the news matrix.
    u, s, vt = svds(util_mat, k_select)

    sqrt_s = s ** 0.5
    U = u * sqrt_s                    # U = u * s**(1/2)
    VT = sqrt_s[:, np.newaxis] * vt   # VT = s**(1/2) * vt

    return k_select,U,VT

In [24]:
# Pick the rank from the candidate list using the train/validation split, then factorize the full utility matrix.
k_select,U,VT = svd(train,val,k = [20,25,30,35,40,45,50],util_mat = utilMat)

SGD begin

In [25]:
from scipy.sparse import coo_matrix
R = coo_matrix(utilMat)  # store the utility matrix as a sparse matrix in COO format
m,n = R.shape  # m = number of rows, n = number of columns of R

In [26]:
from numpy.linalg import norm

def error(R,P,Q,lamda=0.02):
    """Return the regularized squared error of ``P @ Q`` on the entries of R.

    Only stored entries of ``R`` with a value greater than 0 contribute.
    For each such entry ``(u, i)`` with value ``r`` the term is::

        (r - P[u, :] . Q[:, i])**2 + lamda * (|P[u, :]|**2 + |Q[:, i]|**2)

    Parameters
    ----------
    R : sparse matrix in COO format
        Must expose ``data``, ``row`` and ``col``.
    P : numpy.ndarray
        Array indexed as ``P[u, :]`` (one row per row of ``R``).
    Q : numpy.ndarray
        Array indexed as ``Q[:, i]`` (one column per column of ``R``).
    lamda : float, optional
        Weight of the regularization term (default 0.02).

    Returns
    -------
    float
        Sum of the per-entry terms; 0 when no entry is positive.
    """
    clicks = np.asarray(R.data)
    observed = clicks > 0
    rows = np.asarray(R.row)[observed]
    cols = np.asarray(R.col)[observed]
    clicks = clicks[observed]

    # One factor vector per observed entry, aligned row by row.
    p_rows = P[rows, :]
    q_cols = Q[:, cols].T

    residual = clicks - np.sum(p_rows * q_cols, axis=1)
    penalty = np.sum(p_rows ** 2, axis=1) + np.sum(q_cols ** 2, axis=1)
    return np.sum(residual ** 2) + lamda * np.sum(penalty)

In [27]:
error(R,U,VT) # regularized squared error of the SVD factors, which serve as the initial value

110658.54471523548

In [28]:
# Root of the mean regularized error of the SVD factors over the stored entries of R.
# Stored as `initial_rmse` so that the `rmse` function defined earlier is not
# overwritten (re-running the SVD cell would otherwise fail).
initial_rmse = np.sqrt(error(R,U,VT)/len(R.data))
initial_rmse

1.2714132050016507

In [29]:
def SGD(R, K, P,Q,lamda,steps, gamma):
    """Refine the factors P and Q by stochastic gradient descent on R.

    Each step visits every stored entry of ``R`` with a value greater than 0
    and moves the matching row of ``P`` and column of ``Q`` along the
    gradient of the regularized squared error. Training stops early once
    the reported RMSE drops below 0.5.

    The reported "RMSE" is ``sqrt(error(R, P, Q, lamda) / len(R.data))``, so
    it includes the regularization term computed by ``error``.

    Parameters
    ----------
    R : sparse matrix in COO format
        Must expose ``data``, ``row`` and ``col``.
    K : int
        Number of latent factors. Unused; kept for interface compatibility.
    P : array_like
        Initial factors indexed as ``P[u, :]``.
    Q : array_like
        Initial factors indexed as ``Q[:, i]``.
    lamda : float
        Regularization weight.
    steps : int
        Maximum number of passes over the entries of ``R``.
    gamma : float
        Learning rate.

    Returns
    -------
    P, Q : numpy.ndarray
        The refined factors. They are new arrays; the arrays passed in are
        left unchanged.
    """
    # Work on float copies so the caller's initial factors (e.g. the SVD
    # result) are not overwritten by the in-place row/column updates.
    P = np.array(P, dtype=float)
    Q = np.array(Q, dtype=float)

    n_entries = len(R.data)

    current_rmse = np.sqrt(error(R,P,Q,lamda)/n_entries)
    print("Initial RMSE: "+str(current_rmse))

    for _ in range(steps):
        for rui, u, i in zip(R.data, R.row, R.col):
            if rui>0:
                eui=rui-np.dot(P[u,:],Q[:,i])
                P[u,:]=P[u,:]+gamma*2*(eui*Q[:,i]-lamda*P[u,:])
                # NOTE: this update reads the row of P that was just updated.
                Q[:,i]=Q[:,i]+gamma*2*(eui*P[u,:]-lamda*Q[:,i])
        current_rmse = np.sqrt(error(R,P,Q,lamda)/n_entries)
        if current_rmse<0.5:
            break
    print("Final RMSE: "+str(current_rmse))
    return P,Q

In [30]:
# gamma (learning rate) and lamda (regularization weight) can be modified in order to optimize the model.
#-----------------------------------------------------------------------------------------------------
P,Q=SGD(R,K=k_select,P=U,Q=VT,gamma=0.0005,lamda=0.01, steps=100)

Initial RMSE: 1.2506911604018287
Final RMSE: 0.5694199064871641


In [31]:
P # user matrix returned by SGD

array([[-0.01092597, -0.14966249, -0.16967592, ..., -0.06001381,
        -0.09733089,  0.96176227],
       [ 0.05778768, -0.04900208,  0.3718125 , ..., -0.3111133 ,
        -0.18647092,  1.14312474],
       [-0.29143111,  0.0987592 , -0.29808783, ..., -0.16915252,
         0.11847224,  1.09340551],
       ...,
       [ 0.02457847,  0.05191967, -0.12489554, ...,  0.08151737,
         0.17614906,  0.43227396],
       [ 0.00759658,  0.01266309, -0.12625196, ..., -0.13819188,
        -0.01007413,  0.75739786],
       [-0.145075  , -0.25213425,  0.00821404, ..., -0.50662226,
        -0.05873093,  0.45965129]])

In [32]:
Q # item matrix

array([[ 0.00520234,  0.00321418, -0.10961475, ...,  0.00408928,
         0.00408949, -0.01309988],
       [-0.0147974 , -0.02304979, -0.169636  , ...,  0.00387454,
         0.00387469, -0.02517724],
       [-0.08680287,  0.00205933,  0.02048757, ..., -0.01310765,
        -0.01310804,  0.0036791 ],
       ...,
       [-0.09750177, -0.03794302, -0.03134196, ..., -0.00646889,
        -0.00646891, -0.0406733 ],
       [ 0.03268593, -0.07339252, -0.02741353, ..., -0.00157376,
        -0.00157382, -0.00684737],
       [ 0.65654937,  0.44756753,  0.96895567, ...,  0.03736772,
         0.03736791,  0.03076676]])

In [33]:
# Save the item matrix Q to CSV: columns are labelled with items_index, the row index is not written.
q_matrix_pd = pd.DataFrame(Q, columns = items_index)
q_matrix_pd.to_csv('word2vec_template/P_Q_matrix/Item_matrix.csv', index=False)

In [34]:
# Save the user matrix P to CSV: rows are labelled with users_index, which is written as the index column.
p_matrix_pd = pd.DataFrame(P, index=users_index)
p_matrix_pd.to_csv('word2vec_template/P_Q_matrix/User_matrix.csv')