# Memory-based Algorithm

#### step 1: Load Data 

In [None]:
import pandas as pd
import numpy as np
from load_data1 import load_data1
from load_data2 import load_data2

# ---- Dataset 1: MS data ----
# Only columns 1-3 of the raw CSVs are read; column 0 is skipped.
ms_train = pd.read_csv('../data/MS_sample/data_train.csv', usecols=range(1, 4))
ms_test = pd.read_csv('../data/MS_sample/data_test.csv', usecols=range(1, 4))

# load_data1 is a project helper; its result is used as a pandas DataFrame.
ms_train_df = load_data1(ms_train)  # train in pandas dataframe format
ms_test_pd = load_data1(ms_test)    # test in pandas dataframe format

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same ndarray and exists on every pandas version.
ms_train_np = ms_train_df.values  # train in numpy matrix format
ms_test_np = ms_test_pd.values    # test in numpy matrix format

# Persist both representations; np.save appends the ".npy" extension.
np.save('../output/train1_matrix', ms_train_np)
np.save('../output/test1_matrix', ms_test_np)

ms_train_df.to_csv('../output/train1_df.csv', header=True, index=True)
ms_test_pd.to_csv('../output/test1_df.csv', header=True, index=True)

# ---- Dataset 2: EachMovie data ----
movie_columns = ["Movie", "User", "Score"]
movie_train = pd.read_csv('../data/eachmovie_sample/data_train.csv', usecols=movie_columns)
movie_test = pd.read_csv('../data/eachmovie_sample/data_test.csv', usecols=movie_columns)

# load_data2 is the dataset-2 counterpart of load_data1.
train2_df = load_data2(movie_train)
test2_df = load_data2(movie_test)

train2_df.to_csv('../output/train2_df.csv', header=True, index=True)
test2_df.to_csv('../output/test2_df.csv', header=True, index=True)





In [3]:
from pearson_correlation import pearsonSimi1
from pearson_correlation import pearsonSimi2
from vector_similarity import cosineSimi1
from vector_similarity import cosineSimi2
from simrank import simrank

#### step2: Similarity Weight

#### step2.1 Pearson Correlation

In [None]:
from pearson_correlation import pearsonSimi1
from pearson_correlation import pearsonSimi2


# Pearson-correlation similarity weights for both datasets.
# ms_train_df is the dataset-1 training frame built in step 1
# (the same frame that is written to train1_df.csv).
pearson_correlation1 = pearsonSimi1(ms_train_df)
pearson_correlation2 = pearsonSimi2(train2_df)

# Cache the weights on disk so they can be reloaded without recomputing.
pearson_correlation1.to_pickle("../output/pearson_correlation1.pkl")
pearson_correlation2.to_pickle("../output/pearson_correlation2.pkl")


#### step2.2 Vector Similarity

In [None]:
from vector_similarity import cosineSimi1
from vector_similarity import cosineSimi2

# Vector (cosine) similarity weights for both datasets.
# ms_train_df is the dataset-1 training frame built in step 1
# (the same frame that is written to train1_df.csv).
cosine_correlation1 = cosineSimi1(ms_train_df)
cosine_correlation2 = cosineSimi2(train2_df)

# Cache the weights on disk so they can be reloaded without recomputing.
cosine_correlation1.to_pickle("../output/cosine_correlation1.pkl")
cosine_correlation2.to_pickle("../output/cosine_correlation2.pkl")

#### step2.3 Entropy

In [None]:
from entropy import entropySimi1
from entropy import entropySimi2

# Entropy-based similarity weights for both datasets.
# ms_train_df is the dataset-1 training frame built in step 1
# (the same frame that is written to train1_df.csv).
entropy_correlation1 = entropySimi1(ms_train_df)
entropy_correlation2 = entropySimi2(train2_df)

# Cache the weights on disk so they can be reloaded without recomputing.
entropy_correlation1.to_pickle("../output/entropy_correlation1.pkl")
entropy_correlation2.to_pickle("../output/entropy_correlation2.pkl")


#### step2.4 Simrank

#### step3 Significance Weighting

We choose Pearson correlation for significance weighting.

In [None]:
from significance_weighting import significance_weighting

# Apply significance weighting to the Pearson weights of each dataset.
# ms_train_df is the dataset-1 training frame built in step 1.
pearson_devalued_correlation1 = significance_weighting(ms_train_df, pearson_correlation1)
pearson_devalued_correlation2 = significance_weighting(train2_df, pearson_correlation2)


#### step4 Selecting Neighbors (Best-n-estimator)


In [None]:
from Best_n import selecting_neighborhood


#### step4.1 Neighbors for Pearson 

In [None]:
n = 20  # number of neighbors

# selecting_neighborhood returns a (weights, neighbors) pair.
# ms_train_df is the dataset-1 training frame built in step 1.
pearson_weights_neighbor1, pearson_neighbors1 = selecting_neighborhood(
    n, pearson_correlation1, ms_train_df)
pearson_weights_neighbor2, pearson_neighbors2 = selecting_neighborhood(
    n, pearson_correlation2, train2_df)

#### step4.2 Neighbors for Pearson with Significance Weighting

In [None]:
# Neighbor selection on the significance-weighted Pearson weights,
# reusing the neighbor count `n` set in step 4.1.
# ms_train_df is the dataset-1 training frame built in step 1.
pearsonSig_weights_neighbor1, pearsonSig_neighbors1 = selecting_neighborhood(
    n, pearson_devalued_correlation1, ms_train_df)
pearsonSig_weights_neighbor2, pearsonSig_neighbors2 = selecting_neighborhood(
    n, pearson_devalued_correlation2, train2_df)

#### step4.3 Neighbors for Vector Similarity

In [None]:
# Neighbor selection on the cosine weights, reusing `n` from step 4.1.
# selecting_neighborhood returns a (weights, neighbors) pair, so unpack both:
# the prediction steps read cosine_weights_neighbor* and cosine_neighbors*.
# ms_train_df is the dataset-1 training frame built in step 1.
cosine_weights_neighbor1, cosine_neighbors1 = selecting_neighborhood(
    n, cosine_correlation1, ms_train_df)
cosine_weights_neighbor2, cosine_neighbors2 = selecting_neighborhood(
    n, cosine_correlation2, train2_df)

#### step4.4 Neighbors for Entropy

In [None]:
# Neighbor selection on the entropy weights, reusing `n` from step 4.1.
# selecting_neighborhood returns a (weights, neighbors) pair, so unpack both:
# the prediction steps read entropy_weights_neighbor* and entropy_neighbors*.
# ms_train_df is the dataset-1 training frame built in step 1.
entropy_weights_neighbor1, entropy_neighbors1 = selecting_neighborhood(
    n, entropy_correlation1, ms_train_df)
entropy_weights_neighbor2, entropy_neighbors2 = selecting_neighborhood(
    n, entropy_correlation2, train2_df)

#### step4.5 Neighbors for Simrank

#### step5 Rating Normalization

#### step5.1 Deviation for Mean

In [None]:
from Deviation_for_mean import Deviation_for_mean1
from Deviation_for_mean import Deviation_for_mean2

#### step5.1.1 Pearson Prediction

In [None]:
# Deviation-from-mean predictions using the Pearson neighbors.
# The frames are the ones created in step 1: ms_train_df / ms_test_pd for
# dataset 1 and train2_df / test2_df for dataset 2.
# Dataset 2 goes through Deviation_for_mean2, mirroring the *1 / *2 helper
# pairing used by every other step (load_data, pearsonSimi, z_score, ...).
pearson_predictionByDeviation_1 = Deviation_for_mean1(
    ms_train_df, ms_test_pd, pearson_weights_neighbor1, pearson_neighbors1)
pearson_predictionByDeviation_2 = Deviation_for_mean2(
    train2_df, test2_df, pearson_weights_neighbor2, pearson_neighbors2)

#### step5.1.2  Pearson with Significance Weighting Prediction

In [None]:
# Deviation-from-mean predictions using the significance-weighted Pearson
# neighbors.
# The frames are the ones created in step 1: ms_train_df / ms_test_pd for
# dataset 1 and train2_df / test2_df for dataset 2.
# Dataset 2 goes through Deviation_for_mean2, mirroring the *1 / *2 helper
# pairing used by every other step (load_data, pearsonSimi, z_score, ...).
pearsonSig_predictionByDeviation_1 = Deviation_for_mean1(
    ms_train_df, ms_test_pd, pearsonSig_weights_neighbor1, pearsonSig_neighbors1)
pearsonSig_predictionByDeviation_2 = Deviation_for_mean2(
    train2_df, test2_df, pearsonSig_weights_neighbor2, pearsonSig_neighbors2)

#### step5.1.3 Vector Similarity Prediction

In [None]:
# Deviation-from-mean predictions using the cosine neighbors.
# The frames are the ones created in step 1: ms_train_df / ms_test_pd for
# dataset 1 and train2_df / test2_df for dataset 2.
# Dataset 2 goes through Deviation_for_mean2, mirroring the *1 / *2 helper
# pairing used by every other step (load_data, pearsonSimi, z_score, ...).
# NOTE(review): cosine_neighbors1/2 must come from unpacking the
# (weights, neighbors) pair returned by selecting_neighborhood, as step 4.1
# does for Pearson - confirm step 4.3 provides them.
cosine_predictionByDeviation_1 = Deviation_for_mean1(
    ms_train_df, ms_test_pd, cosine_weights_neighbor1, cosine_neighbors1)
cosine_predictionByDeviation_2 = Deviation_for_mean2(
    train2_df, test2_df, cosine_weights_neighbor2, cosine_neighbors2)

#### step5.1.4 Entropy Prediction

In [None]:
# Deviation-from-mean predictions using the entropy neighbors (this cell
# must feed the entropy weights, not the cosine ones).
# The frames are the ones created in step 1: ms_train_df / ms_test_pd for
# dataset 1 and train2_df / test2_df for dataset 2.
# Dataset 2 goes through Deviation_for_mean2, mirroring the *1 / *2 helper
# pairing used by every other step (load_data, pearsonSimi, z_score, ...).
# NOTE(review): entropy_neighbors1/2 must come from unpacking the
# (weights, neighbors) pair returned by selecting_neighborhood, as step 4.1
# does for Pearson - confirm step 4.4 provides them.
entropy_predictionByDeviation_1 = Deviation_for_mean1(
    ms_train_df, ms_test_pd, entropy_weights_neighbor1, entropy_neighbors1)
entropy_predictionByDeviation_2 = Deviation_for_mean2(
    train2_df, test2_df, entropy_weights_neighbor2, entropy_neighbors2)

#### step5.1.5 Simrank Prediction

In [None]:
# NOTE(review): this cell sits under the "Simrank Prediction" heading but
# stores its result in the entropy_* variables and feeds the cosine
# neighbors; no Simrank weights are computed in the visible notebook
# (steps 2.4 and 4.5 are empty). Looks like a copy-paste placeholder -
# confirm before running, since it overwrites the entropy predictions.
# NOTE(review): train_df1/test_df1/train_df2/test_df2 and cosine_neighbors1/2
# are not assigned anywhere in the visible notebook.
entropy_predictionByDeviation_1 = Deviation_for_mean1(
    train_df1,
    test_df1,
    cosine_weights_neighbor1,
    cosine_neighbors1,
)
entropy_predictionByDeviation_2 = Deviation_for_mean1(
    train_df2,
    test_df2,
    cosine_weights_neighbor2,
    cosine_neighbors2,
)

#### step5.2 Z-score

In [None]:
from z_score import z_score1
from z_score import z_score2

#### step5.2.1 Pearson Prediction

In [None]:
# Z-score predictions using the Pearson neighbors.
# The frames are the ones created in step 1: ms_train_df / ms_test_pd for
# dataset 1 and train2_df / test2_df for dataset 2.
pearson_predictionByZscore_1 = z_score1(
    ms_train_df, ms_test_pd, pearson_weights_neighbor1, pearson_neighbors1)
pearson_predictionByZscore_2 = z_score2(
    train2_df, test2_df, pearson_weights_neighbor2, pearson_neighbors2)

#### step5.2.2 Pearson with Significance Weighting Prediction

In [None]:
# Z-score predictions using the significance-weighted Pearson neighbors
# (pearsonSig_*), not the plain Pearson ones.
# The frames are the ones created in step 1: ms_train_df / ms_test_pd for
# dataset 1 and train2_df / test2_df for dataset 2.
pearsonSig_predictionByZscore_1 = z_score1(
    ms_train_df, ms_test_pd, pearsonSig_weights_neighbor1, pearsonSig_neighbors1)
pearsonSig_predictionByZscore_2 = z_score2(
    train2_df, test2_df, pearsonSig_weights_neighbor2, pearsonSig_neighbors2)

#### step5.2.3 Vector Similarity Prediction

In [None]:
# Z-score predictions using the cosine neighbors.
# The frames are the ones created in step 1: ms_train_df / ms_test_pd for
# dataset 1 and train2_df / test2_df for dataset 2.
# NOTE(review): cosine_neighbors1/2 must come from unpacking the
# (weights, neighbors) pair returned by selecting_neighborhood, as step 4.1
# does for Pearson - confirm step 4.3 provides them.
cosine_predictionByZscore_1 = z_score1(
    ms_train_df, ms_test_pd, cosine_weights_neighbor1, cosine_neighbors1)
cosine_predictionByZscore_2 = z_score2(
    train2_df, test2_df, cosine_weights_neighbor2, cosine_neighbors2)

#### step5.2.4 Entropy Prediction

In [None]:
# Z-score predictions using the entropy neighbors.
# The frames are the ones created in step 1: ms_train_df / ms_test_pd for
# dataset 1 and train2_df / test2_df for dataset 2.
# NOTE(review): entropy_neighbors1/2 must come from unpacking the
# (weights, neighbors) pair returned by selecting_neighborhood, as step 4.1
# does for Pearson - confirm step 4.4 provides them.
entropy_predictionByZscore_1 = z_score1(
    ms_train_df, ms_test_pd, entropy_weights_neighbor1, entropy_neighbors1)
entropy_predictionByZscore_2 = z_score2(
    train2_df, test2_df, entropy_weights_neighbor2, entropy_neighbors2)

#### step5.2.5 Simrank Prediction

In [None]:
# NOTE(review): this cell sits under the "Simrank Prediction" heading but is
# identical to the entropy z-score cell; no Simrank weights are computed in
# the visible notebook (steps 2.4 and 4.5 are empty). Looks like a
# copy-paste placeholder - confirm before running.
# NOTE(review): train_df1/test_df1/train_df2/test_df2 and entropy_neighbors1/2
# are not assigned anywhere in the visible notebook.
entropy_predictionByZscore_1 = z_score1(
    train_df1,
    test_df1,
    entropy_weights_neighbor1,
    entropy_neighbors1,
)
entropy_predictionByZscore_2 = z_score2(
    train_df2,
    test_df2,
    entropy_weights_neighbor2,
    entropy_neighbors2,
)

#### step6 Evaluation

#### step6.1 Ranked Scoring

#### step6.1.1 Pearson Evaluation

In [None]:
# Ranked-scoring evaluation of the Pearson predictions.
# NOTE(review): ranked_score is only imported later, in the model-based
# section, and test_df1 / pred_pearson1 are not assigned anywhere in the
# visible notebook - confirm the intended names before running.
rs_pearson = ranked_score(test_df1, pred_pearson1, d=0.5, alpha=5)

#### step6.1.2 Pearson with Significance Weighting Evaluation

In [None]:
# Ranked-scoring evaluation of the significance-weighted Pearson predictions.
# NOTE(review): ranked_score is only imported later, in the model-based
# section, and test_df1 / pred_sig_pearson1 are not assigned anywhere in the
# visible notebook - confirm the intended names before running.
rs_sig_pearson = ranked_score(test_df1, pred_sig_pearson1, d=0.5, alpha=5)

#### step6.1.3 Vector Similarity Evaluation

In [None]:
# Ranked-scoring evaluation of the vector-similarity (cosine) predictions.
# NOTE(review): ranked_score is only imported later, in the model-based
# section, and test_df1 / pred_cosine1 are not assigned anywhere in the
# visible notebook - confirm the intended names before running.
rs_cosine = ranked_score(test_df1, pred_cosine1, d=0.5, alpha=5)

#### step6.1.4 Entropy Evaluation

In [None]:
# Ranked-scoring evaluation of the entropy predictions.
# NOTE(review): ranked_score is only imported later, in the model-based
# section, and test_df1 / pred_entropy1 are not assigned anywhere in the
# visible notebook - confirm the intended names before running.
rs_entropy = ranked_score(test_df1, pred_entropy1, d=0.5, alpha=5)

#### step6.2 Mean Absolute Error 

#### step6.2.1 Pearson Evaluation

In [None]:
# Mean-absolute-error evaluation of the Pearson predictions.
# NOTE(review): mean_absolute_error is not imported anywhere in the visible
# notebook, and test_df2 / pred_pearson2 are never assigned. The d / alpha
# keywords match the ranked_score calls - confirm this helper accepts them.
mae_pearson = mean_absolute_error(test_df2, pred_pearson2, d=0.5, alpha=5)

#### step6.2.2 Pearson with Significance Weighting Evaluation

In [None]:
# Mean-absolute-error evaluation of the significance-weighted Pearson
# predictions.
# NOTE(review): mean_absolute_error is not imported anywhere in the visible
# notebook, and test_df2 / pred_sig_pearson2 are never assigned. The d / alpha
# keywords match the ranked_score calls - confirm this helper accepts them.
mae_sig_pearson = mean_absolute_error(test_df2, pred_sig_pearson2, d=0.5, alpha=5)

#### step6.2.3 Vector Similarity Evaluation

In [None]:
# Mean-absolute-error evaluation of the vector-similarity (cosine) predictions.
# NOTE(review): mean_absolute_error is not imported anywhere in the visible
# notebook, and test_df2 / pred_cosine2 are never assigned. The d / alpha
# keywords match the ranked_score calls - confirm this helper accepts them.
mae_cosine = mean_absolute_error(test_df2, pred_cosine2, d=0.5, alpha=5)

#### step6.2.4 Entropy Evaluation

In [None]:
# Mean-absolute-error evaluation of the entropy predictions.
# NOTE(review): mean_absolute_error is not imported anywhere in the visible
# notebook, and test_df2 / pred_entropy2 are never assigned. The d / alpha
# keywords match the ranked_score calls - confirm this helper accepts them.
mae_entropy = mean_absolute_error(test_df2, pred_entropy2, d=0.5, alpha=5)

#### step6.2.5 Simrank Evaluation

In [None]:
# Mean-absolute-error evaluation of the Simrank predictions.
# NOTE(review): mean_absolute_error is not imported anywhere in the visible
# notebook, and test_df2 / pred_simrank are never assigned (no Simrank
# prediction is produced in the visible cells). The d / alpha keywords match
# the ranked_score calls - confirm this helper accepts them.
mae_simrank = mean_absolute_error(test_df2, pred_simrank, d=0.5, alpha=5)

#### step6.3 ROC Sensitivity

#### step6.3.1 Pearson Evaluation

In [None]:
# ROC-sensitivity evaluation of the Pearson predictions.
# NOTE(review): roc_sensitivity is not imported anywhere in the visible
# notebook, and test_df2 / pred_pearson2 are never assigned. The d / alpha
# keywords match the ranked_score calls - confirm this helper accepts them.
roc_pearson = roc_sensitivity(test_df2, pred_pearson2, d=0.5, alpha=5)

#### step6.3.2 Pearson with Significance Weighting Evaluation

In [None]:
# ROC-sensitivity evaluation of the significance-weighted Pearson predictions.
# NOTE(review): roc_sensitivity is not imported anywhere in the visible
# notebook, and test_df2 / pred_sig_pearson2 are never assigned. The d / alpha
# keywords match the ranked_score calls - confirm this helper accepts them.
roc_sig_pearson = roc_sensitivity(test_df2, pred_sig_pearson2, d=0.5, alpha=5)

#### step6.3.3 Vector Similarity Evaluation

In [None]:
# ROC-sensitivity evaluation of the vector-similarity (cosine) predictions.
# NOTE(review): roc_sensitivity is not imported anywhere in the visible
# notebook, and test_df2 / pred_cosine2 are never assigned. The d / alpha
# keywords match the ranked_score calls - confirm this helper accepts them.
roc_cosine = roc_sensitivity(test_df2, pred_cosine2, d=0.5, alpha=5)

#### step6.3.4 Entropy Evaluation

In [None]:
# ROC-sensitivity evaluation of the entropy predictions.
# NOTE(review): roc_sensitivity is not imported anywhere in the visible
# notebook, and test_df2 / pred_entropy2 are never assigned. The d / alpha
# keywords match the ranked_score calls - confirm this helper accepts them.
roc_entropy = roc_sensitivity(test_df2, pred_entropy2, d=0.5, alpha=5)

#### step6.3.5 Simrank Evaluation

In [None]:
# ROC-sensitivity evaluation of the Simrank predictions.
# NOTE(review): roc_sensitivity is not imported anywhere in the visible
# notebook, and test_df2 / pred_simrank are never assigned (no Simrank
# prediction is produced in the visible cells). The d / alpha keywords match
# the ranked_score calls - confirm this helper accepts them.
roc_simrank = roc_sensitivity(test_df2, pred_simrank, d=0.5, alpha=5)

# Model-based Algorithm 

#### step 1: load data (MS data is assigned for this algorithm)

In [None]:
import pandas as pd
import numpy as np
from load_data1 import load_data1

# Only columns 1-3 of the raw CSVs are read; column 0 is skipped.
ms_train = pd.read_csv('../data/MS_sample/data_train.csv', usecols=range(1, 4))
ms_test = pd.read_csv('../data/MS_sample/data_test.csv', usecols=range(1, 4))

# load_data1 is a project helper; its result is used as a pandas DataFrame.
ms_train_df = load_data1(ms_train)  # train in pandas dataframe
ms_test_pd = load_data1(ms_test)    # test in pandas dataframe

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same ndarray and exists on every pandas version.
ms_train_np = ms_train_df.values  # train in numpy matrix
ms_test_np = ms_test_pd.values    # test in numpy matrix


# save data
# (left disabled - presumably because step 1 of the memory-based section
# already writes the same files; re-enable if running this section alone)

#np.save('../output/train1_matrix',ms_train_np)
#np.save('../output/test1_matrix',ms_test_np)

#ms_train_df.to_csv('../output/train1_df.csv',header=True,index=True)
#ms_test_pd.to_csv('../output/test1_df.csv',header=True,index=True)

#### step 2: train mixture model using EM algorithm
#### Here we use 3 clusters for example. Later cross-validation is used for choosing the number of clusters

In [None]:
from cluster_model import train_cluster_model
from cluster_model import select_stable_models
import os

# Reload the artifacts written by the data-loading step.
X = np.load("../output/train1_matrix.npy")  # train in numpy matrix
train_df = pd.read_csv("../output/train1_df.csv", index_col=0)  # train in pandas dataframe
test_df = pd.read_csv("../output/test1_df.csv", index_col=0)  # test in pandas dataframe

# X: user matrix
#    each row Xi=[vi1,vi2,...,vin]
#    Each row is a user and records that user's votes
#
# k: number of clusters

# here take k=3 for example:
k = 3  # must be assigned before use; it also names the saved model file

# train mixture model; the helper returns three values (THETA, A, c)
THETA, A, c = train_cluster_model(X, k)

# save model as ../output/cluster_<k>_model.npz
filename = 'cluster_' + str(k) + '_model.npz'
base_dir = "../output"
np.savez(os.path.join(base_dir, filename), THETA=THETA, A=A, c=c)

#### step 3: prediction

In [None]:
from part2_prediction_matrix import mixture_model_prediction

# Load the saved k=3 model. np.load on an .npz archive returns an NpzFile
# that keeps the file open, so read the arrays inside a `with` block to make
# sure the handle is closed afterwards.
with np.load("../output/cluster_3_model.npz") as mixture_model:
    THETA = mixture_model['THETA']
    A = mixture_model['A']  # assign matrix

train_df = pd.read_csv("../output/train1_df.csv", index_col=0)

prediction_matrix = mixture_model_prediction(A, THETA)  # get prediction matrix

# transform numpy matrix to pandas dataframe, reusing the row/column labels
# of the training frame
prediction_df = pd.DataFrame(prediction_matrix)
prediction_df.index = train_df.index
prediction_df.columns = train_df.columns

prediction_df.to_csv("../output/train1_prediction_df.csv", header=True, index=True)


#### step 4: evaluation

In [None]:
from ranked_scoring import ranked_score

# Ranked score of the k=3 mixture-model predictions against the test frame.
cluster3_score = ranked_score(test_df, prediction_df)
print(cluster3_score)


#### step 5: choose the number of clusters
As shown above, we obtained the ranked score for k=3. We apply the same procedure to every candidate k and choose the number of clusters with the highest ranked score.
Note that `select_stable_models` is used here instead of a single training run. This function repeats the training 10 times and selects the model with the highest log-likelihood, because a model trained only once is sometimes not the optimal one; the best of 10 runs should be a stable model.

In [None]:
# K is a range of number of clusters.
# get the ranked_score for cluster 3 - cluster 20
# (the cell body must start at column 0; an indented first statement raises
# IndentationError)
K = range(3, 21)
cluster_scores = []
for k in K:
    # select_stable_models returns the model parameters plus the
    # log-likelihood of the selected model
    THETA, A, c, log_likelihood = select_stable_models(X, k)
    prediction_df = mixture_model_prediction(A, THETA)
    # transform numpy matrix to pandas dataframe, reusing the row/column
    # labels of the training frame
    prediction_df = pd.DataFrame(prediction_df)
    prediction_df.index = train_df.index
    prediction_df.columns = train_df.columns

    cluster_score = ranked_score(test_df, prediction_df)
    cluster_scores.append(cluster_score)
print(cluster_scores)


The output is :
[81.988519767745032,
 79.850197156108479,
 78.15185123921205,
 70.530292271372431,
 75.295554154425162,
 73.033652276173811,
 68.861695574890973,
 74.153784419504106,
 77.004353981220802,
 70.585141069716869,
 70.879506091305586,
 73.525475762373318,
 72.013816579444793,
 72.687744999995289,
 75.569944504156027,
 72.808830626227945,
 68.287269638291136,
 71.986778485624086]
 
 So we choose k=3 for our model, since it achieves the highest ranked score (about 81.99).