# Preprocessing data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the raw ratings file; it has no header row, so the column names are supplied here.
# NOTE(review): in the rows displayed by this notebook the first column holds values such as
# `0439381673` / `B01HJEBIAA` and the second holds values such as `A21ROB4YDOZA5P`. Those look
# like product identifiers and reviewer identifiers respectively, so the `userID` / `productID`
# names may be swapped -- confirm against the dataset description before interpreting results.
df = pd.read_csv('/content/drive/MyDrive/USTH/B2/ML2/Recommend System/Video_Games.csv',
                 header = None,
                 names=['userID','productID','Rating','timestamp'])
df.head()

Unnamed: 0,userID,productID,Rating,timestamp
0,439381673,A21ROB4YDOZA5P,1.0,1402272000
1,439381673,A3TNZ2Q5E7HTHD,3.0,1399680000
2,439381673,A1OKRM3QFEATQO,4.0,1391731200
3,439381673,A2XO1JFCNEYV3T,1.0,1391731200
4,439381673,A19WLPIRHD15TH,4.0,1389830400


In [None]:
df.userID.nunique()

71982

In [None]:
df.productID.nunique()

1540618

In [None]:
df.shape

(2565349, 4)

In [None]:
# the timestamp is not used by any later step, so discard that column
df = df.drop(columns=['timestamp'])
df

Unnamed: 0,userID,productID,Rating
0,0439381673,A21ROB4YDOZA5P,1.0
1,0439381673,A3TNZ2Q5E7HTHD,3.0
2,0439381673,A1OKRM3QFEATQO,4.0
3,0439381673,A2XO1JFCNEYV3T,1.0
4,0439381673,A19WLPIRHD15TH,4.0
...,...,...,...
2565344,B01HJEBIAA,ANGB54K3888S4,5.0
2565345,B01HJEBIAA,A3TEVKR0ZVQB2T,5.0
2565346,B01HJEBIAA,ABE7YPWEHNVJZ,5.0
2565347,B01HJEBIAA,A3ES9QBK3G192O,5.0


In [None]:
# flag every (userID, productID) row except the last occurrence of each pair
duplicated = df.duplicated(subset=['userID', 'productID'], keep='last')

In [None]:
# keep only the rows that were not flagged as earlier duplicates
df = df.loc[~duplicated]
df

Unnamed: 0,userID,productID,Rating
0,0439381673,A21ROB4YDOZA5P,1.0
1,0439381673,A3TNZ2Q5E7HTHD,3.0
2,0439381673,A1OKRM3QFEATQO,4.0
3,0439381673,A2XO1JFCNEYV3T,1.0
4,0439381673,A19WLPIRHD15TH,4.0
...,...,...,...
2565344,B01HJEBIAA,ANGB54K3888S4,5.0
2565345,B01HJEBIAA,A3TEVKR0ZVQB2T,5.0
2565346,B01HJEBIAA,ABE7YPWEHNVJZ,5.0
2565347,B01HJEBIAA,A3ES9QBK3G192O,5.0


In [None]:
# Drops every (userID, productID) pair that still occurs more than once (keep=False removes
# all copies). When the previous cell has already reduced each pair to its last occurrence,
# no pair repeats and this call leaves the frame unchanged.
df = df.drop_duplicates(subset=['userID', 'productID'], keep=False)

In [None]:
# Keep only products that received at least MIN_RATINGS_PER_PRODUCT ratings.
# (The threshold used by the code is 20; products with fewer ratings are removed.)
MIN_RATINGS_PER_PRODUCT = 20
product_count = df['productID'].value_counts()
remove = product_count[product_count < MIN_RATINGS_PER_PRODUCT].index
# select the column by name rather than by position so a column reorder cannot break the filter
df = df[~df['productID'].isin(remove)]
df.shape

(121505, 3)

In [None]:
# Keep only users that gave at least MIN_RATINGS_PER_USER ratings.
# (The threshold used by the code is 20; users with fewer ratings are removed.)
# NOTE(review): the product filter is not re-applied afterwards, so removing users here can
# push some products back below their own minimum rating count.
MIN_RATINGS_PER_USER = 20
value_count = df['userID'].value_counts()
to_remove = value_count[value_count < MIN_RATINGS_PER_USER].index
# select the column by name rather than by position so a column reorder cannot break the filter
df = df[~df['userID'].isin(to_remove)]
df.shape

(37846, 3)

In [None]:
# count the distinct users and products that survived the filters
n_users = df['userID'].nunique()
n_product = df['productID'].nunique()
print(f'{n_users} users')
print(f'{n_product} product')

1044 users
3182 product


In [None]:
# Create the user-item matrix: one row per userID, one column per productID, cells hold the
# Rating; pairs without a rating become NaN (see the displayed output).
new_df = df.pivot(index = 'userID',columns='productID',values='Rating')
new_df

productID,A1027EV8A9PV1O,A103KKI1Y4TFNQ,A105S56ODHGJEK,A10795M6XA74JN,A1084SNUEXOQJY,A10DZNGD2T2SP3,A10JDORKMGFFT0,A10JO6RV8UFYBV,A10JPTQTDNTV43,A10L7QIDE9UF11,...,AZJL03R8NDQ7C,AZKB5T3T2LTE8,AZL8QJZ9XPQH0,AZOD1JRXNWL7G,AZQ7O73EAS3VI,AZQYAXMAC6AJ4,AZVCNOOQK36DH,AZXHK8IO25FL6,AZYU8M791SIFC,AZZ3GFL1HTBLY
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B00000DMAQ,,,,,,,,,,,...,,,,,,,,,,
B00000DMAT,,,,,,,,,,,...,,,,,,,,,,
B00000DMAX,,,,,,,,,,,...,,,,,,,,,,
B00000DMB3,,,,,,,,,,,...,,,,,,,,,,
B00000F1GM,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B01GW3ODBU,,,,,,,,,,,...,,,,,,,,,,
B01GW3OHMK,,,,,,,,,,,...,,,,,,,,,,
B01GW3P6PC,,,,,,,,,,,...,,,,,,,,,,
B01GW3P9PE,,,,,,,,,,,...,,,,,,,,,,


# Train/test split

In [None]:
from sklearn.model_selection import train_test_split
# Split the ROWS of the user-item matrix (i.e. whole users), not individual ratings.
# train data: 80%
# test data: 20%
# random_state=0 fixes the split so it is the same on every run.
train_data, test_data = train_test_split(new_df,test_size=0.2,random_state=0)

# Drop test

In [None]:
def drop_samples(picked_user, group, n=5, random_state=None):
  """Randomly pick rated rows of one user to hold out for evaluation.

  Parameters
  ----------
  picked_user : label
      Key of the user inside ``group``.
  group : pandas GroupBy
      Long-format ratings grouped by user; ``get_group(picked_user)`` must
      return that user's rows.
  n : int, default 5
      Number of rows to sample (the original hard-coded value).
  random_state : int, numpy Generator/RandomState or None, default None
      Forwarded to ``DataFrame.sample``; pass a value to make the hold-out
      reproducible. ``None`` keeps the previous unseeded behaviour.

  Returns
  -------
  pandas.DataFrame
      ``n`` rows of the user without any NaN, keeping their original index.

  Raises
  ------
  ValueError
      Raised by ``DataFrame.sample`` when the user has fewer than ``n``
      complete rows.
  """
  # rows with NaN are unrated items, so only fully populated rows can be held out
  rated_rows = group.get_group(picked_user).dropna()
  return rated_rows.sample(n, random_state=random_state)

In [None]:
def remain(picked_user, group):
  """Return one user's long-format rows with a random hold-out blanked to NaN.

  Parameters
  ----------
  picked_user : label
      Key of the user inside ``group``.
  group : pandas GroupBy
      Long-format ratings grouped by user, with a ``value`` column.

  Returns
  -------
  pandas.DataFrame
      A copy of the user's rows in which the ``value`` of the rows chosen by
      ``drop_samples`` is NaN.

  Notes
  -----
  ``drop_samples`` is called here, so every call draws a NEW random sample;
  the rows blanked are not tied to any sample drawn earlier elsewhere.
  """
  # work on an explicit copy so the assignment below never targets a view of the grouped data
  user_rows = group.get_group(picked_user).copy()
  held_out = drop_samples(picked_user, group)
  # single .loc assignment instead of chained `frame['value'].loc[i] = ...`, which raises
  # SettingWithCopyWarning and does not write through under copy-on-write
  user_rows.loc[held_out.index, 'value'] = np.nan
  return user_rows

In [None]:
def get_drop_frame(group):
  """Collect the held-out ratings of every user in the module-level ``test_data``.

  For each label in ``test_data.index`` the rows returned by ``drop_samples``
  (which samples 5 rated rows per user) are gathered and stacked into a single
  long-format frame.
  """
  held_out_parts = [drop_samples(user, group) for user in test_data.index]
  return pd.concat(held_out_parts)

In [None]:
def create_new_test(drop_frame, group):
  """Build the test user-item matrix with the held-out ratings removed.

  Parameters
  ----------
  drop_frame : pandas.DataFrame
      Held-out ratings in long format; must contain ``userID`` and
      ``productID`` columns identifying the (user, product) pairs to hide.
  group : pandas GroupBy
      Long-format test ratings (``userID``, ``productID``, ``value``)
      grouped by ``userID``.

  Returns
  -------
  pandas.DataFrame
      Matrix indexed by ``userID`` with one column per ``productID`` in which
      exactly the pairs listed in ``drop_frame`` are NaN.

  Notes
  -----
  The previous implementation ignored ``drop_frame`` and blanked a freshly
  drawn random sample per user, so the ratings hidden from the model were
  not the ones later used for scoring. The pairs are now taken from
  ``drop_frame`` itself, and the users are taken from ``group`` instead of
  a module-level variable.
  """
  # pd.concat returns a new frame, so the assignment below cannot touch the grouped data
  remain_frame = pd.concat([user_rows for _, user_rows in group])

  # match on the (user, product) pair rather than on row labels so the result does not
  # depend on how drop_frame happens to be indexed
  held_out_pairs = pd.MultiIndex.from_frame(drop_frame[['userID', 'productID']])
  all_pairs = pd.MultiIndex.from_frame(remain_frame[['userID', 'productID']])
  remain_frame.loc[all_pairs.isin(held_out_pairs), 'value'] = np.nan

  new_test = remain_frame.pivot(index='userID',columns='productID',values='value')

  return new_test

In [None]:
# Reshape the test matrix to long format (one row per user/product cell, rating in `value`)
# and group the rows by user so the hold-out helpers can fetch one user at a time.
test_frame = test_data.copy().reset_index().melt(id_vars = 'userID')
group = test_frame.groupby('userID')

In [None]:
# Ratings held out from each test user; these are the ground truth used when scoring.
drop_frame = get_drop_frame(group)
drop_frame

Unnamed: 0,userID,productID,value
441199,B000A2R54M,A3IBXED14SKZ8Y,5.0
74404,B000A2R54M,A1EVV74UQYVKRY,5.0
174933,B000A2R54M,A201HVK6NWJJER,5.0
559075,B000A2R54M,AELK0E5DK7LIZ,5.0
573287,B000A2R54M,AHCVWPLA1O4X8,5.0
...,...,...,...
242648,B00GANWVJE,A2DDRD1JAGF5Z3,5.0
4179,B00GANWVJE,A11548ZOIDAELN,5.0
506406,B00GANWVJE,A4WVBCQWZ1U97,5.0
231571,B00GANWVJE,A2AY8FVZZOHDQB,4.0


In [None]:
# Test user-item matrix that the model is allowed to see (hold-out cells set to NaN).
new_test = create_new_test(drop_frame, group)
new_test

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drop2['value'].loc[i] = np.nan


productID,A1027EV8A9PV1O,A103KKI1Y4TFNQ,A105S56ODHGJEK,A10795M6XA74JN,A1084SNUEXOQJY,A10DZNGD2T2SP3,A10JDORKMGFFT0,A10JO6RV8UFYBV,A10JPTQTDNTV43,A10L7QIDE9UF11,...,AZJL03R8NDQ7C,AZKB5T3T2LTE8,AZL8QJZ9XPQH0,AZOD1JRXNWL7G,AZQ7O73EAS3VI,AZQYAXMAC6AJ4,AZVCNOOQK36DH,AZXHK8IO25FL6,AZYU8M791SIFC,AZZ3GFL1HTBLY
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B00000DMAT,,,,,,,,,,,...,,,,,,,,,,
B00000DMAX,,,,,,,,,,,...,,,,,,,,,,
B00000I1BY,,,,,,,,,,,...,,,,,,,,,,
B00000JRSB,,,,,5.0,,,,,,...,,,,,,,,,,5.0
B00000K125,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
B01AC3ZD06,,,,,,,,,,,...,,,,,,,,,,
B01CHU4IY4,,,,,,,,,,,...,,,,,,4.0,,,,
B01F9HMO2K,,,,,,,,,,,...,,,,,,,,,,
B01GD490UM,,,,,,,,,,,...,,,,,,,,,,


# Model class

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import silhouette_score
import seaborn as sns
from sklearn.metrics.pairwise import nan_euclidean_distances

class Hybrid():
    """Hybrid recommender: user-based filling, K-Means user clusters, item-item prediction.

    Workflow (each step is a separate method so it can be inspected in a notebook):

    1. ``fit``             - mean-centre each user's ratings and fill the missing cells
                             (similar-user weighted average first, row mean for the rest).
    2. ``silhouette_plot`` - choose the number of clusters by silhouette score (k = 2..10).
    3. ``k_mean``          - cluster the filled train users and assign test users to clusters.
    4. ``predict``/``score`` - predict the held-out ratings with item-item similarities of
                             the user's cluster and report the mean absolute error.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        User-item matrices (index ``userID``, columns ``productID``) with NaN for
        missing ratings. Both must have the same product columns.
    drop_frame : pandas.DataFrame
        Held-out ratings in long format with ``userID``, ``productID`` and ``value``.
    cluster : int, default 5
        Number of K-Means clusters (overwritten by ``silhouette_plot``).
    similar_user : int, default 10
        Maximum number of similar users used when filling the matrix.
    similar_item : int, default 5
        Number of most similar rated items used for one prediction.
    random_state : int or None, default None
        Seed forwarded to every ``KMeans``; ``None`` keeps the unseeded behaviour.
    """

    def __init__(self,train_data=train_data,test_data=new_test,drop_frame=drop_frame,cluster=5,similar_user=10,similar_item=5,random_state=None):
        self.train_data = train_data
        self.test_data = test_data
        self.drop_frame = drop_frame
        self.cluster = cluster
        self.similar_user = similar_user
        self.similar_item = similar_item
        self.random_state = random_state
        # minimum correlation for another user to count as "similar"
        self.threshold = 0.3
        # placeholders until fit() / k_mean() replace them with real results
        self.test_cbr_matrix = test_data
        self.train_cbr_matrix = train_data
        self.train_cluster = cluster
        self.test_cluster = cluster

    def picked_userid(self,userid, user_similarity):
        """Return a copy of the user-similarity matrix without the row of ``userid``."""
        return user_similarity.drop(index=userid)

    def top_similar_user(self,picked_user_similarity,userid):
        """Return up to ``self.similar_user`` users most correlated with ``userid``.

        Only users whose similarity is strictly above ``self.threshold`` qualify.
        The result is a Series (index: other users, values: similarity), sorted
        from most to least similar.
        """
        n = self.similar_user
        similarity_to_user = picked_user_similarity[userid]
        similar_users = similarity_to_user[similarity_to_user > self.threshold].sort_values(ascending=False)[:n]
        return similar_users

    def weight_similar_user(self,userid, matrix_norm, similar_users):
        """Score the items ``userid`` has not rated from the ratings of similar users.

        For every item rated by at least one similar user (and not by ``userid``)
        the score is the similarity-weighted average of the similar users'
        ratings, using only the users that rated that item.

        Returns a DataFrame with columns ``title`` (item label) and
        ``movie_score``; it is empty when there is nothing to score.
        """
        # items the target user has already rated
        picked_userid_watched = matrix_norm[matrix_norm.index == userid].dropna(axis=1, how='all')

        # ratings of the similar users, without items nobody among them rated and without
        # the target user's own items (errors='ignore': some may already be gone).
        # Non-inplace drop: the in-place version operated on a slice and raised
        # SettingWithCopyWarning.
        similar_user_movies = (
            matrix_norm[matrix_norm.index.isin(similar_users.index)]
            .dropna(axis=1, how='all')
            .drop(columns=picked_userid_watched.columns, errors='ignore')
        )

        # numerator: sum of similarity * rating over the users that rated the item
        weighted_sum = similar_user_movies.mul(similar_users, axis=0).sum(axis=0)
        # denominator: sum of the similarities of exactly those users
        weight_total = similar_user_movies.notna().mul(similar_users, axis=0).sum(axis=0)
        scores = weighted_sum / weight_total

        item_score = pd.DataFrame({'title': scores.index.to_numpy(),
                                   'movie_score': scores.to_numpy()})
        return item_score

    def fill(self,data):
        """Return the mean-centred matrix of ``data`` with every missing cell filled.

        Steps: subtract each user's mean rating, correlate users, fill unrated
        items with the similar-user weighted score, then fill whatever is still
        missing with the row mean of the partially filled matrix.
        """
        # normalise: subtract each user's own mean rating
        matrix_norm = data.subtract(data.mean(axis=1), axis = 'rows')

        # user-user correlation matrix
        user_similarity = matrix_norm.T.corr()

        cbr_matrix = matrix_norm.copy()
        for userid in matrix_norm.index:
            picked_user_similarity = self.picked_userid(userid, user_similarity)
            similar_users = self.top_similar_user(picked_user_similarity,userid)
            item_score = self.weight_similar_user(userid, matrix_norm, similar_users)

            # single .loc assignment; the chained `frame[col].loc[row] = ...` form is not
            # guaranteed to write into cbr_matrix
            if not item_score.empty:
                cbr_matrix.loc[userid, item_score['title'].to_numpy()] = item_score['movie_score'].to_numpy()

        # remaining NaN: fill with the mean of the user's row
        cbr_matrix = cbr_matrix.T.fillna(cbr_matrix.mean(axis=1)).T
        return cbr_matrix

    def fit(self):
        """Fill the train and the test matrix (see ``fill``)."""
        self.train_cbr_matrix = self.fill(self.train_data)
        self.test_cbr_matrix = self.fill(self.test_data)

    def silhouette_plot(self):
        """Pick the number of clusters with the highest silhouette score.

        Tries k = 2..10 on the filled train matrix, prints the scores ordered
        from worst to best and stores the best k in ``self.cluster``.
        Despite the name, nothing is plotted.
        """
        silhouette = {}
        for k in range(2,11):
            km = KMeans(n_clusters=k, random_state=self.random_state)
            km.fit(self.train_cbr_matrix)
            silhouette[k] = silhouette_score(self.train_cbr_matrix, km.labels_)
        converted_dict = dict(sorted(silhouette.items(), key=lambda x:x[1]))
        print(converted_dict)
        # sorted ascending, so the last key has the highest score
        self.cluster = list(converted_dict)[-1]
        print('Number of cluster: ', self.cluster)

    def k_mean(self):
        """Cluster the train users and assign every test user to one of those clusters."""
        kmeans = KMeans(n_clusters = self.cluster, init = 'k-means++', random_state=self.random_state)
        self.train_cluster = kmeans.fit_predict(self.train_cbr_matrix)
        # predict(), not fit_predict(): re-fitting on the test matrix would create new,
        # unrelated clusters whose labels cannot be looked up among the train clusters.
        # Requires the test matrix to have the same product columns as the train matrix.
        self.test_cluster = kmeans.predict(self.test_cbr_matrix)

    def finding_cluster(self,group):
        """Return the item-item correlation matrix of the train users in cluster ``group``."""
        # labels are in the row order of train_cbr_matrix, so select rows positionally with a
        # mask instead of pairing the labels with a sorted list of user IDs
        in_cluster = np.asarray(self.train_cluster) == group
        cluster_frame = self.train_cbr_matrix.loc[in_cluster]
        item_similarity = cluster_frame.corr()
        return item_similarity

    def finding_cluster_for_a_user_in_test(self,picked_user):
        """Return the item-item correlation matrix of the cluster a test user belongs to."""
        # test_cluster is in the row order of test_cbr_matrix
        users_clusters = pd.Series(np.asarray(self.test_cluster), index=self.test_cbr_matrix.index)
        return self.finding_cluster(users_clusters.loc[picked_user])

    def _rated_products(self, picked_user):
        """Return the visible test ratings of a user as columns ``productID`` / ``Rating``,
        highest rating first."""
        return (
            self.test_data.loc[picked_user]
            .dropna()
            .sort_values(ascending=False)
            .rename('Rating')
            .reset_index()
        )

    def _weighted_rating(self, rated_products, item_similarity, product):
        """Predict one rating as the similarity-weighted average of the user's ratings of
        the ``self.similar_item`` rated items most similar to ``product`` (6 decimals)."""
        product_similarity_score = (
            item_similarity[[product]]
            .reset_index()
            .rename(columns={product: 'similarity_score'})
        )
        most_similar_rated = pd.merge(left=rated_products,
                                      right=product_similarity_score,
                                      on='productID',
                                      how='inner')\
                               .sort_values('similarity_score', ascending=False)[:self.similar_item]
        return round(np.average(most_similar_rated['Rating'],
                                weights=most_similar_rated['similarity_score']), 6)

    def recommend_one_user(self,picked_userid):
        """Predict a score for every item the test user has no visible rating for.

        Returns a DataFrame with columns ``productID`` and ``score`` sorted from
        the highest to the lowest predicted score.
        """
        item_similarity = self.finding_cluster_for_a_user_in_test(picked_userid)

        user_ratings = self.test_data.loc[picked_userid]
        picked_userid_unwatched = user_ratings[user_ratings.isna()].index.tolist()
        picked_userid_watched = self._rated_products(picked_userid)

        rating_prediction = {
            picked_movie: self._weighted_rating(picked_userid_watched, item_similarity, picked_movie)
            for picked_movie in picked_userid_unwatched
        }

        prediction = pd.DataFrame(list(rating_prediction.items()), columns = ['productID', 'score'])
        # rank prediction --> best items for user first
        prediction = prediction.sort_values(by = 'score', ascending = False)

        return prediction

    def user_rating(self):
        """Return a copy of the test matrix with every missing cell replaced by its prediction."""
        # use the matrix held by this instance rather than a module-level variable
        rated_matrix = self.test_data.copy()
        for picked_userid in rated_matrix.index:
            prediction = self.recommend_one_user(picked_userid)
            if not prediction.empty:
                # one .loc assignment per user; chained `.loc[row][col] = ...` may write to a copy
                rated_matrix.loc[picked_userid, prediction['productID'].to_numpy()] = prediction['score'].to_numpy()

        return rated_matrix

    def predict(self):
        """Predict every held-out rating and store it in ``drop_frame['predict']``.

        Returns ``self.drop_frame`` itself (the column is added in place).
        Predictions are produced user by user, so the rows of ``drop_frame``
        are expected to be contiguous per user (as built by concatenating one
        block per user) for the values to line up with their rows.
        """
        predicted = []
        # held-out products per user, computed once instead of once per user
        dropped_by_user = self.drop_frame.groupby('userID')['productID'].apply(list)

        for picked_user in self.drop_frame.userID.unique():
            item_similarity = self.finding_cluster_for_a_user_in_test(picked_user)
            rated_products = self._rated_products(picked_user)

            for product in dropped_by_user.loc[picked_user]:
                predicted.append(self._weighted_rating(rated_products, item_similarity, product))

        self.drop_frame['predict'] = predicted
        return self.drop_frame

    def score(self):
        """Return the mean absolute error between held-out and predicted ratings."""
        df = self.predict()
        score = mean_absolute_error(df['value'], df['predict'])
        return score

In [None]:
from sklearn.model_selection import KFold
# 5-fold cross-validation over the rows (users) of the user-item matrix.
# No shuffle argument is passed, so KFold's default ordering applies.
kf = KFold(n_splits = 5)
kf.get_n_splits(new_df)

5

In [None]:
# Cross-validate the hybrid model: one MAE per fold is collected in `mae`.
mae = []
for i, (train_index, test_index) in enumerate(kf.split(new_df)):
  print(f'Fold {i}: ')
  # train dataset
  train_data = new_df.iloc[train_index]
  # test dataset
  # `test_data` is deliberately rebound at module level: get_drop_frame iterates over the
  # global `test_data.index`, so it must point at the current fold before the calls below.
  test_data = new_df.iloc[test_index]
  # drop test: long format grouped by user, then hold out ratings per user
  test_frame = test_data.copy().reset_index().melt(id_vars = 'userID')
  group = test_frame.groupby('userID')

  drop_frame = get_drop_frame(group)
  new_test = create_new_test(drop_frame, group)

  # order matters: fit() fills the matrices, silhouette_plot() overwrites hybrid.cluster
  # with the best k in 2..10, k_mean() clusters with that k, score() returns the MAE
  hybrid = Hybrid(train_data,new_test,drop_frame, similar_user = 20, similar_item = 5)
  hybrid.fit()
  hybrid.silhouette_plot()
  hybrid.k_mean()
  s = hybrid.score()
  mae.append(s)
mae

Fold 0: 


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drop2['value'].loc[i] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_user_movies.drop(picked_userid_watched.columns,axis=1, inplace=True, errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_user_movies.drop(picked_userid_watched.columns,axis=1, inplace=True, errors='ignore')


{8: 0.006792836043630959, 9: 0.01562570239363254, 6: 0.018609918424917233, 10: 0.020477966384141, 3: 0.02120767941968342, 7: 0.023384166960699716, 5: 0.02413724554780249, 4: 0.025227178869405006, 2: 0.027238086546670686}
Number of cluster:  2




Fold 1: 


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drop2['value'].loc[i] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_user_movies.drop(picked_userid_watched.columns,axis=1, inplace=True, errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_user_movies.drop(picked_userid_watched.columns,axis=1, inplace=True, errors='ignore')


{9: -0.011268909055748108, 7: 0.0002587454127613108, 6: 0.00410066407977371, 8: 0.01619977441616143, 10: 0.023137811091863643, 4: 0.023385426981508573, 3: 0.02850142789052833, 2: 0.02889156047552906, 5: 0.029330676767830587}
Number of cluster:  5




Fold 2: 


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drop2['value'].loc[i] = np.nan


{9: 0.0008748563215632259, 7: 0.011234015133795296, 10: 0.012811274634430433, 8: 0.017637688983654903, 6: 0.027674021678602244, 3: 0.03545288445619658, 4: 0.037129486529328765, 2: 0.039719084820289595, 5: 0.039885538730057485}
Number of cluster:  5




Fold 3: 


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drop2['value'].loc[i] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_user_movies.drop(picked_userid_watched.columns,axis=1, inplace=True, errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_user_movies.drop(picked_userid_watched.columns,axis=1, inplace=True, errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

{10: 0.0005044835041900443, 8: 0.012589858903790557, 7: 0.012831788822116857, 9: 0.016329949302737033, 6: 0.016666461145643, 5: 0.017920016993691237, 4: 0.03128832755524262, 2: 0.04690935980489496, 3: 0.04879611917236303}
Number of cluster:  3




Fold 4: 


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drop2['value'].loc[i] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_user_movies.drop(picked_userid_watched.columns,axis=1, inplace=True, errors='ignore')


{10: -0.004529926815647921, 9: 0.011518113596722175, 7: 0.012940780694231954, 8: 0.015692679570753692, 4: 0.02369071907264642, 3: 0.03206756864581182, 5: 0.032488952066980936, 6: 0.03450513995989409, 2: 0.03947114303150895}
Number of cluster:  2




[0.4117135990430622,
 0.42150494258373206,
 0.47094143732057414,
 0.35557163636363637,
 0.4003119019230769]

In [None]:
# Rebuild a model outside the loop. `train_data`, `new_test` and `drop_frame` are whatever the
# LAST cross-validation fold left in the namespace, so this cell only works after that loop.
# `drop_frame` already carries the `predict` column added by Hybrid.predict() in that fold.
hybrid = Hybrid(train_data,new_test,drop_frame, similar_user = 20, similar_item = 5)
hybrid.fit()
hybrid.silhouette_plot()
hybrid.k_mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_user_movies.drop(picked_userid_watched.columns,axis=1, inplace=True, errors='ignore')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_user_movies.drop(picked_userid_watched.columns,axis=1, inplace=True, errors='ignore')


{9: 0.006900469814549752, 7: 0.007533359143954657, 8: 0.008938061165442547, 10: 0.013001588909749787, 6: 0.019806611633433134, 5: 0.02844680703722518, 2: 0.03214314050632151, 3: 0.03456110994703224, 4: 0.035778585292536726}
Number of cluster:  4




In [None]:
# Example: ranked predictions for one test user. The ID is looked up with
# `self.test_data.loc[...]`, so it must be present in the index of `hybrid.test_data`.
hybrid.recommend_one_user('B000A2R54M')

Unnamed: 0,productID,score
0,A1027EV8A9PV1O,5.000000
2370,A48LVERWAW38V,5.000000
2364,A3WZMVT4ZMAT4,5.000000
1223,A2GPOBDJ70O0TP,5.000000
1222,A2GN82I592BUD6,5.000000
...,...,...
772,A1XIKZOVZA0W67,4.074203
1242,A2HM2RMX1CBZGM,4.065592
354,A1EY1XBSU3PW5Z,4.061361
2736,AJ2OU10N9SXF8,3.945316


In [None]:
# Average MAE over the cross-validation folds collected in `mae`.
from statistics import mean
mean(mae)

0.41200870344681634