In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics.pairwise import paired_distances,cosine_similarity

In [3]:
articles = pd.read_csv(r"Article_test1.csv")
articles.drop(columns="genres",inplace=True)
df = pd.read_csv(r"Article_Rating_test1.csv")
df = pd.merge(df, articles, on='ArticleId')

In [4]:
def find_common_article(user1,user2):
    """找尋兩個uesr共同觀看過文章"""
    s1 = set((df.loc[df["userId"]==user1,"ArticleId"].values))
    s2 = set((df.loc[df["userId"]==user2,"ArticleId"].values))
    return s1.intersection(s2)

In [5]:
def cosine_similarity(vec1, vec2):
    """
    計算兩個向量之間的餘弦相似性
    :param vec1: 向量 a 
    :param vec2: 向量 b
    :return: sim
    """
    vec1 = np.mat(vec1)
    vec2 = np.mat(vec2)
    num = float(vec1 * vec2.T)
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    cos = num / denom
    sim = 0.5 + 0.5 * cos
    return sim

In [6]:
def cal_user_similarity_with_article_rating(user1,user2,Article_id):
    """計算兩個user對於特定文章評分的相似度"""
    u1 = df[df["userId"]==user1]
    u2 = df[df["userId"]==user2]
    vec1 = u1[u1.ArticleId.isin(Article_id)].sort_values(by="ArticleId")["Rating"].values
    vec2 = u2[u2.ArticleId.isin(Article_id)].sort_values(by="ArticleId")["Rating"].values
    return cosine_similarity(vec1, vec2)

In [7]:
def recommend(user,num):
    #find similarity between user and other uesr.
    user_similarity = [] 
    for other_user in df.userId.unique():
        if other_user == user:
            continue
        #print ("other user :",other_user)
        common_articles = find_common_article(user,other_user)
        sim = cal_user_similarity_with_article_rating(user,other_user,common_articles)
        user_similarity.append([other_user,sim])
    
    #find top 10 similar user  
    user_similarity = np.array(user_similarity)
    sorted_index = np.argsort(user_similarity, axis=0)[:,1][::-1][:10] 
    top10_similar_user = user_similarity[:,0][sorted_index]   
    
    #find the articles the user haven't seen
    seen_articles = df.loc[df["userId"]==user,"ArticleId"].values
    not_seen_articles = defaultdict(list) 
    for similar_user in top10_similar_user:
        articles = df.loc[df.userId==similar_user,["ArticleId","Rating"]].values.tolist()
        if isinstance(articles[0], list):   
            for a in articles:
                if a[0] in seen_articles:
                    continue
                not_seen_articles[a[0]].append(a[1])
                
        elif articles[0] not in seen_articles:
            print("here", articles[0], articles[1])
            not_seen_articles[articles[0]].append(articles[1])
                
    #average same movie rating
    for a in not_seen_articles:
        not_seen_articles[a] = np.mean(not_seen_articles[a])
    
    #get top 10 ratings by sorting it 
    #not_seen_articles.items() = [(articleID, [meanRating]), (articleID, [meanRating])]
    top10_rating = sorted(not_seen_articles.items(), key=lambda x: x[1], reverse=True)   

    return [article for article,rating in top10_rating][:num]



In [8]:
top10_article = recommend(1,5)
top10_article 

  if sys.path[0] == '':


[215, 70, 111, 31, 196]