In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics.pairwise import paired_distances,cosine_similarity

In [2]:
articles = pd.read_csv(r"D:\JupyterNotebook\Roommate\文章標籤興趣程度研究計畫回應\Article_test1.csv")
articles.drop(columns="genres",inplace=True)
df = pd.read_csv(r"D:\JupyterNotebook\Roommate\文章標籤興趣程度研究計畫回應\Article_Rating_test1.csv")
df = pd.merge(df, articles, on='ArticleId')

In [3]:
def find_common_article(user1,user2):
    """找尋兩個uesr共同觀看過文章"""
    s1 = set((df.loc[df["userId"]==user1,"ArticleId"].values))
    s2 = set((df.loc[df["userId"]==user2,"ArticleId"].values))
    return s1.intersection(s2)

In [4]:
def cosine_similarity(vec1, vec2):
    """
    計算兩個向量之間的餘弦相似性
    :param vec1: 向量 a 
    :param vec2: 向量 b
    :return: sim
    """
    vec1 = np.mat(vec1)
    vec2 = np.mat(vec2)
    num = float(vec1 * vec2.T)
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    cos = num / denom
    sim = 0.5 + 0.5 * cos
    return sim

In [5]:
def cal_user_similarity_with_article_rating(user1,user2,Article_id):
    """計算兩個user對於特定文章評分的相似度"""
    u1 = df[df["userId"]==user1]
    u2 = df[df["userId"]==user2]
    vec1 = u1[u1.ArticleId.isin(Article_id)].sort_values(by="ArticleId")["Rating"].values
    vec2 = u2[u2.ArticleId.isin(Article_id)].sort_values(by="ArticleId")["Rating"].values
    return cosine_similarity(vec1, vec2)

In [23]:
def recommend(user,num):
    #find similarity between user and other uesr.
    user_similarity = [] 
    for other_user in df.userId.unique():
        if other_user == user:
            continue
        #print ("other user :",other_user)
        common_articles = find_common_article(user,other_user)
        sim = cal_user_similarity_with_article_rating(user,other_user,common_articles)
        user_similarity.append([other_user,sim])
    
    #find top 10 similar user  
    user_similarity = np.array(user_similarity)
    sorted_index = np.argsort(user_similarity, axis=0)[:,1][::-1][:10]  #[:,1] 全部的sim    ([::-1] 逆序输出,连续输出)
    top10_similar_user = user_similarity[:,0][sorted_index]    #[:,0] 全部的user
    
    #find the articles the user haven't seen
    seen_articles = df.loc[df["userId"]==user,"ArticleId"].values
    not_seen_articles = defaultdict(list) 
    for similar_user in top10_similar_user:
        articles = df.loc[df.userId==similar_user,["ArticleId","Rating"]].values.tolist()
        if isinstance(articles[0], list):    #isinstance() 函数来判断一个对象是否是一个已知的类型，类似 type()
            for a in articles:
                if a[0] in seen_articles:
                    continue
                not_seen_articles[a[0]].append(a[1]) #{articleID : [rating, rating]}
                
        elif articles[0] not in seen_articles:
            print("here", articles[0], articles[1])
            not_seen_articles[articles[0]].append(articles[1])
                
    #average same movie rating
    for a in not_seen_articles:
        not_seen_articles[a] = np.mean(not_seen_articles[a])
    
    #get top 10 ratings by sorting it 
    #not_seen_articles.items() = [(articleID, [meanRating]), (articleID, [meanRating])]
    top10_rating = sorted(not_seen_articles.items(), key=lambda x: x[1], reverse=True)   # Key = meanRating 

    return [article for article,rating in top10_rating][:num]



In [24]:
top10_article = recommend(1,5)
top10_article 

  if sys.path[0] == '':


[(215, 7.0), (70, 7.0), (111, 7.0), (31, 7.0), (196, 7.0), (49, 7.0), (23, 7.0), (223, 7.0), (92, 6.0), (42, 6.0), (101, 6.0), (106, 6.0), (16, 6.0), (45, 6.0), (27, 6.0), (73, 6.0), (203, 6.0), (96, 6.0), (44, 5.5), (138, 5.5), (140, 5.0), (63, 5.0), (47, 5.0), (146, 5.0), (115, 5.0), (169, 5.0), (149, 5.0), (15, 5.0), (125, 5.0), (105, 5.0), (129, 5.0), (144, 4.5), (24, 4.5), (220, 4.5), (99, 4.5), (14, 4.0), (222, 4.0), (39, 4.0), (4, 4.0), (180, 3.5), (71, 3.0), (175, 3.0), (124, 3.0), (157, 3.0), (53, 3.0), (121, 3.0), (110, 2.5), (75, 2.0), (89, 2.0), (216, 2.0), (84, 2.0), (204, 2.0), (120, 2.0), (93, 2.0), (148, 2.0), (128, 1.0), (87, 1.0)]


[215, 70, 111, 31, 196]

In [13]:
user_similarity = [] 
for other_user in df.userId.unique():
        if other_user == 1:
            continue
        #print ("other user :",other_user)
        common_articles = find_common_article(1,other_user)
        sim = cal_user_similarity_with_article_rating(1,other_user,common_articles)
        user_similarity.append([other_user,sim])
user_similarity = np.array(user_similarity)

  if sys.path[0] == '':


In [14]:
user_similarity = np.array(user_similarity)
sorted_index = np.argsort(user_similarity, axis=0)[:,1][::-1][:10]
top10_similar_user = user_similarity[:,0][sorted_index]
seen_articles = df.loc[df["userId"]==1,"ArticleId"].values
not_seen_articles = defaultdict(list) 
for similar_user in top10_similar_user:
    articles = df.loc[df.userId==similar_user,["ArticleId","Rating"]].values.tolist()
    if isinstance(articles[0], list):    #isinstance() 函数来判断一个对象是否是一个已知的类型，类似 type()
        for a in articles:
            if a[0] in seen_articles:
                continue
            not_seen_articles[a[0]].append(a[1]) #{articleID : rating}
    elif articles[0] not in seen_articles:
        print("here", articles[0], articles[1])
        not_seen_articles[articles[0]].append(articles[1])

In [21]:
not_seen_articles[101] = np.mean(not_seen_articles[101])

In [32]:
not_seen_articles.items()

dict_items([(140, [5]), (92, [6]), (75, [2]), (89, [2]), (42, [7, 5]), (101, 6.0), (215, [7]), (63, [5]), (44, [4, 7]), (144, [5, 4]), (106, [6]), (70, [7]), (47, [5]), (16, [6]), (45, [6]), (24, [3, 6]), (71, [3]), (146, [5]), (216, [2]), (115, [5]), (175, [3]), (110, [4, 1]), (138, [4, 7]), (124, [3, 3]), (27, [6]), (14, [5, 3, 4]), (180, [2, 5]), (111, [7]), (84, [2]), (169, [5]), (128, [1]), (204, [2]), (73, [6]), (220, [5, 4]), (87, [1]), (149, [5]), (157, [3]), (222, [4]), (203, [6]), (39, [4]), (31, [7]), (196, [7]), (49, [7]), (99, [7, 2]), (23, [7]), (223, [7]), (4, [4]), (120, [2]), (15, [5]), (53, [3]), (125, [5]), (93, [2]), (105, [5]), (148, [2]), (96, [6]), (121, [3]), (129, [5]), (1, [])])

In [None]:
0