# 餘弦計算

In [152]:
import numpy as np
import pandas as pd
import pyodbc

from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr

In [28]:
# Plain Python lists (this cell is expected to fail)
a = [1, 2, 3, 4, 5]
b = [3, 4, 3, 6, 9]

for vec in (a, b):
    print('Type = {}, Len = {}'.format(type(vec), len(vec)))

# Raises "ValueError: Expected 2D array, got 1D array instead".
# Each list holds its 5 numbers directly at the top level, so it is a flat
# 1-D sequence of 5 elements. The numbers of each list must first be wrapped
# into a single "group" (one row) before the similarity can be computed.
print(cosine_similarity(a, b))


Type = <class 'list'>, Len = 5
Type = <class 'list'>, Len = 5


ValueError: Expected 2D array, got 1D array instead:
array=[1. 2. 3. 4. 5.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [29]:
# Plain Python lists - corrected version
# Each vector is wrapped in an outer list, so `a` and `b` each contain exactly
# one group (one row) made of the 5 original numbers.
a = [[1, 2, 3, 4, 5]]
b = [[3, 4, 3, 6, 9]]

for vec in (a, b):
    print('Type = {}, Len = {}'.format(type(vec), len(vec)))

# With the 2-D (list-of-lists) input the call no longer raises the
# "Expected 2D array, got 1D array instead" error and prints a 1x1 matrix.
print(cosine_similarity(a, b))

Type = <class 'list'>, Len = 1
Type = <class 'list'>, Len = 1
[[0.9766078]]


In [37]:
# Difference between a plain list and a NumPy array

a_list = [1, 3, 5]
a_array = np.array(a_list)  # same values, stored as an ndarray

# Show the values first, then the type of each container.
print(a_list, a_array)
print(*map(type, (a_list, a_array)))

[1, 3, 5] [1 3 5]
<class 'list'> <class 'numpy.ndarray'>


# 電影資料實測

In [45]:
# Open an ODBC connection to the MLDATASET database on the local SQL Server instance.
conn = pyodbc.connect('DRIVER={SQL Server}; SERVER=(local); DATABASE=MLDATASET;')

In [41]:
# Query text: select every column of the MOVIE_SUMMARY table.
SqlStr = 'SELECT * FROM MOVIE_SUMMARY'

In [46]:
# Run the query through the connection `conn` and load the result into a DataFrame.
movie = pd.read_sql(sql=SqlStr, con=conn)

In [48]:
# Work on a copy so the loaded frame `movie` is left untouched.
df = movie.copy()

In [49]:
# Preview the first rows of the working copy.
df.head()

Unnamed: 0,userId,ACTION,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film,Horror,Musical,Mystery,Romance,Sci,Thriller,War,Western
0,420.0,3.0,2.0,1.0,0.0,7.0,4.0,0.0,20.0,0.0,2.0,1.0,1.0,7.0,8.0,2.0,9.0,3.0,0.0
1,819.0,4.0,2.0,0.0,2.0,9.0,5.0,0.0,14.0,0.0,1.0,0.0,1.0,3.0,8.0,1.0,5.0,3.0,1.0
2,568.0,8.0,5.0,0.0,2.0,21.0,10.0,1.0,41.0,1.0,5.0,2.0,5.0,9.0,17.0,4.0,12.0,8.0,2.0
3,186.0,30.0,13.0,5.0,10.0,13.0,13.0,0.0,25.0,2.0,3.0,2.0,4.0,10.0,7.0,7.0,51.0,2.0,3.0
4,702.0,16.0,9.0,1.0,2.0,6.0,2.0,0.0,6.0,0.0,0.0,4.0,3.0,2.0,3.0,11.0,8.0,4.0,0.0


In [154]:
# Choose the reference user, then split the table around it.
UID = 402
c1 = df.loc[df['userId'].eq(UID)].iloc[:, 1:]  # reference user's rows, first column dropped
r1 = df.loc[df['userId'].ne(UID)]              # every other user, all columns kept

In [230]:
# Compare the reference user (c1) with every other user (r1) in a single pass
# instead of re-filtering r1 and re-concatenating the result for every row.
# Result columns: 0 = userId, 1 = cosine similarity, 2 = Pearson correlation.
# Rows get a normal 0..n-1 index (the per-row concat labelled every row 0).
if len(r1):
    other_features = r1.iloc[:, 1:]  # drop the leading userId column
    cosDF = pd.DataFrame({
        0: r1['userId'].values,
        # One batched call; row 0 of the result belongs to the first row of c1.
        1: cosine_similarity(c1, other_features)[0],
        # pearsonr works on one pair of vectors at a time.
        2: [pearsonr(c1.values[0], row)[0] for row in other_features.values],
    })
else:
    # Nobody to compare against: keep the empty frame the loop used to leave.
    cosDF = pd.DataFrame()

In [231]:
# Display the similarity table (column 0 = userId, 1 = cosine, 2 = Pearson).
cosDF

Unnamed: 0,0,1,2
0,420.0,0.894439,0.819205
0,819.0,0.940726,0.891081
0,568.0,0.928161,0.868096
0,186.0,0.793413,0.606798
0,702.0,0.813479,0.620758
...,...,...,...
0,406.0,0.971058,0.940336
0,791.0,0.903990,0.792427
0,39.0,0.906789,0.838711
0,473.0,0.871545,0.802111


In [None]:
# 撰寫為函數

In [223]:
def UB_CF(data, col, uid):
    """Score how similar every other row of ``data`` is to the row ``uid``.

    Every column except ``col`` is treated as a numeric feature. The row whose
    ``col`` value equals ``uid`` is the reference; each remaining row is
    compared with it using cosine similarity and the Pearson correlation.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per entity. It is not modified.
    col : str
        Name of the identifier column (ID / name / primary key). It may sit
        at any position in ``data``.
    uid
        Identifier of the reference row. If several rows match, the first one
        is used as the reference.

    Returns
    -------
    dict
        ``{'ID_TAG': uid, 'Similar_martrix': frame}`` where ``frame`` has the
        columns ``'ID'``, ``'Cos'`` and ``'Peas'``, one row per non-reference
        row of ``data`` in the original order, with a fresh 0..n-1 index.
        The frame is empty when ``data`` holds no row other than ``uid``.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``data``.
    ValueError
        If there are rows to compare but none of them has ``col == uid``.
    """
    columns = ['ID', 'Cos', 'Peas']

    ids = data[col]
    features = data.drop(col, axis=1)  # returns a new frame; `data` is untouched

    others = ids != uid
    if not others.any():
        # Nothing to compare against (also covers an empty `data`).
        similar = pd.DataFrame(columns=columns)
        return {'ID_TAG': uid, 'Similar_martrix': similar}

    reference = features[ids == uid]
    if reference.empty:
        raise ValueError('uid {!r} not found in column {!r}'.format(uid, col))

    reference = reference.iloc[[0]]  # keep it 2-D: one sample, n features
    reference_vector = reference.values[0]
    other_features = features[others]

    similar = pd.DataFrame({
        'ID': ids[others].values,
        # Cosine similarity: one batched call against all other rows.
        'Cos': cosine_similarity(reference, other_features)[0],
        # Pearson correlation: pearsonr compares one pair of vectors at a time.
        'Peas': [pearsonr(reference_vector, row)[0]
                 for row in other_features.values],
    }, columns=columns)

    # Two keys: the reference ID and the similarity frame. The key spelling
    # 'Similar_martrix' is kept because callers index the result with it.
    return {'ID_TAG': uid, 'Similar_martrix': similar}

In [236]:
# Example call: compare every other user in `movie` with userId 402.
UB_CF(movie, 'userId', 402)

{'ID_TAG': 402,
 'Similar_martrix':        ID       Cos      Peas
 0   420.0  0.894439  0.819205
 0   819.0  0.940726  0.891081
 0   568.0  0.928161  0.868096
 0   186.0  0.793413  0.606798
 0   702.0  0.813479  0.620758
 ..    ...       ...       ...
 0   406.0  0.971058  0.940336
 0   791.0  0.903990  0.792427
 0    39.0  0.906789  0.838711
 0   473.0  0.871545  0.802111
 0   925.0  0.637800  0.383055
 
 [942 rows x 3 columns]}

In [237]:
# Keep the returned dictionary so its two entries can be read separately.
t1 = UB_CF(movie, 'userId', 402)

In [238]:
# The reference ID that was passed in as `uid`.
t1['ID_TAG']

402

In [239]:
# The similarity DataFrame with columns ID / Cos / Peas.
t1['Similar_martrix']

Unnamed: 0,ID,Cos,Peas
0,420.0,0.894439,0.819205
0,819.0,0.940726,0.891081
0,568.0,0.928161,0.868096
0,186.0,0.793413,0.606798
0,702.0,0.813479,0.620758
...,...,...,...
0,406.0,0.971058,0.940336
0,791.0,0.903990,0.792427
0,39.0,0.906789,0.838711
0,473.0,0.871545,0.802111
