# 餘弦計算

In [256]:
reset -f

In [257]:
import numpy as np
import pandas as pd
import pyodbc

from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr

In [28]:
# 一般資料
a = [1,2,3,4,5]
b = [3,4,3,6,9]

print('Type = {}, Len = {}'.format(type(a), len(a)))
print('Type = {}, Len = {}'.format(type(b), len(b)))

print(cosine_similarity(a,b))

# 程式無法執行的原因在於：Expected 2D array, got 1D array instead
# a, b 二個各自的元素，因為是存於list資料結構下的第一層，所以有各有5個元素
# 需將各自資料結構內的元素，變成「一組」才能計算…


Type = <class 'list'>, Len = 5
Type = <class 'list'>, Len = 5


ValueError: Expected 2D array, got 1D array instead:
array=[1. 2. 3. 4. 5.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [29]:
# 一般資料-修正
a = [1,2,3,4,5]
b = [3,4,3,6,9]

a = [a] # 將 a 的元素，變成一個list，資料下的一組
b = [b] # 將 b 的元素，變成一個list，資料下的一組

print('Type = {}, Len = {}'.format(type(a), len(a)))
print('Type = {}, Len = {}'.format(type(b), len(b)))

print(cosine_similarity(a,b))

# 程式無法執行的原因在於：Expected 2D array, got 1D array instead
# a, b 二個各自的元素，因為是存於list資料結構下的第一層，所以有各有5個元素
# 需將各自資料結構內的元素，變成「一組」才能計算…

Type = <class 'list'>, Len = 1
Type = <class 'list'>, Len = 1
[[0.9766078]]


In [37]:
# 一般資料 & array 差異

a_list = [1, 3, 5]
a_array = np.array([1, 3, 5])

print(a_list, a_array)
print(type(a_list), type(a_array))

[1, 3, 5] [1 3 5]
<class 'list'> <class 'numpy.ndarray'>


# 電影資料實測

In [258]:
conn = pyodbc.connect('DRIVER={SQL Server}; SERVER=(local); DATABASE=MLDATASET;')

In [259]:
SqlStr = 'SELECT * FROM MOVIE_SUMMARY'

In [260]:
movie = pd.read_sql(sql=SqlStr, con=conn)

In [261]:
df = movie.copy()

In [262]:
df.head()

Unnamed: 0,userId,ACTION,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film,Horror,Musical,Mystery,Romance,Sci,Thriller,War,Western
0,420.0,3.0,2.0,1.0,0.0,7.0,4.0,0.0,20.0,0.0,2.0,1.0,1.0,7.0,8.0,2.0,9.0,3.0,0.0
1,819.0,4.0,2.0,0.0,2.0,9.0,5.0,0.0,14.0,0.0,1.0,0.0,1.0,3.0,8.0,1.0,5.0,3.0,1.0
2,568.0,8.0,5.0,0.0,2.0,21.0,10.0,1.0,41.0,1.0,5.0,2.0,5.0,9.0,17.0,4.0,12.0,8.0,2.0
3,186.0,30.0,13.0,5.0,10.0,13.0,13.0,0.0,25.0,2.0,3.0,2.0,4.0,10.0,7.0,7.0,51.0,2.0,3.0
4,702.0,16.0,9.0,1.0,2.0,6.0,2.0,0.0,6.0,0.0,0.0,4.0,3.0,2.0,3.0,11.0,8.0,4.0,0.0


In [154]:
UID = 402
c1 = df[df['userId'] == UID].iloc[:,1:]
r1 = df[df['userId'] != UID]

In [230]:
cosDF = pd.DataFrame()
for i in r1['userId']:
    r2 = r1[r1['userId'] == i].iloc[:,1:]
    cos = pd.DataFrame(zip(r1[r1['userId'] == i]['userId'],cosine_similarity(c1, r2)[0], [pearsonr(c1.values[0], r2.values[0])[0]]))
    cosDF = pd.concat([cosDF, cos])

In [231]:
cosDF

Unnamed: 0,0,1,2
0,420.0,0.894439,0.819205
0,819.0,0.940726,0.891081
0,568.0,0.928161,0.868096
0,186.0,0.793413,0.606798
0,702.0,0.813479,0.620758
...,...,...,...
0,406.0,0.971058,0.940336
0,791.0,0.903990,0.792427
0,39.0,0.906789,0.838711
0,473.0,0.871545,0.802111


In [None]:
# 撰寫為函數

In [263]:
def UB_CF(data, col, uid):
    
    CP_DF = pd.DataFrame() # 建立餘弦與皮爾森相似資料框
    
    df = data.copy()
    ist = df[col]
    df = df.drop(col, axis = 1)
    df.insert(0, col, ist)          # 將資料的「ID」、「人名」、「主鍵」，唯一識別器放置資料第一順位 
    
    t1 = df[df[col] == uid].iloc[:,1:] # 取出要參照的 ID 資料
    
    # 計算相似性
    for i in df[df[col] != uid][col]:
        t2 = df[df[col] == i].iloc[:,1:]
        t3 = pd.DataFrame(zip(df[df[col] == i][col]
                              , cosine_similarity(t1, t2)[0]                # 餘弦相關
                              , [pearsonr(t1.values[0], t2.values[0])[0]])  # 皮爾森相關
                         , columns=['ID', 'Cos', 'Peas'])       
        CP_DF = pd.concat([CP_DF, t3], axis = 0)
    
    return {'ID_TAG':uid,'Similar_martrix':CP_DF} # 將資料以字典方式存出為二個KEY值；一為參照ID，二為相似資料框架

In [264]:
UB_CF(data = movie, col = 'userId', uid = 402)

{'ID_TAG': 402,
 'Similar_martrix':        ID       Cos      Peas
 0   420.0  0.894439  0.819205
 0   819.0  0.940726  0.891081
 0   568.0  0.928161  0.868096
 0   186.0  0.793413  0.606798
 0   702.0  0.813479  0.620758
 ..    ...       ...       ...
 0   406.0  0.971058  0.940336
 0   791.0  0.903990  0.792427
 0    39.0  0.906789  0.838711
 0   473.0  0.871545  0.802111
 0   925.0  0.637800  0.383055
 
 [942 rows x 3 columns]}

In [265]:
t1 = UB_CF(movie, 'userId', 402)

In [266]:
t1['ID_TAG']

402

In [267]:
t1['Similar_martrix']

Unnamed: 0,ID,Cos,Peas
0,420.0,0.894439,0.819205
0,819.0,0.940726,0.891081
0,568.0,0.928161,0.868096
0,186.0,0.793413,0.606798
0,702.0,0.813479,0.620758
...,...,...,...
0,406.0,0.971058,0.940336
0,791.0,0.903990,0.792427
0,39.0,0.906789,0.838711
0,473.0,0.871545,0.802111


In [268]:
t1['Similar_martrix'].sort_values('Peas', ascending=False)

Unnamed: 0,ID,Cos,Peas
0,533.0,0.991529,0.982998
0,566.0,0.990872,0.981267
0,916.0,0.988799,0.978482
0,457.0,0.988582,0.977402
0,624.0,0.985695,0.976144
...,...,...,...
0,604.0,0.579493,0.272680
0,822.0,0.629211,0.269629
0,366.0,0.467955,0.164612
0,814.0,0.483623,0.160718


# 相似矩陣測試

In [272]:
from tqdm import tqdm

In [273]:
idlist = [i for i in movie['userId'][:8]] # 前8個

In [274]:
idlist

[420.0, 819.0, 568.0, 186.0, 702.0, 246.0, 836.0, 303.0]

In [275]:
# 生成相似矩陣
df = pd.DataFrame() # 新建資料框架
for i in tqdm(idlist):
    s1 = UB_CF(data = movie, col= 'userId', uid = i) # 相似矩陣計算
    s2 = s1['Similar_martrix'].sort_values(by = 'Cos',ascending = False)[:5].copy() # 選出前5個最相似ID
    s2['tag'] = s1['ID_TAG']
    df = pd.concat([df, s2], axis = 0)   



  0%|                                                                                            | 0/8 [00:00<?, ?it/s][A[A

 12%|██████████▌                                                                         | 1/8 [00:05<00:35,  5.03s/it][A[A

 25%|█████████████████████                                                               | 2/8 [00:10<00:30,  5.13s/it][A[A

 38%|███████████████████████████████▌                                                    | 3/8 [00:15<00:25,  5.11s/it][A[A

 50%|██████████████████████████████████████████                                          | 4/8 [00:20<00:20,  5.12s/it][A[A

 62%|████████████████████████████████████████████████████▌                               | 5/8 [00:25<00:15,  5.14s/it][A[A

 75%|███████████████████████████████████████████████████████████████                     | 6/8 [00:31<00:10,  5.29s/it][A[A

 88%|█████████████████████████████████████████████████████████████████████████▌          | 7/8 [00:36<00:05, 

In [276]:
df.head()

Unnamed: 0,ID,Cos,Peas,tag
0,111.0,0.992023,0.98696,420.0
0,284.0,0.987611,0.979515,420.0
0,836.0,0.985975,0.977447,420.0
0,491.0,0.979387,0.971123,420.0
0,713.0,0.97881,0.967277,420.0


In [277]:
df2 = pd.merge(left=df, right=movie, how='inner', left_on='ID', right_on='userId').iloc[:,0:10].drop('userId', axis = 1).sort_values(by = 'tag',ascending = False)

In [278]:
df2['ref'] = df2['tag']

In [279]:
df2.head()

Unnamed: 0,ID,Cos,Peas,tag,ACTION,Adventure,Animation,Children,Comedy,ref
33,47.0,0.984723,0.973897,836.0,1.0,1.0,0.0,1.0,5.0,836.0
34,656.0,0.977122,0.976149,836.0,4.0,0.0,0.0,0.0,6.0,836.0
9,568.0,0.979508,0.963737,836.0,8.0,5.0,0.0,2.0,21.0,836.0
13,556.0,0.983793,0.97127,836.0,5.0,4.0,0.0,2.0,12.0,836.0
32,420.0,0.985975,0.977447,836.0,3.0,2.0,1.0,0.0,7.0,836.0


In [280]:
df2.columns

Index(['ID', 'Cos', 'Peas', 'tag', 'ACTION', 'Adventure', 'Animation',
       'Children', 'Comedy', 'ref'],
      dtype='object')

In [281]:
col = ['ref', 'ID', 'Cos', 'Peas', 'ACTION', 'Adventure', 'Animation',
       'Children', 'Comedy']

In [282]:
df3 = df2[col]

In [283]:
df3.head()

Unnamed: 0,ref,ID,Cos,Peas,ACTION,Adventure,Animation,Children,Comedy
33,836.0,47.0,0.984723,0.973897,1.0,1.0,0.0,1.0,5.0
34,836.0,656.0,0.977122,0.976149,4.0,0.0,0.0,0.0,6.0
9,836.0,568.0,0.979508,0.963737,8.0,5.0,0.0,2.0,21.0
13,836.0,556.0,0.983793,0.97127,5.0,4.0,0.0,2.0,12.0
32,836.0,420.0,0.985975,0.977447,3.0,2.0,1.0,0.0,7.0


# 商品籃OO

In [299]:
# 物件導向...

class Basket:
    
    def __init__(self, data):
        self.data = data.copy()                        # 避免更動原資料
        self.data.index = range(0, len(self.data))     # 資料索引歸0
        print('Basketbuilding')
        
    def matrixIndex(self, item, method = 'Cos'):       # item：欄位設定；相似度計算預算為餘弦(method)
        self.CPcols = []                               
        cols = item
        for i in cols:                                 # 商品籃與相似度計算
            self.data = pd.concat([self.data, pd.DataFrame(self.data['Cos'] * self.data[i], columns=[method + '_' + i])], axis = 1)
            self.CPcols.append(self.data.columns[-1])
        return self.data
    
    def refoutput(self, flag, uid):                    # 輸出推薦商品排序
        df = pd.DataFrame(self.data[self.data[flag] == uid][self.CPcols].apply(np.sum, axis = 0).sort_values(ascending = False), columns=['Item_Index'])
        df['ID'] = uid
        return df
    
            
        

In [300]:
# 原始資料檢視
df3.head()

Unnamed: 0,ref,ID,Cos,Peas,ACTION,Adventure,Animation,Children,Comedy
33,836.0,47.0,0.984723,0.973897,1.0,1.0,0.0,1.0,5.0
34,836.0,656.0,0.977122,0.976149,4.0,0.0,0.0,0.0,6.0
9,836.0,568.0,0.979508,0.963737,8.0,5.0,0.0,2.0,21.0
13,836.0,556.0,0.983793,0.97127,5.0,4.0,0.0,2.0,12.0
32,836.0,420.0,0.985975,0.977447,3.0,2.0,1.0,0.0,7.0


In [301]:
# 資料輸入
t1 = Basket(df3)

Basketbuilding


In [302]:
t1.data.head()

Unnamed: 0,ref,ID,Cos,Peas,ACTION,Adventure,Animation,Children,Comedy
0,836.0,47.0,0.984723,0.973897,1.0,1.0,0.0,1.0,5.0
1,836.0,656.0,0.977122,0.976149,4.0,0.0,0.0,0.0,6.0
2,836.0,568.0,0.979508,0.963737,8.0,5.0,0.0,2.0,21.0
3,836.0,556.0,0.983793,0.97127,5.0,4.0,0.0,2.0,12.0
4,836.0,420.0,0.985975,0.977447,3.0,2.0,1.0,0.0,7.0


In [303]:
# 相似度指數計算
t1.matrixIndex(item=['ACTION', 'Adventure', 'Animation', 'Children'], method='Cos').head()

Unnamed: 0,ref,ID,Cos,Peas,ACTION,Adventure,Animation,Children,Comedy,Cos_ACTION,Cos_Adventure,Cos_Animation,Cos_Children
0,836.0,47.0,0.984723,0.973897,1.0,1.0,0.0,1.0,5.0,0.984723,0.984723,0.0,0.984723
1,836.0,656.0,0.977122,0.976149,4.0,0.0,0.0,0.0,6.0,3.908489,0.0,0.0,0.0
2,836.0,568.0,0.979508,0.963737,8.0,5.0,0.0,2.0,21.0,7.836066,4.897541,0.0,1.959017
3,836.0,556.0,0.983793,0.97127,5.0,4.0,0.0,2.0,12.0,4.918964,3.935171,0.0,1.967586
4,836.0,420.0,0.985975,0.977447,3.0,2.0,1.0,0.0,7.0,2.957925,1.97195,0.985975,0.0


In [304]:
t1.CPcols

['Cos_ACTION', 'Cos_Adventure', 'Cos_Animation', 'Cos_Children']

In [305]:
t1.refoutput(flag='ref', uid=186)

Unnamed: 0,Item_Index,ID
Cos_ACTION,99.038499,186
Cos_Adventure,38.454868,186
Cos_Children,6.74352,186
Cos_Animation,3.84373,186


In [306]:
t1.data[t1.data['ref'] == 186]

Unnamed: 0,ref,ID,Cos,Peas,ACTION,Adventure,Animation,Children,Comedy,Cos_ACTION,Cos_Adventure,Cos_Animation,Cos_Children
35,186.0,164.0,0.956541,0.924335,27.0,12.0,0.0,0.0,7.0,25.826601,11.478489,0.0,0.0
36,186.0,362.0,0.956898,0.933777,10.0,0.0,0.0,0.0,5.0,9.568985,0.0,0.0,0.0
37,186.0,772.0,0.963031,0.934129,12.0,3.0,0.0,2.0,5.0,11.556377,2.889094,0.0,1.926063
38,186.0,396.0,0.973727,0.959713,25.0,10.0,1.0,2.0,6.0,24.343171,9.737268,0.973727,1.947454
39,186.0,54.0,0.956668,0.920741,29.0,15.0,3.0,3.0,11.0,27.743365,14.350016,2.870003,2.870003
