In [1]:
### Check Working directory

import os
print("Working dir: ", os.getcwd())

# The data-loading cell reads 'raw_data/refined_data.xlsx' relative to the
# working directory, so the presence of that folder marks the project root.
# Step up only when the root has not been reached yet: an unconditional
# os.chdir(os.pardir) climbs one more level every time this cell is re-run
# and silently breaks every relative path used afterwards.
if not os.path.isdir('raw_data'):
    os.chdir(os.pardir)         # change working dir to the parent dir
print("Changed dir: ", os.getcwd())

Working dir:  /Users/tori_mini/Documents/scripts/rec_system/py_src
Changed dir:  /Users/tori_mini/Documents/scripts/rec_system


In [2]:
### import raw data

import numpy as np
import pandas as pd

# Load the spreadsheet; the path is resolved against the current working directory.
df = pd.read_excel('raw_data/refined_data.xlsx')

# glimpse data: banner line first, then the frame itself on the following line
print(
    " ==================== Data Distribution Summary ====================",
    df,
    sep="\n",
)

                                   ID   EXT   NEU   AGR   CON   OPN
0    00419a4c96b32cd63b2c7196da761274  1.90  4.15  3.15  4.65  3.45
1    02c37028a782cfda660c7243e45244bb  2.05  4.00  2.20  2.20  3.75
2    03133a828cd0cf52e3752813ce5d818f  3.84  3.55  2.85  4.32  3.45
3    03e6c4eca4269c183fa0e1780f73faba  3.20  3.60  3.85  4.35  4.80
4    06b055f8e2bca96496514891057913c3  2.85  2.35  3.35  4.70  3.35
..                                ...   ...   ...   ...   ...   ...
245  f83aa7290cf3ae8ed75d6aee5ebbb559  2.65  1.70  3.95  3.00  3.15
246  fbe5aa478508d1dc931427ade5d9e1b4  3.25  2.75  2.25  4.00  4.00
247  fc1c9fb6c64740edcbbf8cfe9dde8b02  3.55  2.25  3.90  2.80  4.00
248  fe22087986fdcc65939c793fe0ec90a9  3.45  2.55  3.15  3.50  2.90
249  fffafe151f07a30a0ede2038a897b680  3.05  1.80  3.35  2.95  4.35

[250 rows x 6 columns]


In [3]:
### Cosine similarity

from sklearn.metrics.pairwise import cosine_similarity

def calc_cosine_similarity(df, output_path='result/cos_sim.csv'):
    """Compute the cosine similarity of every pair of rows and save it as CSV.

    The first column of ``df`` is skipped; all remaining columns form the
    vector of a row.  One record is produced for each pair of distinct rows
    (i < j), with both row numbers counted from 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.  Columns from position 1 onward are passed to
        ``cosine_similarity``.
    output_path : str, optional
        Destination CSV file (default ``'result/cos_sim.csv'``).  A missing
        parent directory is created instead of letting ``to_csv`` fail.

    Returns
    -------
    None
        The summary statistics and the table are printed, and the table is
        written to ``output_path``.
    """

    # DataFrame to NumPy array
    data = df.iloc[:, 1:].values        # select factor columns

    # cosine similarity between all pairs of rows (square matrix)
    cosine_similarity_matrix = cosine_similarity(data)

    # Index pairs of the strict upper triangle (i < j).  np.triu_indices
    # returns them in row-major order, i.e. the same order a nested
    # "for i / for j in range(i + 1, n)" loop visits them, without the
    # Python-level quadratic loop.
    row_idx, col_idx = np.triu_indices(len(cosine_similarity_matrix), k=1)

    # save data as pandas DataFrame (row numbers are 1-based)
    similarity_df = pd.DataFrame({
        "Row1": row_idx + 1,
        "Row2": col_idx + 1,
        "Cosine Similarity": cosine_similarity_matrix[row_idx, col_idx],
    })

    # glimpse data
    print(" ==================== Data Distribution Summary ====================")
    print(similarity_df.describe())
    print("\n")
    print(similarity_df)

    # save data; make sure the target directory exists first
    output_dir = os.path.dirname(output_path)
    if output_dir:      # '' means "current directory": nothing to create
        os.makedirs(output_dir, exist_ok=True)
    similarity_df.to_csv(output_path)
    print('"Cosine Similarity" saved as a CSV file.')

# Run the pairwise cosine-similarity computation on the frame read from
# 'raw_data/refined_data.xlsx'; the function prints its results and writes the CSV.
calc_cosine_similarity(df)

               Row1          Row2  Cosine Similarity
count  31125.000000  31125.000000       31125.000000
mean      83.666667    167.333333           0.964076
std       58.807596     58.807596           0.027477
min        1.000000      2.000000           0.785344
25%       34.000000    126.000000           0.951067
50%       74.000000    177.000000           0.971214
75%      125.000000    217.000000           0.984393
max      249.000000    250.000000           0.999927


       Row1  Row2  Cosine Similarity
0         1     2           0.951905
1         1     3           0.966918
2         1     4           0.975332
3         1     5           0.966944
4         1     6           0.958614
...     ...   ...                ...
31120   247   249           0.979177
31121   247   250           0.968925
31122   248   249           0.980273
31123   248   250           0.992880
31124   249   250           0.968722

[31125 rows x 3 columns]
"Cosine Similarity" saved as a CSV file.


In [4]:
### Euclidean distance

from sklearn.metrics.pairwise import euclidean_distances

def calc_euclidean_distances(df, col_range, output_path='result/euclidean_dist.csv'):
    """Compute per-column pairwise Euclidean distances between rows and save them.

    Each selected column is handled on its own: it is reshaped into a
    single-feature matrix, so the distance between two rows is the absolute
    difference of their values in that column.  One record is produced per
    column and per pair of distinct rows (i < j), with row numbers counted
    from 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.
    col_range : iterable of int
        Positional indices of the columns to process, in output order.
    output_path : str, optional
        Destination CSV file (default ``'result/euclidean_dist.csv'``).  A
        missing parent directory is created instead of letting ``to_csv`` fail.

    Returns
    -------
    None
        The summary statistics and the table are printed, and the table is
        written to ``output_path``.
    """

    result_columns = ["Row1", "Row2", "Factor", "Euclidean_Distance"]
    col_name = list(df.columns)

    # one DataFrame of pair records per processed column
    per_factor_frames = []

    for col_idx in col_range:
        # DataFrame column to a (n_rows, 1) NumPy array
        data_column = df.iloc[:, col_idx].values.reshape(-1, 1)

        # Euclidean distance between all pairs of rows for this column
        euclidean_dist_matrix = euclidean_distances(data_column)

        # Index pairs of the strict upper triangle (i < j) in row-major order,
        # i.e. the order of a nested "for i / for j in range(i + 1, n)" loop,
        # without the Python-level quadratic loop.
        row_idx, pair_idx = np.triu_indices(len(euclidean_dist_matrix), k=1)

        per_factor_frames.append(pd.DataFrame({
            "Row1": row_idx + 1,                    # 1-based row numbers
            "Row2": pair_idx + 1,
            "Factor": col_name[col_idx],            # scalar, broadcast to all pairs
            "Euclidean_Distance": euclidean_dist_matrix[row_idx, pair_idx],
        }))

    # collect the Euclidean distance records (i != j) of all columns
    if per_factor_frames:
        euclidean_df = pd.concat(per_factor_frames, ignore_index=True)
    else:
        # empty col_range: pd.concat rejects an empty list, keep an empty table
        euclidean_df = pd.DataFrame([], columns=result_columns)

    # glimpse data
    print(" ==================== Data Distribution Summary ====================")
    print(euclidean_df.describe())
    print("\n")
    print(euclidean_df)

    # save data; make sure the target directory exists first
    output_dir = os.path.dirname(output_path)
    if output_dir:      # '' means "current directory": nothing to create
        os.makedirs(output_dir, exist_ok=True)
    euclidean_df.to_csv(output_path)
    print('"Euclidean Distance" saved as a CSV file.')

# Positional indices of every column after the first one (column 0 is skipped).
col_range = range(1, df.shape[1])
calc_euclidean_distances(df, col_range)

                Row1           Row2  Euclidean_Distance
count  155625.000000  155625.000000       155625.000000
mean       83.666667     167.333333            0.822543
std        58.806840      58.806840            0.626909
min         1.000000       2.000000            0.000000
25%        34.000000     126.000000            0.300000
50%        74.000000     177.000000            0.700000
75%       125.000000     217.000000            1.200000
max       249.000000     250.000000            3.670000


        Row1  Row2 Factor  Euclidean_Distance
0          1     2    EXT                0.15
1          1     3    EXT                1.94
2          1     4    EXT                1.30
3          1     5    EXT                0.95
4          1     6    EXT                1.10
...      ...   ...    ...                 ...
155620   247   249    OPN                1.10
155621   247   250    OPN                0.35
155622   248   249    OPN                1.10
155623   248   250    OPN         