In [26]:
import numpy as np
import pandas as pd
import random

In [27]:
df = pd.read_csv('vectors_test.txt', delim_whitespace=True, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,新加坡,0.109441,0.295257,0.537623,-0.888996,0.795772,-0.523499,-0.258131,0.62576,0.434273,...,0.440788,-0.182576,-0.099394,-0.077858,0.422261,0.339778,-0.468129,-0.926128,0.063742,0.289209
1,玩,-0.033694,0.156811,0.517057,-0.451183,-0.020219,0.064383,-0.524204,1.045549,-0.14022,...,0.834588,0.408514,-0.236809,-0.077872,-0.813484,0.018177,0.150841,0.444122,0.52807,-0.088512
2,值得,-0.413511,0.732436,0.649847,-0.412557,0.025746,0.319165,-0.495262,0.888577,0.112376,...,0.361963,0.288443,0.524482,0.170921,-0.226716,-0.03915,-0.143796,-0.206068,0.84344,-0.398709
3,票,-0.733382,0.505235,-0.19522,0.024672,-0.275594,0.604765,-0.188473,0.353937,0.364895,...,-0.660591,0.075626,0.338854,0.366174,-0.60964,-0.023306,-0.221682,0.705237,0.289914,-0.202933
4,时间,0.680148,0.757347,-0.064182,-0.359637,0.362132,0.905252,-0.622642,0.0505,0.362557,...,0.287322,0.047993,0.465686,-0.212685,-0.525682,-0.19463,0.532594,-0.496485,0.653539,0.525584


In [28]:
matrix = df.iloc[:, 1:].values
matrix.shape

(11051, 300)

In [29]:
no_of_categories = 11

In [30]:
def get_centroid(matrix):
    return np.mean(matrix, axis=0)

In [31]:
## mahatten, euclidean, cosine similarity, correlation

def get_manhattan(a, b):
    return np.abs(a - b).sum()

def get_euclidean(a, b):
    return np.linalg.norm(a-b)

def get_cos_sim(a, b):
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

def get_corr(a, b):
    return np.correlate(a, b)[0]

In [32]:
## Randomly generate category centroids from top 100

top_n = np.copy(matrix[:no_of_categories*10, :])
random.shuffle(top_n)
clusters = np.split(top_n, no_of_categories)

centroids = []
for cluster in clusters:
    centroids.append(get_centroid(cluster))
    
centroids = np.asarray(centroids)
centroids.shape

(11, 300)

In [33]:
manhattan_distances = np.zeros((matrix.shape[0], no_of_categories))
euclidean_distances = np.zeros((matrix.shape[0], no_of_categories))
cos_sim = np.zeros((matrix.shape[0], no_of_categories))
correlations = np.zeros((matrix.shape[0], no_of_categories))

def get_distances(centroids, matrix):
    for i in range(matrix.shape[0]):
        for j in range(no_of_categories):
            manhattan_distances[i, j] = get_manhattan(matrix[i], centroids[j])
            
    for i in range(matrix.shape[0]):
        for j in range(no_of_categories):
            euclidean_distances[i, j] = get_euclidean(matrix[i], centroids[j])
            
    for i in range(matrix.shape[0]):
        for j in range(no_of_categories):
            cos_sim[i, j] = get_cos_sim(matrix[i], centroids[j])
            
    for i in range(matrix.shape[0]):
        for j in range(no_of_categories):
            correlations[i, j] = get_corr(matrix[i], centroids[j])

In [34]:
%%time 
get_distances(centroids, matrix)

CPU times: user 3.72 s, sys: 19.4 ms, total: 3.74 s
Wall time: 3.7 s


In [35]:
manhattan_distances.shape

(11051, 11)

In [36]:
def parse_to_df(matrix):
    final_df = pd.DataFrame(data=matrix)
    final_df['KEYWORD'] = df[0]
    cols = ['KEYWORD'] + [i for i in range(no_of_categories)]
    colnames = ['KEYWORD'] + [i for i in range(1, no_of_categories+1)]
    final_df = final_df.loc[:, cols]
    final_df.columns = colnames
    return final_df

manhattan_df = parse_to_df(manhattan_distances)
manhattan_df.head()

Unnamed: 0,KEYWORD,1,2,3,4,5,6,7,8,9,10,11
0,新加坡,92.317975,107.066505,127.263892,93.914772,120.37915,124.389095,121.40012,122.13716,123.086927,120.04129,123.501178
1,玩,89.030898,109.37138,106.632261,114.249332,106.228058,103.833656,109.387378,109.359585,111.49496,113.544791,96.025907
2,值得,84.545496,95.229495,99.719688,98.763281,85.958967,86.044239,95.039442,89.371342,97.230835,100.664753,96.042192
3,票,109.737668,101.596671,103.347272,109.14018,99.589848,95.660225,95.856264,97.404024,93.165697,108.537673,98.413517
4,时间,95.930932,99.897948,80.102679,94.631386,89.004047,98.097256,87.519456,96.233759,96.019486,103.597601,87.033281


In [37]:
def get_closest_index(row):
    dictionary = dict(zip(row, list(range(1, no_of_categories+1))))
    return dictionary[min(row)]

manhattan_df['CLOSEST'] = manhattan_df.iloc[:,1:no_of_categories+1].apply(
    get_closest_index, 
    axis=1
)
manhattan_df.head()

Unnamed: 0,KEYWORD,1,2,3,4,5,6,7,8,9,10,11,CLOSEST
0,新加坡,92.317975,107.066505,127.263892,93.914772,120.37915,124.389095,121.40012,122.13716,123.086927,120.04129,123.501178,1
1,玩,89.030898,109.37138,106.632261,114.249332,106.228058,103.833656,109.387378,109.359585,111.49496,113.544791,96.025907,1
2,值得,84.545496,95.229495,99.719688,98.763281,85.958967,86.044239,95.039442,89.371342,97.230835,100.664753,96.042192,1
3,票,109.737668,101.596671,103.347272,109.14018,99.589848,95.660225,95.856264,97.404024,93.165697,108.537673,98.413517,9
4,时间,95.930932,99.897948,80.102679,94.631386,89.004047,98.097256,87.519456,96.233759,96.019486,103.597601,87.033281,3
