In [54]:
import numpy as np
import pandas as pd
import random

In [238]:
# Each line of the vectors file is a token followed by its embedding values,
# separated by runs of whitespace. header=None keeps the first line as data and
# labels the columns 0..N, so column 0 is the token.
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
df = pd.read_csv('vectors_test.txt', sep=r'\s+', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,300
0,新加坡,0.109441,0.295257,0.537623,-0.888996,0.795772,-0.523499,-0.258131,0.62576,0.434273,...,0.440788,-0.182576,-0.099394,-0.077858,0.422261,0.339778,-0.468129,-0.926128,0.063742,0.289209
1,玩,-0.033694,0.156811,0.517057,-0.451183,-0.020219,0.064383,-0.524204,1.045549,-0.14022,...,0.834588,0.408514,-0.236809,-0.077872,-0.813484,0.018177,0.150841,0.444122,0.52807,-0.088512
2,值得,-0.413511,0.732436,0.649847,-0.412557,0.025746,0.319165,-0.495262,0.888577,0.112376,...,0.361963,0.288443,0.524482,0.170921,-0.226716,-0.03915,-0.143796,-0.206068,0.84344,-0.398709
3,票,-0.733382,0.505235,-0.19522,0.024672,-0.275594,0.604765,-0.188473,0.353937,0.364895,...,-0.660591,0.075626,0.338854,0.366174,-0.60964,-0.023306,-0.221682,0.705237,0.289914,-0.202933
4,时间,0.680148,0.757347,-0.064182,-0.359637,0.362132,0.905252,-0.622642,0.0505,0.362557,...,0.287322,0.047993,0.465686,-0.212685,-0.525682,-0.19463,0.532594,-0.496485,0.653539,0.525584


In [224]:
# Drop the first column (the token) and keep the remaining columns as a plain
# NumPy array: one embedding vector per row.
matrix = df[df.columns[1:]].values
matrix.shape

(11051, 300)

In [225]:
def get_centroid(matrix):
    """Return the centroid of a set of vectors.

    Parameters
    ----------
    matrix : array-like, one vector per row.

    Returns
    -------
    The mean taken down axis 0, i.e. one averaged value per column.
    """
    centroid = np.mean(matrix, axis=0)
    return centroid

In [226]:
## Manhattan, Euclidean, cosine similarity, correlation

def get_manhattan(a, b):
    """Return the Manhattan (L1) distance between ``a`` and ``b``.

    This is the sum of the absolute element-wise differences.
    """
    difference = a - b
    return np.sum(np.abs(difference))

def get_euclidean(a, b):
    """Return the Euclidean (L2) distance between ``a`` and ``b``.

    Computed as the default norm of the difference ``a - b``.
    """
    difference = a - b
    return np.linalg.norm(difference)

def get_cos_sim(a, b):
    """Return the cosine similarity between ``a`` and ``b``.

    This is the dot product of the two vectors divided by the product of
    their norms. No special handling is done for a zero-length vector, so
    the division is then by zero.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

def get_corr(a, b):
    """Return the Pearson correlation coefficient between ``a`` and ``b``.

    Parameters
    ----------
    a, b : 1-D arrays of equal length.

    Returns
    -------
    A scalar in [-1, 1]; ``nan`` if either vector is constant (zero variance).

    Notes
    -----
    ``np.correlate(a, b)[0]`` on equal-length inputs is just the raw dot
    product ``sum(a * b)``: it is neither centred nor normalised, so it is not
    a correlation coefficient. ``np.corrcoef`` returns the 2x2 correlation
    matrix; the off-diagonal entry is the coefficient between ``a`` and ``b``.
    """
    return np.corrcoef(a, b)[0, 1]

In [227]:
## Randomly generate category centroids from top 100

# random.shuffle swaps elements with `x[i], x[j] = x[j], x[i]`. On a 2-D ndarray
# both sides are row *views*, so the swap overwrites one row with the other and
# the shuffled array ends up with duplicated rows and missing rows. NumPy's
# permutation reorders whole rows along axis 0 and returns a copy, so `matrix`
# is left untouched.
rng = np.random.default_rng(0)  # fixed seed so Restart & Run All reproduces the same centroids
top_100 = rng.permutation(matrix[:100, :])

# Ten equally sized groups of shuffled rows; each group's mean is one centroid.
clusters = np.split(top_100, 10)
centroids = np.asarray([get_centroid(cluster) for cluster in clusters])
centroids.shape

(10, 300)

In [228]:
# One independent zero-filled result table per metric:
# one row per vector in `matrix`, one column per centroid (10 of them).
manhattan_distances, euclidean_distances, cos_sim, correlations = (
    np.zeros((matrix.shape[0], 10)) for _ in range(4)
)

def get_distances(centroids, matrix):
    """Fill the module-level result tables with every vector-to-centroid score.

    For each row ``i`` of ``matrix`` and each of the first 10 rows ``j`` of
    ``centroids``, stores the Manhattan distance, Euclidean distance, cosine
    similarity and correlation in ``manhattan_distances[i, j]``,
    ``euclidean_distances[i, j]``, ``cos_sim[i, j]`` and ``correlations[i, j]``.

    The tables are written in place and nothing is returned. The centroid
    count of 10 is hard-coded here.
    """
    n_vectors = matrix.shape[0]
    for i in range(n_vectors):
        vector = matrix[i]
        for j in range(10):
            centroid = centroids[j]
            manhattan_distances[i, j] = get_manhattan(vector, centroid)
            euclidean_distances[i, j] = get_euclidean(vector, centroid)
            cos_sim[i, j] = get_cos_sim(vector, centroid)
            correlations[i, j] = get_corr(vector, centroid)

In [229]:
%%time 
# Populate the four module-level result tables in place; get_distances returns
# nothing, and %%time reports how long the full pass takes.
get_distances(centroids, matrix)

CPU times: user 3.24 s, sys: 46.1 ms, total: 3.28 s
Wall time: 3.27 s


In [236]:
# Sanity check: the table was allocated with one row per vector in `matrix`
# and 10 columns (one per centroid).
manhattan_distances.shape

(11051, 10)

In [282]:
def parse_to_df(matrix, keywords=None):
    """Wrap a score table in a DataFrame labelled with its keywords.

    Parameters
    ----------
    matrix : 2-D array-like of shape (n_rows, n_centroids)
        One row per keyword, one column per centroid.
    keywords : Series or sequence, optional
        Keyword for each row. Defaults to column 0 of the notebook-level
        ``df``, which is what this function previously always used. A Series
        is aligned on its index, as before.

    Returns
    -------
    DataFrame whose first column is ``'KEYWORD'`` followed by the score
    columns labelled ``1 .. n_centroids``.

    Notes
    -----
    The number of score columns is taken from ``matrix`` rather than being
    fixed at 10, so tables built from a different number of centroids work.
    """
    if keywords is None:
        keywords = df[0]  # notebook-level frame holding the tokens

    values = np.asarray(matrix)
    n_centroids = values.shape[1]

    # copy=True so the returned frame never aliases the caller's score array.
    final_df = pd.DataFrame(data=values, columns=range(1, n_centroids + 1), copy=True)
    final_df.insert(0, 'KEYWORD', keywords)
    return final_df

# Build the labelled Manhattan-distance table (KEYWORD plus one column per
# centroid) and preview the first rows.
manhattan_df = parse_to_df(manhattan_distances)
manhattan_df.head()

Unnamed: 0,KEYWORD,1,2,3,4,5,6,7,8,9,10
0,新加坡,80.90574,122.978211,99.729136,123.454778,108.605478,117.336187,120.849328,120.592643,110.127038,129.012089
1,玩,105.599825,85.699556,101.389573,99.920014,100.067771,116.277977,102.393803,110.58104,106.181145,112.638921
2,值得,107.2979,96.572027,89.835689,96.272844,93.822909,94.638665,84.53236,96.741704,96.549493,100.468155
3,票,104.379867,102.152127,106.780222,100.659322,98.516262,104.393744,102.358315,98.063149,100.409028,102.307207
4,时间,101.479416,88.860665,104.735182,96.515189,97.823204,99.062752,95.344035,98.063075,99.288081,96.91671


In [281]:
# Row-wise minimum over the ten centroid columns (positions 1-10; position 0
# is KEYWORD), stored as a new CLOSEST column on the existing frame.
# NOTE(review): this stores the smallest distance *value*, not which centroid
# is nearest. If the centroid label is what is wanted, idxmin(axis=1) over the
# same columns would give it -- confirm the intent.
manhattan_df['CLOSEST'] = manhattan_df.iloc[:,1:11].min(axis=1)
manhattan_df.head()

Unnamed: 0,KEYWORD,1,2,3,4,5,6,7,8,9,10,CLOSEST
0,新加坡,80.90574,122.978211,99.729136,123.454778,108.605478,117.336187,120.849328,120.592643,110.127038,129.012089,80.90574
1,玩,105.599825,85.699556,101.389573,99.920014,100.067771,116.277977,102.393803,110.58104,106.181145,112.638921,85.699556
2,值得,107.2979,96.572027,89.835689,96.272844,93.822909,94.638665,84.53236,96.741704,96.549493,100.468155,84.53236
3,票,104.379867,102.152127,106.780222,100.659322,98.516262,104.393744,102.358315,98.063149,100.409028,102.307207,98.063149
4,时间,101.479416,88.860665,104.735182,96.515189,97.823204,99.062752,95.344035,98.063075,99.288081,96.91671,88.860665
