In [1]:
import numpy as np
import pandas as pd
import random

In [19]:
path_to_text = 'vectors.txt'
path_to_excel = '20200304_Aspects_Annotation.xlsx'
train_size = 400
annotated_size = 500
count_limit = 5

def get_final_dfs(
    path_to_text,
    path_to_excel,
    train_size,
    annotated_size,
    count_limit
):
    vectors_df, keywords_df = get_parsed_dfs(path_to_text, path_to_excel, count_limit=5)
    aspect_categories = get_aspect_categories(keywords_df)
    aspect_categories_numbered = dict(zip(range(1, 12), aspect_categories))
    no_of_categories = len(aspect_categories)
    matrix = get_matrix(vectors_df)
    centroids = get_centroids(keywords_df, matrix, aspect_categories, train_size)
    print("\nCalculating distances and similarities...")
    manhattan_distances, euclidean_distances, cos_sim, corr = get_distances(
        centroids, 
        matrix, 
        no_of_categories
    )
    manhattan_df = parse_to_df(manhattan_distances, vectors_df, no_of_categories)
    euclidean_df = parse_to_df(euclidean_distances, vectors_df, no_of_categories)
    cos_sim_df = parse_to_df(cos_sim, vectors_df, no_of_categories)
    corr_df = parse_to_df(corr, vectors_df, no_of_categories)
    
    def get_closest_index(row):
        dictionary = dict(zip(row, list(range(1, no_of_categories+1))))
        return dictionary[min(row)]

    def get_furthest_index(row):
        dictionary = dict(zip(row, list(range(1, no_of_categories+1))))
        return dictionary[max(row)]
    
    for df in [manhattan_df, euclidean_df]:
        df['CLOSEST'] = df.iloc[:, 1:no_of_categories+1].apply(
            get_closest_index, 
            axis=1
        )
        df['PREDICTION'] = [aspect_categories_numbered[x] for x in df['CLOSEST']]

    for df in [cos_sim_df, corr_df]:
        df['CLOSEST'] = df.iloc[:, 1:no_of_categories+1].apply(
            get_furthest_index, 
            axis=1
        )
        df['PREDICTION'] = [aspect_categories_numbered[x] for x in df['CLOSEST']]
    
    test_indices = get_test_indices(keywords_df, annotated_size, aspect_categories)
    for df in [manhattan_df, euclidean_df, cos_sim_df, corr_df]:
        get_accuracy(df, keywords_df, test_indices)
        
    return manhattan_df, euclidean_df, cos_sim_df, corr_df

In [20]:
manhattan_df, euclidean_df, cos_sim_df, corr_df = get_final_dfs(
    path_to_text,
    path_to_excel,
    train_size,
    annotated_size,
    count_limit
)

Activities: 14
Environment and Ambience: 24
Experience: 24
F&B: 19
Facilities: 6
Family/Children family/ Friends: 12
Price: 13
Service: 3
Shopping: 7
Sightseeing/ Place: 103
Transportation and Accessibility: 51

Centroids found.

Calculating distances and similarities...
Accuracy: 0.4426229508196721
Accuracy: 0.47540983606557374
Accuracy: 0.5409836065573771
Accuracy: 0.2786885245901639


In [23]:
euclidean_df

Unnamed: 0,KEYWORD,1,2,3,4,5,6,7,8,9,10,11,CLOSEST,PREDICTION
0,singapore,7.279409,7.335682,7.593765,7.493676,8.030168,7.413504,7.431632,7.557792,7.907986,7.144337,7.446322,10,Sightseeing/ Place
1,place,8.149188,8.073834,8.118068,8.091221,8.491088,7.874137,8.203072,8.502677,8.229489,7.779179,8.064588,10,Sightseeing/ Place
2,gardens,8.519800,8.005451,8.154163,8.523501,8.496930,8.270097,8.231482,8.863200,9.054886,7.878904,8.172181,10,Sightseeing/ Place
3,time,7.936551,8.164306,8.360109,8.445423,8.557802,8.140479,8.315590,7.864341,8.762398,8.029203,8.152548,8,Service
4,night,8.439204,8.188341,8.250031,8.290686,8.338998,8.388787,8.365154,9.025342,8.774355,7.962126,8.152220,10,Sightseeing/ Place
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20627,zoo prices,3.428219,3.378686,2.912206,4.251268,4.055024,4.227530,3.507467,4.583581,4.414550,3.283787,3.341650,3,Experience
20628,zoo site,3.551383,3.300760,2.824811,4.258702,3.893915,4.221771,3.476362,4.696605,4.489624,3.183822,3.225293,3,Experience
20629,zoo tour,3.359664,3.074218,2.693026,3.987548,3.650784,3.992565,3.244321,4.512768,4.257007,3.033220,2.913782,3,Experience
20630,zoo yesterday,3.374322,3.101723,2.793212,3.970116,3.703749,4.086302,3.186965,4.429220,4.235138,3.088817,3.096675,3,Experience


In [4]:
def get_vectors_df(path_to_text): 
    vectors_df = pd.read_csv(path_to_text, delim_whitespace=True, header=None)
    parsed_col = []
    for keyword in list(vectors_df[0]):
        keyword = str(keyword)
        keyword = keyword.replace('_', ' ')
        parsed_col.append(keyword)
    vectors_df[0] = parsed_col
    vectors_df.columns = ['TEXT'] + list(range(1, vectors_df.shape[1]))
    return vectors_df

In [5]:
def get_keywords_df(path_to_excel, count_limit): 
    keywords_df = pd.read_excel(path_to_excel)
    keywords_df = keywords_df[keywords_df['count'] >= count_limit]
    parsed_col_2 = []
    for keyword in list(keywords_df['TEXT']):
        keyword = str(keyword)
        keyword = keyword.replace('â€™', "'")
        keyword = keyword.replace('ã©', 'é')
        parsed_col_2.append(keyword)
    keywords_df['TEXT'] = parsed_col_2
    keywords_df.drop_duplicates('TEXT', inplace=True) # There are duplicates after the .replace() functions
    return keywords_df

In [6]:
def get_parsed_dfs(path_to_text, path_to_excel, count_limit=5):
    vectors_df = get_vectors_df(path_to_text)
    keywords_df = get_keywords_df(path_to_excel, count_limit=count_limit)
    joint_df = vectors_df.merge(keywords_df)
    vectors_df = joint_df.iloc[:, :vectors_df.shape[1]]
    keywords_df = joint_df.iloc[:, vectors_df.shape[1]:]
    return vectors_df, keywords_df

In [7]:
def get_matrix(vectors_df):
    return vectors_df.iloc[:, 1:].values

In [8]:
def get_centroid(matrix):
    return np.mean(matrix, axis=0)

In [9]:
def get_manhattan(a, b):
    return np.abs(a - b).sum()

def get_euclidean(a, b):
    return np.linalg.norm(a-b)

def get_cos_sim(a, b):
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

def get_corr(a, b):
    return np.correlate(a, b)[0]

In [10]:
def get_aspect_categories(keywords_df):
    aspect_categories = list(set(keywords_df.ASPECT_CATEGORY_NAME))
    aspect_categories = [x for x in aspect_categories if x is not np.nan]
    aspect_categories.sort()
    return aspect_categories

In [11]:
def get_centroids(keywords_df, matrix, aspect_categories, train_size=400):
    df_train = keywords_df.iloc[:train_size, :]
    no_of_categories = len(aspect_categories)
    centroids = np.zeros((no_of_categories, 300))
    for i in range(no_of_categories):
        indices = df_train[df_train.ASPECT_CATEGORY_NAME == aspect_categories[i]].index
        print('{}: {}'.format(aspect_categories[i], len(indices)))
        centroids[i] = get_centroid(matrix[indices])
    print('\nCentroids found.')
    return centroids

In [12]:
def get_distances(centroids, matrix, no_of_categories):  # Returns tuple of 4 matrices
    manhattan_distances = np.zeros((matrix.shape[0], no_of_categories))
    euclidean_distances = np.zeros((matrix.shape[0], no_of_categories))
    cos_sim = np.zeros((matrix.shape[0], no_of_categories))
    corr = np.zeros((matrix.shape[0], no_of_categories))

    for i in range(matrix.shape[0]):
        for j in range(no_of_categories):
            manhattan_distances[i, j] = get_manhattan(matrix[i], centroids[j])

    for i in range(matrix.shape[0]):
        for j in range(no_of_categories):
            euclidean_distances[i, j] = get_euclidean(matrix[i], centroids[j])

    for i in range(matrix.shape[0]):
        for j in range(no_of_categories):
            cos_sim[i, j] = get_cos_sim(matrix[i], centroids[j])

    for i in range(matrix.shape[0]):
        for j in range(no_of_categories):
            corr[i, j] = get_corr(matrix[i], centroids[j])
                
    return manhattan_distances, euclidean_distances, cos_sim, corr

In [13]:
def parse_to_df(matrix, vectors_df, no_of_categories):
    final_df = pd.DataFrame(data=matrix)
    final_df['KEYWORD'] = vectors_df.TEXT
    cols = ['KEYWORD'] + [i for i in range(no_of_categories)]
    colnames = ['KEYWORD'] + [i for i in range(1, no_of_categories+1)]
    final_df = final_df.loc[:, cols]
    final_df.columns = colnames
    return final_df

In [14]:
def get_test_indices(df, annotated_size, aspect_categories):
    temp_df = df.iloc[train_size:annotated_size, :]
    return temp_df[temp_df.ASPECT_CATEGORY_NAME.isin(aspect_categories)].index

In [15]:
def get_accuracy(model, keywords_df, test_indices):
    sum = (model.PREDICTION[test_indices] == keywords_df.ASPECT_CATEGORY_NAME[test_indices]).sum()
    print("Accuracy: {}".format(sum/len(test_indices)))