In [1]:
import numpy as np
import pandas as pd
import random

In [19]:
path_to_text = 'vectors.txt'
path_to_excel = '20200304_Aspects_Annotation.xlsx'
train_size = 400
annotated_size = 500
count_limit = 10

def get_final_dfs(
    path_to_text,
    path_to_excel,
    train_size,
    annotated_size,
    count_limit
):
    vectors_df, keywords_df = get_parsed_dfs(path_to_text, path_to_excel, count_limit=5)
    aspect_categories = get_aspect_categories(keywords_df)
    aspect_categories_numbered = dict(zip(range(1, 12), aspect_categories))
    no_of_categories = len(aspect_categories)
    matrix = get_matrix(vectors_df)
    centroids = get_centroids(keywords_df, matrix, aspect_categories, train_size)
    print("\nCalculating distances and similarities...")
    manhattan_distances, euclidean_distances, cos_sim, corr = get_distances(
        centroids, 
        matrix, 
        no_of_categories
    )
    manhattan_df = parse_to_df(manhattan_distances, vectors_df, no_of_categories)
    euclidean_df = parse_to_df(euclidean_distances, vectors_df, no_of_categories)
    cos_sim_df = parse_to_df(cos_sim, vectors_df, no_of_categories)
    corr_df = parse_to_df(corr, vectors_df, no_of_categories)
    
    def get_closest_index(row):
        dictionary = dict(zip(row, list(range(1, no_of_categories+1))))
        return dictionary[min(row)]

    def get_furthest_index(row):
        dictionary = dict(zip(row, list(range(1, no_of_categories+1))))
        return dictionary[max(row)]
    
    for df in [manhattan_df, euclidean_df]:
        df['CLOSEST'] = df.iloc[:, 1:no_of_categories+1].apply(
            get_closest_index, 
            axis=1
        )
        df['PREDICTION'] = [aspect_categories_numbered[x] for x in df['CLOSEST']]

    for df in [cos_sim_df, corr_df]:
        df['CLOSEST'] = df.iloc[:, 1:no_of_categories+1].apply(
            get_furthest_index, 
            axis=1
        )
        df['PREDICTION'] = [aspect_categories_numbered[x] for x in df['CLOSEST']]
    
    test_indices = get_test_indices(keywords_df, annotated_size, aspect_categories)
    for df in [manhattan_df, euclidean_df, cos_sim_df, corr_df]:
        get_accuracy(df, keywords_df, test_indices)
        
    return manhattan_df, euclidean_df, cos_sim_df, corr_df

In [20]:
manhattan_df, euclidean_df, cos_sim_df, corr_df = get_final_dfs(
    path_to_text,
    path_to_excel,
    train_size,
    annotated_size,
    count_limit
)

Activities: 14
Environment and Ambience: 24
Experience: 24
F&B: 19
Facilities: 6
Family/Children family/ Friends: 13
Price: 13
Service: 3
Shopping: 7
Sightseeing/ Place: 103
Transportation and Accessibility: 51

Centroids found.

Calculating distances and similarities...
Accuracy: 0.5483870967741935
Accuracy: 0.4838709677419355
Accuracy: 0.5161290322580645
Accuracy: 0.2903225806451613


In [22]:
euclidean_df[700:720]

Unnamed: 0,KEYWORD,1,2,3,4,5,6,7,8,9,10,11,CLOSEST,PREDICTION
700,different plants,5.260005,4.563278,3.967289,5.806072,5.418431,5.062933,4.820222,6.403576,6.239781,4.258522,4.783227,3,Experience
701,shrubs,5.169435,4.578803,5.032909,5.548281,5.531732,5.359091,5.237475,6.3739,6.126577,4.757139,5.01272,2,Environment and Ambience
702,sounds,5.130052,5.23977,5.323198,5.927317,6.345733,5.74291,5.778754,6.506022,6.300673,5.317962,5.479929,1,Activities
703,stall,5.79686,5.95744,5.741142,5.560033,6.377128,6.273398,5.692416,6.904512,6.089578,5.869808,5.663526,4,F&B
704,buddha tooth relic temple,5.973711,6.215932,5.531532,6.082003,6.25902,6.137618,5.842109,6.846216,6.094975,5.616071,5.652267,3,Experience
705,smells,5.548792,5.586248,5.607122,6.085166,6.460952,6.002264,5.974671,6.508492,6.46695,5.576082,5.781392,1,Activities
706,run,5.255468,4.480879,4.991477,5.485503,5.760556,5.283703,5.465438,6.206218,5.876988,4.842252,4.892129,2,Environment and Ambience
707,cruise,5.012101,4.965897,4.854406,5.060161,5.542567,5.403687,5.112926,6.360828,5.760621,4.775516,4.761683,11,Transportation and Accessibility
708,gifts,5.070776,5.069529,5.070037,5.051044,5.731589,5.030979,4.94601,5.960251,4.4169,4.954107,4.891428,9,Shopping
709,pod,5.250279,5.143562,5.006926,5.508982,5.228429,5.344536,5.211844,6.035297,6.220108,5.022869,4.834352,11,Transportation and Accessibility


In [3]:
def get_vectors_df(path_to_text): 
    vectors_df = pd.read_csv(path_to_text, delim_whitespace=True, header=None)
    parsed_col = []
    for keyword in list(vectors_df[0]):
        keyword = str(keyword)
        keyword = keyword.replace('_', ' ')
        parsed_col.append(keyword)
    vectors_df[0] = parsed_col
    vectors_df.columns = ['TEXT'] + list(range(1, vectors_df.shape[1]))
    return vectors_df

In [4]:
def get_keywords_df(path_to_excel, count_limit): 
    keywords_df = pd.read_excel(path_to_excel)
    keywords_df = keywords_df[keywords_df['count'] >= count_limit]
    parsed_col_2 = []
    for keyword in list(keywords_df['TEXT']):
        keyword = str(keyword)
        keyword = keyword.replace('â€™', "'")
        keyword = keyword.replace('ã©', 'é')
        parsed_col_2.append(keyword)
    keywords_df['TEXT'] = parsed_col_2
    keywords_df.drop_duplicates('TEXT', inplace=True) # There are duplicates after the .replace() functions
    return keywords_df

In [5]:
def get_parsed_dfs(path_to_text, path_to_excel, count_limit=5):
    vectors_df = get_vectors_df(path_to_text)
    keywords_df = get_keywords_df(path_to_excel, count_limit=count_limit)
    joint_df = vectors_df.merge(keywords_df)
    vectors_df = joint_df.iloc[:, :vectors_df.shape[1]]
    keywords_df = joint_df.iloc[:, vectors_df.shape[1]:]
    return vectors_df, keywords_df

In [6]:
def get_matrix(vectors_df):
    return vectors_df.iloc[:, 1:].values

In [7]:
def get_centroid(matrix):
    return np.mean(matrix, axis=0)

In [8]:
def get_manhattan(a, b):
    return np.abs(a - b).sum()

def get_euclidean(a, b):
    return np.linalg.norm(a-b)

def get_cos_sim(a, b):
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

def get_corr(a, b):
    return np.correlate(a, b)[0]

In [9]:
def get_aspect_categories(keywords_df):
    aspect_categories = list(set(keywords_df.ASPECT_CATEGORY_NAME))
    aspect_categories = [x for x in aspect_categories if x is not np.nan]
    aspect_categories.sort()
    return aspect_categories

In [10]:
def get_centroids(keywords_df, matrix, aspect_categories, train_size=400):
    df_train = keywords_df.iloc[:train_size, :]
    no_of_categories = len(aspect_categories)
    centroids = np.zeros((no_of_categories, 300))
    for i in range(no_of_categories):
        indices = df_train[df_train.ASPECT_CATEGORY_NAME == aspect_categories[i]].index
        print('{}: {}'.format(aspect_categories[i], len(indices)))
        centroids[i] = get_centroid(matrix[indices])
    print('\nCentroids found.')
    return centroids

In [11]:
def get_distances(centroids, matrix, no_of_categories):  # Returns tuple of 4 matrices
    manhattan_distances = np.zeros((matrix.shape[0], no_of_categories))
    euclidean_distances = np.zeros((matrix.shape[0], no_of_categories))
    cos_sim = np.zeros((matrix.shape[0], no_of_categories))
    corr = np.zeros((matrix.shape[0], no_of_categories))

    for i in range(matrix.shape[0]):
        for j in range(no_of_categories):
            manhattan_distances[i, j] = get_manhattan(matrix[i], centroids[j])

    for i in range(matrix.shape[0]):
        for j in range(no_of_categories):
            euclidean_distances[i, j] = get_euclidean(matrix[i], centroids[j])

    for i in range(matrix.shape[0]):
        for j in range(no_of_categories):
            cos_sim[i, j] = get_cos_sim(matrix[i], centroids[j])

    for i in range(matrix.shape[0]):
        for j in range(no_of_categories):
            corr[i, j] = get_corr(matrix[i], centroids[j])
                
    return manhattan_distances, euclidean_distances, cos_sim, corr

In [12]:
def parse_to_df(matrix, vectors_df, no_of_categories):
    final_df = pd.DataFrame(data=matrix)
    final_df['KEYWORD'] = vectors_df.TEXT
    cols = ['KEYWORD'] + [i for i in range(no_of_categories)]
    colnames = ['KEYWORD'] + [i for i in range(1, no_of_categories+1)]
    final_df = final_df.loc[:, cols]
    final_df.columns = colnames
    return final_df

In [13]:
def get_test_indices(df, annotated_size, aspect_categories):
    temp_df = df.iloc[train_size:annotated_size, :]
    return temp_df[temp_df.ASPECT_CATEGORY_NAME.isin(aspect_categories)].index

In [14]:
def get_accuracy(model, keywords_df, test_indices):
    sum = (model.PREDICTION[test_indices] == keywords_df.ASPECT_CATEGORY_NAME[test_indices]).sum()
    print("Accuracy: {}".format(sum/len(test_indices)))