In [5]:

import argparse
import ast
import csv
import itertools
import json
import os
from datetime import datetime
from functools import reduce
from time import sleep

import editdistance
import pandas as pd
from requests import ConnectionError
from tika import parser

In [2]:

import math

def stringify(attribute_value):
    '''
    Turn a metadata value into a single stripped string.

    @param attribute_value a scalar value, or a list/tuple of values
    @return for a list/tuple: the items joined with ", " and the result
            stripped of leading/trailing whitespace; otherwise the stripped
            string form of the value

    Items and scalars that are not already strings are converted with str()
    instead of raising, so numeric metadata values can be handled as well.
    '''
    if isinstance(attribute_value, (list, tuple)):
        return ", ".join(str(item) for item in attribute_value).strip()
    return str(attribute_value).strip()


class Vector:
    '''
    An instance of this class represents a vector in n-dimensional space.

    Every dimension is a named metadata feature: self.features maps the
    feature name to its numeric coordinate. A feature that is absent from a
    vector contributes as 0 in dotProduct() and euclidean_dist().
    '''

    # Metadata keys that are skipped when no config_params are given.
    _NA_METADATA = ("resourceName",)

    def __init__(self, filename=None, features=None, config_params=None):
        '''
        Create a vector from a file's metadata.

        @param filename identifier of the vector (stored as self.filename)
        @param features mapping of metadata key -> raw value; it is only read,
                        never modified
        @param config_params optional mapping of metadata key -> type name
                        ("string", "int", "double" or "date"). When given, only
                        the listed keys that are present in features are used,
                        converted according to their type name. When omitted,
                        every key except those in _NA_METADATA is used and its
                        coordinate is the length of its stringified value.

        If filename or features is missing/empty the vector has no features.
        '''
        self.filename = filename  # filename is basically id for the vector
        self.features = {}

        if not (filename and features):
            return

        if config_params:
            for key in config_params:
                if key not in features:
                    continue
                kind = config_params[key]
                value = features[key]
                if kind == "string":
                    # NOTE: hash() of a str is salted per interpreter process
                    # unless PYTHONHASHSEED is fixed, so these coordinates are
                    # only comparable within a single run.
                    self.features[key] = hash(stringify(value))
                elif kind == "int":
                    self.features[key] = int(value)
                elif kind == "double":
                    self.features[key] = float(value)
                elif kind == "date":
                    self.features[key] = self._date_to_epoch(value)
        else:
            # Skip the ignored keys instead of popping them, so the caller's
            # mapping is left untouched.
            for key, value in features.items():
                if key in self._NA_METADATA:
                    continue
                self.features[key] = len(stringify(value))

    @staticmethod
    def _date_to_epoch(value):
        '''
        Convert a "YYYY-MM-DD" string to whole seconds since the epoch
        (midnight, local time). Any value that cannot be handled that way
        falls back to int(value), which may raise ValueError/TypeError.
        '''
        try:
            return int(datetime.strptime(value, "%Y-%m-%d").timestamp())
        except (TypeError, ValueError, OverflowError, OSError):
            return int(value)

    def __str__(self):
        '''
        Return the filename followed by one " key: value " line per feature.
        '''
        vector_str = "( {0} ): \n".format(self.filename)
        for key in self.features:
            vector_str += " {0}: {1} \n".format(key, self.features[key])
        return vector_str + "\n"

    def getMagnitude(self):
        '''
        Return the Euclidean norm: sqrt of the sum of squared coordinates.
        '''
        totalMagnitude = 0.0
        for key in self.features:
            totalMagnitude += self.features[key] ** 2
        return math.sqrt(totalMagnitude)

    def dotProduct(self, anotherVector):
        '''
        A = ax+by+cz
        B = mx+ny+oz
        A.B = a*m + b*n + c*o

        Only features present in both vectors contribute to the sum.
        '''
        dot_product = 0.0
        intersect_features = set(self.features) & set(anotherVector.features)

        for feature in intersect_features:
            dot_product += self.features[feature] * anotherVector.features[feature]
        return dot_product

    def cosTheta(self, v2):
        '''
        cosTheta = (V1.V2) / (|V1| |V2|)
        cos 0 = 1 implies identical documents

        Raises ZeroDivisionError when either vector has magnitude 0
        (for example when it has no features).
        '''
        return self.dotProduct(v2) / (self.getMagnitude() * v2.getMagnitude())

    def euclidean_dist(self, anotherVector):
        '''
        dist = ((x1-x2)^2 + (y1-y2)^2 + (z1-z2)^2)^(0.5)

        A feature present in only one of the two vectors contributes its
        own squared coordinate.
        '''
        intersect_features = set(self.features) & set(anotherVector.features)

        dist_sum = 0.0
        for feature in intersect_features:
            dist_sum += (self.features[feature] - anotherVector.features[feature]) ** 2

        # Features that exist only in this vector.
        for feature in set(self.features) - intersect_features:
            dist_sum += self.features[feature] ** 2

        # Features that exist only in the other vector.
        for feature in set(anotherVector.features) - intersect_features:
            dist_sum += anotherVector.features[feature] ** 2

        return math.sqrt(dist_sum)


In [8]:
# --- Configuration -----------------------------------------------------------
inputDir = 'SisiHTML/'
# Metadata keys that only identify a file; they are excluded from every measure.
na_metadata = ["resourceName"]
# Edit similarity only: when True, a feature present in just one of the two
# files counts as a distance of 1; when False only shared features are averaged.
allKeys = False
SCORE_COLUMNS = ["x-coordinate", "y-coordinate", "Similarity_score"]


def _jaccard_similarity(features1, features2):
    '''
    Jaccard score of two metadata dicts: the number of keys that exist in both
    with equal values, divided by (len(features1) + len(features2) - that number).
    Raises ZeroDivisionError when both dicts are empty.
    '''
    matching = sum(
        1 for key in features1
        if key in features2 and features1[key] == features2[key]
    )
    union = len(features1) + len(features2) - matching
    return float(matching) / union


def _edit_similarity(features1, features2, all_keys, ignored_keys):
    '''
    Return 1 - (average normalised edit distance) over the compared features.

    For every shared key the edit distance between the two stringified values
    is divided by the length of the longer value (0 when both are empty).
    With all_keys, each key found in only one dict adds a distance of 1 and is
    counted in the average. Keys in ignored_keys are never compared.
    Raises ZeroDivisionError when there is no feature to compare.
    '''
    shared_keys = [
        key for key in set(features1) & set(features2) if key not in ignored_keys
    ]

    total_distance = 0.0
    for key in shared_keys:
        value1 = stringify(features1[key])
        value2 = stringify(features2[key])
        longest = max(len(value1), len(value2))
        if longest:  # two empty values are identical: distance 0
            total_distance += float(editdistance.eval(value1, value2)) / longest

    compared = len(shared_keys)
    if all_keys:
        unshared = [
            key for key in set(features1) ^ set(features2) if key not in ignored_keys
        ]
        total_distance += len(unshared)
        compared += len(unshared)

    return 1 - total_distance / compared


# --- Collect the files -------------------------------------------------------
# Ids are paths relative to inputDir, so files in sub-directories are opened
# from the right place; for files directly in inputDir the id is the file name.
# Sorting makes the order of the pairs reproducible.
filename_list = sorted(
    os.path.relpath(os.path.join(root, filename), inputDir)
    for root, _, files in os.walk(inputDir)
    for filename in files
)

# --- Extract the metadata ----------------------------------------------------
# Identifier-only keys are stripped once, up front, so that all three measures
# see the same feature set regardless of the order in which they are evaluated.
All_features = {}
for filename in filename_list:
    metadata = parser.from_file(os.path.join(inputDir, filename))["metadata"] or {}
    All_features[filename] = {
        key: value for key, value in metadata.items() if key not in na_metadata
    }

# --- Score every pair of files -----------------------------------------------
data_cosine = []
data_jaccard = []
data_edit = []
# (measure, file1, file2, error) for every score that could not be computed.
skipped_pairs = []

files_tuple = itertools.combinations(filename_list, 2)
for file1, file2 in files_tuple:
    features = All_features[file1]
    features2 = All_features[file2]

    measures = (
        ("cosine", data_cosine,
         lambda: Vector(file1, features).cosTheta(Vector(file2, features2))),
        ("jaccard", data_jaccard,
         lambda: _jaccard_similarity(features, features2)),
        ("edit", data_edit,
         lambda: _edit_similarity(features, features2, allKeys, na_metadata)),
    )
    for measure, rows, compute in measures:
        # Best effort: a pair whose score is undefined (e.g. no features) is
        # left out of that measure's table, but the reason is recorded.
        try:
            rows.append([file1, file2, compute()])
        except Exception as error:
            skipped_pairs.append((measure, file1, file2, repr(error)))

if skipped_pairs:
    print("Skipped {0} score(s) that could not be computed; see skipped_pairs".format(len(skipped_pairs)))

df_cosine = pd.DataFrame(data=data_cosine, columns=SCORE_COLUMNS)
df_jaccard = pd.DataFrame(data=data_jaccard, columns=SCORE_COLUMNS)
df_edit = pd.DataFrame(data=data_edit, columns=SCORE_COLUMNS)

In [9]:
# Write each similarity table to its own CSV file, without the index column.
for score_frame, csv_path in (
    (df_cosine, 'cosine_similarity.csv'),
    (df_jaccard, 'jaccard_similarity.csv'),
    (df_edit, 'edit_similarity.csv'),
):
    score_frame.to_csv(csv_path, index=False)