In [20]:

from tika import parser
from vector import Vector
import os, itertools, argparse, csv
from requests import ConnectionError
from time import sleep
import ast
import pandas as pd
import json


In [34]:

import math

def stringify(attribute_value):
    '''
    Flatten a metadata value into one whitespace-stripped string.

    @param attribute_value a string, a list/tuple of values, or any other scalar
    @return for a list/tuple, the items joined with ", " and stripped at both
            ends; for None, the empty string; otherwise the stripped string
            form of the value
    '''
    if attribute_value is None:
        return ""
    if isinstance(attribute_value, (list, tuple)):
        # Items are converted with str() so a mixed list (e.g. strings and
        # numbers) does not make join() raise TypeError.
        return ", ".join(str(item) for item in attribute_value).strip()
    if not hasattr(attribute_value, "strip"):
        # Scalars such as int/float have no strip(); go through their string form.
        attribute_value = str(attribute_value)
    return str(attribute_value.strip())


class Vector:
    '''
    An instance of this class represents a vector in n-dimensional space.

    Every dimension is a named feature stored in ``self.features``
    (feature name -> numeric value). ``self.filename`` is the vector's id.
    '''

    # Metadata keys that are left out of the vector when no config is given.
    _NA_METADATA = ("resourceName",)

    def __init__(self, filename=None, features=None, config_params=None):
        '''
        Create a vector
        @param filename id of the vector
        @param features metadata features (feature name -> raw value)
        @param config_params optional mapping of feature name -> one of
               "string", "int", "double", "date"; only the listed features
               that are present in ``features`` are used. Without it, every
               feature (except ``_NA_METADATA``) is encoded as the length of
               its stringified value.

        ``features`` is never modified. If ``filename`` or ``features`` is
        missing/empty the vector is left without features.
        '''
        # Local import: keeps the class usable without a module-level import.
        from datetime import datetime

        self.filename = filename  # filename is basically id for the vector
        self.features = {}

        if not (filename and features):
            return

        if config_params:
            for key in config_params:
                if key not in features:
                    continue
                kind = config_params[key]
                raw_value = features[key]
                if kind == "string":
                    # NOTE: hash() of a str is salted per interpreter process
                    # unless PYTHONHASHSEED is fixed, so this value is not
                    # reproducible across kernel restarts.
                    self.features[key] = hash(stringify(raw_value))
                elif kind == "int":
                    self.features[key] = int(raw_value)
                elif kind == "double":
                    self.features[key] = float(raw_value)
                elif kind == "date":
                    try:
                        # Seconds since the epoch for local midnight of that day.
                        parsed = datetime.strptime(raw_value, "%Y-%m-%d")
                        self.features[key] = int(parsed.timestamp())
                    except (ValueError, TypeError):
                        # Not a YYYY-MM-DD string: treat the value as an
                        # integer timestamp already.
                        self.features[key] = int(raw_value)
        else:
            for key, value in features.items():
                if key in self._NA_METADATA:
                    continue
                self.features[key] = len(stringify(value))

    def __str__(self):
        '''
        Human-readable dump: the id on the first line, one feature per line.
        '''
        vector_str = "( {0} ): \n".format(self.filename)
        for key in self.features:
            vector_str += " {0}: {1} \n".format(key, self.features[key])
        return vector_str + "\n"

    def getMagnitude(self):
        '''
        |V| = sqrt(sum of squared feature values); 0.0 for a vector
        without features.
        '''
        return math.sqrt(sum((value ** 2 for value in self.features.values()), 0.0))

    def dotProduct(self, anotherVector):
        '''
        A = ax+by+cz
        B = mx+ny+oz
        A.B = a*m + b*n + c*o

        Only features present in both vectors contribute.
        '''
        shared_features = set(self.features) & set(anotherVector.features)
        return sum(
            (self.features[name] * anotherVector.features[name] for name in shared_features),
            0.0,
        )

    def cosTheta(self, v2):
        '''
        cosTheta = (V1.V2) / (|V1| |V2|)
        cos 0 = 1 implies identical documents

        Returns 0.0 when either vector has zero magnitude (the cosine is
        undefined there) instead of raising ZeroDivisionError.
        '''
        denominator = self.getMagnitude() * v2.getMagnitude()
        if denominator == 0:
            return 0.0
        return self.dotProduct(v2) / denominator

    def euclidean_dist(self, anotherVector):
        '''
        dist = ((x1-x2)^2 + (y1-y2)^2 + (z1-z2)^2)^(0.5)

        A feature missing from one of the vectors counts as 0 on that side.
        '''
        all_features = set(self.features) | set(anotherVector.features)
        dist_sum = 0.0
        for name in all_features:
            delta = self.features.get(name, 0) - anotherVector.features.get(name, 0)
            dist_sum += delta ** 2
        return math.sqrt(dist_sum)


In [35]:
# computeScores('htmls/', 'test.csv', 'html')
# Builds one metadata Vector per file under inputDir and stores the cosine
# similarity of every unordered pair of files in the DataFrame `df`.

inputDir = 'htmls/'
acceptTypes = 'html'
outCSV = 'test.csv'
filename_list = []

# Collect every file below inputDir, pruning hidden directories in place so
# os.walk does not descend into them. Paths are kept relative to inputDir so
# that files found in sub-directories can still be opened later.
for root, dirnames, files in os.walk(inputDir):
    dirnames[:] = [d for d in dirnames if not d.startswith('.')]
    for filename in files:
        filename_list.append(os.path.relpath(os.path.join(root, filename), inputDir))

# Parse each file exactly once; parsing inside the pair loop would send every
# file through Tika once per pair it takes part in.
vectors = {}
for filename in filename_list:
    try:
        metadata = parser.from_file(os.path.join(inputDir, filename))["metadata"]
        vectors[filename] = Vector(filename, metadata)
    except Exception as e:
        # Best effort: a file that cannot be parsed is reported and left out.
        print("Skipping {0}: {1!r}".format(filename, e))

data = []
for file1, file2 in itertools.combinations(filename_list, 2):
    if file1 not in vectors or file2 not in vectors:
        continue
    try:
        similarity = vectors[file1].cosTheta(vectors[file2])
    except ZeroDivisionError:
        # A vector without any usable metadata has zero magnitude.
        print("Skipping pair ({0}, {1}): zero-magnitude vector".format(file1, file2))
        continue
    data.append([file1, file2, similarity])

df = pd.DataFrame(data=data,columns=["x-coordinate","y-coordinate","Similarity_score"])

In [36]:
# Write the pairwise scores to disk without the DataFrame index column.
df.to_csv(path_or_buf='cosine_similarity.csv', index=False)