# In [1]:
from html.parser import HTMLParser  
from urllib.request import urlopen  
from urllib import parse
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re, math
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


class LinkParser(HTMLParser):
    """HTML parser that collects the target of every ``<a href=...>`` it sees.

    Each href is resolved against ``self.baseUrl`` with ``urljoin`` and
    appended to ``self.links`` in document order.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Initialise state up front so the parser can be fed directly
        # (without going through getLinks) without raising AttributeError.
        self.links = []
        self.baseUrl = ""

    def handle_starttag(self, tag, attrs):
        """Record the resolved href of every anchor start tag."""
        if tag != 'a':
            return
        for key, value in attrs:
            if key == 'href':
                # append() instead of rebuilding the list for every link.
                self.links.append(parse.urljoin(self.baseUrl, value))

    def getLinks(self, url):
        """Fetch *url* and return ``(html_text, links)``.

        Returns ``("", [])`` when the response's media type is not
        ``text/html``.  Exceptions raised by ``urlopen`` propagate to the
        caller.
        """
        self.links = []
        self.baseUrl = url
        # The context manager closes the connection even on early return/raise.
        with urlopen(url) as response:
            # get_content_type() yields the lower-cased media type without
            # parameters, so "text/html; charset=utf-8" is accepted too.
            if response.headers.get_content_type() != 'text/html':
                return "", []
            charset = response.headers.get_content_charset() or 'utf-8'
            htmlBytes = response.read()

        try:
            htmlString = htmlBytes.decode(charset, errors='replace')
        except LookupError:
            # The server announced a charset Python does not know.
            htmlString = htmlBytes.decode('utf-8', errors='replace')

        self.feed(htmlString)
        return htmlString, self.links

# Matches runs of word characters; text_to_vector uses it to split text into tokens.
WORD = re.compile(r'\w+')
# Module-level accumulator: spider() appends one [cosine, url] pair per visited page.
cosine_vals = []

def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse term-weight mappings.

    Both arguments map terms to numeric weights.  The result is the dot
    product over the shared terms divided by the product of the two
    Euclidean norms, or 0.0 when either norm is zero.
    """
    shared_terms = set(vec1) & set(vec2)
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    squared_norm1 = sum(weight ** 2 for weight in vec1.values())
    squared_norm2 = sum(weight ** 2 for weight in vec2.values())
    magnitude = math.sqrt(squared_norm1) * math.sqrt(squared_norm2)

    if magnitude:
        return float(dot_product) / magnitude
    # At least one vector is empty or all-zero.
    return 0.0

def text_to_vector(text):
    """Count how often each token matched by the module-level WORD pattern occurs in *text*.

    Returns a ``Counter`` keyed by token.
    """
    term_counts = Counter()
    term_counts.update(WORD.findall(text))
    return term_counts
 
    
    
def spider(url, maxPages, query=None):
    """Crawl breadth-first from *url* and score each page against a query.

    For every page visited (at most *maxPages*):
      * the text of all ``<p>`` elements is extracted and tokenised,
      * English stop words are removed (case-insensitively),
      * the remaining tokens are written to ``File <n>.txt`` in the
        current working directory,
      * the cosine similarity between the query and the page text is
        printed and appended to the module-level ``cosine_vals`` list as
        ``[cosine, url]``.

    Args:
        url: Address at which the crawl starts.
        maxPages: Maximum number of distinct pages to visit.
        query: Text every page is compared against.  Defaults to the
            query that used to be hard-coded in this function.

    A page that cannot be fetched or decoded is reported and treated as an
    empty page (empty file, cosine 0.0) so file numbering stays contiguous.
    """
    from collections import deque  # function-scope import; only needed here

    if query is None:
        query = "We make sure website fast , secure & always -so visitors & search engines trust"

    print(url)
    print(maxPages)

    # Loop invariants: build the stop-word set and query vector only once.
    stop_words = set(stopwords.words('english'))
    query_vector = text_to_vector(query)

    pagesToVisit = deque([url])  # O(1) pops from the left
    seen = set()                 # URLs already crawled, to avoid re-scoring them
    numberVisited = 0

    while numberVisited < maxPages and pagesToVisit:
        url = pagesToVisit.popleft()
        if url in seen:
            continue
        seen.add(url)
        numberVisited += 1

        print(str(numberVisited) + ". Visiting:", url)
        try:
            data, links = LinkParser().getLinks(url)
        except (OSError, ValueError) as err:
            # OSError covers urllib's URLError/HTTPError; ValueError covers
            # malformed URLs and decode failures.  One bad link must not
            # abort the whole crawl.
            print("Could not fetch", url, "-", err)
            data, links = "", []

        web_data = BeautifulSoup(data, 'lxml')

        # Join with a space so the last word of one paragraph is not fused
        # with the first word of the next.
        complete_text = " ".join(para.text for para in web_data.find_all('p'))

        pagesToVisit.extend(links)

        # The stop-word list is lower-case, so compare case-insensitively.
        filtered_sentence = [
            w for w in word_tokenize(complete_text)
            if w.lower() not in stop_words
        ]
        filtered_text = " ".join(filtered_sentence)

        filename = "File " + str(numberVisited) + ".txt"
        with open(filename, "w") as f:
            f.write(filtered_text)

        cosine = get_cosine(query_vector, text_to_vector(filtered_text))

        print('Cosine:', cosine)

        cosine_vals.append([cosine, url])

def print_cosine():
    """Print the module-level ``cosine_vals`` list as-is."""
    print(cosine_vals)

def print_decreasing():
    """Sort ``cosine_vals`` in place in descending order and print each entry's URL.

    Entries are compared as whole lists, so the sort key is the first
    element followed by the second.
    """
    cosine_vals.sort(reverse=True)

    for entry in cosine_vals:
        page_url = entry[1]
        print(page_url)
        
def calculate_similarity(maxPages, query):
    """Print TF-IDF cosine similarities between *query* and the saved page files.

    Reads ``File 1.txt`` .. ``File <maxPages>.txt`` from the current working
    directory, vectorises the query together with those documents using
    TF-IDF (English stop words removed) and prints the similarity of the
    query (row 0) against every row of the matrix, itself included.

    Args:
        maxPages: Number of ``File <n>.txt`` documents to load.
        query: Text the documents are compared against.

    Raises:
        FileNotFoundError: If one of the expected files does not exist.
    """
    # Bring in standard stopwords
    stopWords = stopwords.words('english')

    print ("\nCalculating document similarity scores...")

    train_set = [query]
    for i in range(maxPages):
        # Use a context manager so each file handle is closed promptly.
        with open('File ' + str(i+1) + '.txt') as f:
            train_set.append(f.read())

    # Set up the vectoriser, passing in the stop words
    tfidf_vectorizer = TfidfVectorizer(stop_words=stopWords)

    # Apply the vectoriser to the training set
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)

    # Print the score
    print ("\nSimilarity Score [*] ",cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train))

# In [None]:
# Usage: spider(start_url, maxPages)
# Crawl up to 5 pages starting at http://www.dreamhost.com.
spider("http://www.dreamhost.com", 5)