In [22]:
import urllib.request
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import numpy as np
import re
from math import log
from sklearn.feature_extraction.text import CountVectorizer

# Module-level bag-of-words vectorizer (scikit-learn CountVectorizer with
# default settings), shared by the code below.
vectorizer = CountVectorizer()

# Module-level Porter stemmer instance from NLTK.
porter = PorterStemmer()

def remove_stopwords(tokens):
    """Return the tokens that are not English stop words, in original order.

    The stop-word list is NLTK's ``stopwords.words('english')``. Matching is
    exact (case-sensitive) string equality against that list.

    Args:
        tokens: Iterable of string tokens. It is not modified.

    Returns:
        A new list holding every token that is not in the stop-word list.
    """
    # A set gives O(1) membership tests; filtering in one pass avoids the
    # repeated O(n) ``list.remove`` calls a remove-in-place approach needs.
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

def termFrequency(query, text):
    """Return the normalized frequency of ``query`` among the words of ``text``.

    The text is lower-cased and split on whitespace. The result is the number
    of tokens exactly equal to the lower-cased query, divided by the total
    number of tokens.

    Args:
        query: Term to count; compared case-insensitively against whole
            whitespace-separated tokens.
        text: Document contents as a single string.

    Returns:
        A float between 0.0 and 1.0. Returns 0.0 when ``text`` contains no
        tokens (empty or whitespace-only).
    """
    words = text.lower().split()
    if not words:
        # An empty document has no terms; avoid dividing by zero.
        return 0.0
    occurrences = words.count(query.lower())
    return occurrences / float(len(words))
    
def inverseDocumentFrequency(query, text):
    """Return a log-scaled weight for ``query`` based on the size of ``text``.

    NOTE: despite the name, this is not a corpus-level inverse document
    frequency. The "number of documents containing the term" is either 0 or 1
    (whether ``query`` occurs in ``text``), and the "total number of
    documents" is ``len(text)``. For a string that is its character count.

    Args:
        query: Term to look for. When ``text`` is a string the match is a
            case-insensitive substring test, consistent with the
            case-insensitive matching in ``termFrequency``.
        text: Usually the document as a string. Any other container that
            supports ``in`` and ``len`` is accepted and matched with a plain
            membership test.

    Returns:
        ``log(len(text))`` as a float when ``query`` is found in a non-empty
        ``text``; otherwise the integer 0.
    """
    if isinstance(text, str):
        # Fold case so a query like "Dog" still matches lower-cased page text.
        found = query.lower() in text.lower()
    else:
        found = query in text

    size = len(text)
    if not found or size == 0:
        # size == 0 happens when an empty query "matches" an empty text;
        # log(0) would raise ValueError, so treat it as no match.
        return 0

    # Exactly one "document" contains the term, so the ratio is size / 1.
    return log(float(size))
    
def index_url(url_list, query):
    """Download each URL, score its text against ``query`` and print a ranking.

    For every URL the page is fetched, lower-cased, and reduced to its text
    with BeautifulSoup. The score is
    ``termFrequency(query, text) * inverseDocumentFrequency(query, text)``.
    URLs are then printed one per line, highest score first. URLs with equal
    scores are all printed, in the order they appear in ``url_list``.

    Side effect: the module-level ``vectorizer`` is refitted on the
    stop-word-filtered tokens of each page that has any, so after the call it
    reflects the last such page.

    Args:
        url_list: Iterable of URL strings to fetch.
        query: Search term used for scoring.

    Returns:
        None. The ranking is written to standard output.
    """
    scored_urls = []
    for url in url_list:
        # Fetch the page once; the context manager closes the connection.
        with urllib.request.urlopen(url) as response:
            page_bytes = response.read().lower()

        # Extract the text content from the HTML.
        soup = BeautifulSoup(page_bytes, 'html5lib')
        text = soup.get_text(strip=True)

        # Keep letters only, tokenize on whitespace and drop stop words.
        clean_text = re.sub("[^a-zA-Z]", " ", text)
        clean_tokens = remove_stopwords(clean_text.split())

        # Store the bag of words for this page in the shared vectorizer.
        # Skipped for pages without tokens, since fitting on an empty
        # collection raises ValueError.
        if clean_tokens:
            vectorizer.fit_transform(clean_tokens)

        # NOTE(review): scoring uses the raw extracted text, not the cleaned
        # tokens - confirm this is intended.
        score = termFrequency(query, text) * inverseDocumentFrequency(query, text)

        # Keep (score, url) pairs rather than a dict keyed by score, so pages
        # with identical scores do not overwrite one another.
        scored_urls.append((score, url))

    # sorted() is stable, so ties keep their input order even when reversed.
    for _, url in sorted(scored_urls, key=lambda pair: pair[0], reverse=True):
        print(url)


# Demo run: rank a set of Wikipedia articles against a single search term.
query = "dog"
url_list = [
    'https://en.wikipedia.org/wiki/Dog',
    'https://en.wikipedia.org/wiki/Horse',
    'https://en.wikipedia.org/wiki/Mammal',
    'https://en.wikipedia.org/wiki/Cat',
    'https://en.wikipedia.org/wiki/Lion',
    'https://en.wikipedia.org/wiki/Tiger',
    'https://en.wikipedia.org/wiki/Cheetah',
    'https://en.wikipedia.org/wiki/Leopard',
    'https://en.wikipedia.org/wiki/Elephant',
    'https://en.wikipedia.org/wiki/Wolf',
]
# Prints the URLs ordered from highest to lowest score.
index_url(url_list, query)

https://en.wikipedia.org/wiki/Dog
https://en.wikipedia.org/wiki/Wolf
https://en.wikipedia.org/wiki/Cat
https://en.wikipedia.org/wiki/Cheetah
https://en.wikipedia.org/wiki/Leopard
https://en.wikipedia.org/wiki/Tiger
https://en.wikipedia.org/wiki/Lion
https://en.wikipedia.org/wiki/Mammal
https://en.wikipedia.org/wiki/Horse
https://en.wikipedia.org/wiki/Elephant
