## SEO Ranker using ITC
#### By Vedank Goyal 2K18/MC/122

### Import libraries

In [1]:
# For extracting data
import requests
from bs4 import BeautifulSoup

# For analysing data
from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist
# Shared tokenizer used by get_word_counts: the pattern \w+ matches runs of
# word characters, so punctuation and whitespace never become tokens.
tokenizer = RegexpTokenizer(r'\w+')

# For calculation
import math

### Add sites to rank

In [2]:
# Pages to score, in the order their entropies will be reported.
sites = [
    "https://www.healthline.com/nutrition/10-health-benefits-of-apples#:~:text=Apples%20are%20an%20incredibly%20nutritious,improve%20gut%20and%20brain%20health.",
    "https://www.eatingwell.com/article/17769/5-health-benefits-of-an-apple/",
    "https://www.everydayhealth.com/diet-nutrition/impressive-health-benefits-of-apples/",
    "https://www.medicalnewstoday.com/articles/267290",
    "https://www.insider.com/benefits-of-apples",
    "https://www.bbcgoodfood.com/howto/guide/health-benefits-apples",
    "https://www.news-medical.net/health/What-Are-the-Health-Benefits-of-Apples.aspx",
    "https://minnetonkaorchards.com/health-benefits-of-apples/",
]

### Define data extracting functions

In [3]:
def get_data(site, timeout=30):
    """Download a page and return the text extracted from its HTML.

    Parameters
    ----------
    site : str
        URL of the page to fetch.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without it a server that stops
        responding would block the notebook indefinitely.

    Returns
    -------
    str
        Result of ``BeautifulSoup(...).get_text()`` on the response body.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection failures, timeouts, ...).

    Notes
    -----
    The HTTP status code is not checked, so an error page is parsed like any
    other response.
    """
    req = requests.get(site, timeout=timeout)
    soup = BeautifulSoup(req.content, "html.parser")
    return soup.get_text()

In [4]:
def get_word_counts(site_text):
    """Split text into tokens and count how often each token occurs.

    Tokenization uses the module-level ``tokenizer``. Case is not normalized
    here, so differently-cased spellings are counted as separate tokens.

    Returns
    -------
    tuple
        ``(tokens, freq)``: the output of ``tokenizer.tokenize`` and a
        ``FreqDist`` built from those tokens.
    """
    words = tokenizer.tokenize(site_text)
    return words, FreqDist(words)

### Define data analysing functions

In [5]:
def update_key(key, required_frequencies, frequency):
    """Add ``frequency`` to the running total stored under ``key``.

    The dictionary is modified in place and also returned. A key that is not
    present yet is created with ``frequency`` as its initial value.
    """
    # First sighting of this key: store the value as-is and finish.
    if key not in required_frequencies:
        required_frequencies[key] = frequency
        return required_frequencies

    current = required_frequencies[key]
    required_frequencies[key] = current + frequency
    return required_frequencies

In [6]:
def get_required_frequencies(required_words, word_frequencies, include_missing=False):
    """Collect the counts of the required words from a token-frequency table.

    Tokens are lower-cased before matching, so counts of differently-cased
    spellings are merged under the lower-case word. ``required_words`` is
    compared against those lower-cased tokens and should therefore contain
    lower-case entries.

    Parameters
    ----------
    required_words : collection of str
        Words whose occurrences should be counted.
    word_frequencies : mapping of str to int
        Token -> number of occurrences.
    include_missing : bool, optional
        When True, every entry of ``required_words`` that never occurred is
        added to the result with a count of 0. The default (False) leaves
        such words out of the mapping.

    Returns
    -------
    tuple
        ``(required_frequencies, total_count)``: the per-word counts and the
        sum of all counted occurrences.

    Notes
    -----
    With ``include_missing=True`` the mapping can be non-empty while
    ``total_count`` is 0; do not divide by the total without checking it.
    """
    required_frequencies = {}
    total_count = 0
    for token, count in word_frequencies.items():
        word = token.lower()
        if word in required_words:
            required_frequencies[word] = required_frequencies.get(word, 0) + count
            total_count += count

    # The zero-fill has to walk the *requested* words; walking the result
    # dictionary itself can never find a missing entry.
    if include_missing:
        for word in required_words:
            required_frequencies.setdefault(word, 0)

    return required_frequencies, total_count

In [7]:
def get_probabilities(required_frequencies):
    """Turn per-word counts into relative frequencies.

    Parameters
    ----------
    required_frequencies : sequence
        Element 0 is a mapping word -> count, element 1 is the total count
        used as the denominator.

    Returns
    -------
    dict
        word -> count / total. If the total is 0, every word maps to 0.0
        instead of raising ``ZeroDivisionError``.
    """
    frequencies = required_frequencies[0]
    count = required_frequencies[1]

    # Nothing was counted: there is no distribution to normalize.
    if count == 0:
        return {key: 0.0 for key in frequencies}

    return {key: value / count for key, value in frequencies.items()}

### Define function to calculate average entropy

In [8]:
def calculate_average_entropy(probabilites):
    """Return the base-2 entropy, ``-sum(p * log2(p))``, of the given values.

    ``probabilites`` maps each word to its probability. Entries equal to 0
    are skipped, so they contribute nothing to the sum.
    """
    entropy = 0
    for p in probabilites.values():
        if p == 0:
            # log(0) is undefined; a zero-probability word adds no entropy.
            continue
        entropy = entropy - p * math.log(p, 2.0)
    return entropy

### Create list of expected words

In [9]:
# Words whose relative frequencies on each page feed the entropy score.
expected_words = [
    "apple",
    "vitamin",
    "health",
    "apples",
    "gut",
    "bacteria",
    "bowel",
    "bones",
    "fiber",
    "blood",
    "cholesterol",
    "diabetes",
    "immune",
    "cancer",
    "heart",
    "bone",
]
expected_words

['apple',
 'vitamin',
 'health',
 'apples',
 'gut',
 'bacteria',
 'bowel',
 'bones',
 'fiber',
 'blood',
 'cholesterol',
 'diabetes',
 'immune',
 'cancer',
 'heart',
 'bone']

### Get Entropies

In [12]:
# Score every page: fetch -> count tokens -> keep the expected words ->
# normalize to probabilities -> entropy. Results are appended in `sites` order.
site_entropies = []
for site in sites:
    words_data = get_word_counts(get_data(site))
    # Element 1 of words_data is the frequency table; keep non-empty tokens only.
    filter_words = {
        token: count
        for token, count in words_data[1].items()
        if len(token) >= 1
    }
    required_frequencies = get_required_frequencies(expected_words, filter_words)
    probabilites = get_probabilities(required_frequencies)
    average_entropy = calculate_average_entropy(probabilites)
    site_entropies.append(average_entropy)

In [13]:
# One entropy value per URL, in the same order as `sites`.
site_entropies

[3.2379856531716693,
 3.051202097711677,
 3.154489619331466,
 3.1078686169163063,
 3.3364908890460483,
 2.5683215021992103,
 3.036339074111859,
 2.891773912162458]