In [1]:
import pandas as pd
from scipy.stats import entropy
import numpy as np
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
import string

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/kevin/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /Users/kevin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def get_article_text(url):
    response = requests.get(url)
    html_content = response.content
    
    soup = BeautifulSoup(html_content, "html.parser")
    article_divs = soup.find("div", class_="page-content")
    article_ps = article_divs.find_all("p")
    article_text = ""
    for p in article_ps:
        article_text += p.get_text()
        
    return article_text

In [3]:
def find_keywords(text):
    # Tokenize the text into individual words
    tokens = word_tokenize(text)

    # Remove stopwords (common words that may not carry much meaning)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]
    tokens = [token for token in tokens if token not in string.punctuation]
    
    total = len(tokens)
    
    # Perform frequency analysis
    freq_dist = FreqDist(tokens)

    return tokens

In [4]:
checkwords = ['Republican', 'candidate', 'race']

input_text = get_article_text("https://www.politico.com/news/2023/05/27/california-republicans-race-feinstein-senate-seat-00099053")
keywords = find_keywords(input_text)
first_article = [keywords.count(word) / len(keywords) for word in checkwords]

input_text = get_article_text("https://www.politico.com/news/2023/05/26/desantis-campaign-5-takeaways-00099070")
keywords = find_keywords(input_text)
second_article = [keywords.count(word) / len(keywords) for word in checkwords]

In [7]:
input_text

'\nElections\nDeSantis raised $8.2 million in his first 24 hours, while moving in conservative media to carve out a lane to Donald Trump’s right.\nFlorida Gov. Ron DeSantis’ team said he took in $8.2 million during the first 24 hours after his Twitter launch. | Chris Delmas/AFP via Getty Images\nBy Sally Goldenberg\n05/26/2023 05:53 PM EDT\nLink CopiedRon DeSantis bounced back from his glitchy launch event with a hefty fundraising haul and a flood of media appearances. The Florida governor’s first 36 hours in the race revealed a lot about what kind of candidate he’s trying to be — more conservative than Donald Trump but cautious of offending the former president’s most die-hard supporters.Here are five takeaways from the first day-plus on the trail for the newest entrant into the Republican presidential primary:A swirl of speculation leading up to DeSantis’ launch crystallized into one big, existential question: How would he handle Trump — his political benefactor-turned-primary-rival 

In [6]:
keywords

['Elections',
 'DeSantis',
 'raised',
 '8.2',
 'million',
 'first',
 '24',
 'hours',
 'moving',
 'conservative',
 'media',
 'carve',
 'lane',
 'Donald',
 'Trump',
 '’',
 'right',
 'Florida',
 'Gov',
 'Ron',
 'DeSantis',
 '’',
 'team',
 'said',
 'took',
 '8.2',
 'million',
 'first',
 '24',
 'hours',
 'Twitter',
 'launch',
 'Chris',
 'Delmas/AFP',
 'via',
 'Getty',
 'Images',
 'Sally',
 'Goldenberg',
 '05/26/2023',
 '05:53',
 'PM',
 'EDT',
 'Link',
 'CopiedRon',
 'DeSantis',
 'bounced',
 'back',
 'glitchy',
 'launch',
 'event',
 'hefty',
 'fundraising',
 'haul',
 'flood',
 'media',
 'appearances',
 'Florida',
 'governor',
 '’',
 'first',
 '36',
 'hours',
 'race',
 'revealed',
 'lot',
 'kind',
 'candidate',
 '’',
 'trying',
 '—',
 'conservative',
 'Donald',
 'Trump',
 'cautious',
 'offending',
 'former',
 'president',
 '’',
 'die-hard',
 'supporters.Here',
 'five',
 'takeaways',
 'first',
 'day-plus',
 'trail',
 'newest',
 'entrant',
 'Republican',
 'presidential',
 'primary',
 'swirl',
 

In [9]:
first_article

[0.014577259475218658, 0.01020408163265306, 0.013119533527696793]

In [33]:
kl_divergence = entropy(first_article, second_article)
kl_divergence

0.010602026390997967

In [35]:
m = 0.5 * (np.array(first_article) + np.array(second_article))

kl_pm = entropy(first_article, m)

kl_qm = entropy(second_article, m)

jsd = 0.5 * (kl_pm + kl_qm)
jsd

0.0038854280869009264