In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, cmudict
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import nltk

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
nltk.download('cmudict')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [3]:
def extract_text_from_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    paragraphs = soup.find_all('p')
    text = ' '.join([p.get_text() for p in paragraphs])
    return text

In [4]:
def syllable_count_word(word, cmu_dict):
    if word.lower() in cmu_dict:
        return max([len(list(y for y in x if y[-1].isdigit())) for x in cmu_dict[word.lower()]])
    # If word not found in CMU dictionary, estimate syllable count based on number of vowels
    return max(1, sum(1 for c in word.lower() if c in 'aeiou'))

In [5]:
def perform_text_analysis(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Removing stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words and word.isalnum()]
    # Word count
    word_count = len(filtered_tokens)
    # Average word length
    avg_word_length = sum(len(word) for word in filtered_tokens) / len(filtered_tokens) if len(filtered_tokens) > 0 else 0
    # Sentiment analysis
    sia = SentimentIntensityAnalyzer()
    sentiment = sia.polarity_scores(text)
    positive_score = sentiment['pos']
    negative_score = sentiment['neg']
    polarity_score = sentiment['compound']
    # Subjectivity analysis
    subjectivity_score = TextBlob(text).sentiment.subjectivity
    # Sentence analysis
    sentences = sent_tokenize(text)
    avg_sentence_length = sum(len(sent.split()) for sent in sentences) / len(sentences) if len(sentences) > 0 else 0
    avg_words_per_sentence = sum(len(word_tokenize(sent)) for sent in sentences) / len(sentences) if len(sentences) > 0 else 0
    # Complex word analysis
    cmu_dict = cmudict.dict()
    complex_word_count = 0
    syllable_count = 0
    for word in filtered_tokens:
        syllable_count += syllable_count_word(word, cmu_dict)
        if syllable_count_word(word, cmu_dict) >= 3:
            complex_word_count += 1
    percentage_complex_words = (complex_word_count / word_count) * 100 if word_count > 0 else 0
    # FOG index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    # Personal pronouns count
    personal_pronouns = ['I', 'my', 'we', 'us', 'ours']
    personal_pronoun_count = sum(1 for word in filtered_tokens if word.lower() in personal_pronouns)
    return positive_score, negative_score, polarity_score, subjectivity_score, avg_sentence_length, \
           percentage_complex_words, fog_index, avg_words_per_sentence, complex_word_count, word_count, \
           syllable_count / word_count, personal_pronoun_count, avg_word_length

In [6]:
url = 'https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-by-the-year-2040/'

In [7]:
txt = extract_text_from_url(url)

In [8]:
txt

'Efficient AWS Infrastructure Setup and Management: Addressing Security, Scalability, and Compliance Streamlined Equity Waterfall Calculation and Deal Management System Automated Orthopedic Case Report Generation: Harnessing Web Scraping and AI Integration Streamlining Time Calculation in Warehouse Management: Leveraging ShipHero API and Google BigQuery Integration Methodology for ETL Discovery Tool using LLMA, OpenAI, Langchain Methodology for database discovery tool using openai, LLMA, Langchain Chatbot using VoiceFlow How To Secure (SSL) Nginx with Let’s Encrypt on Ubuntu (Cloud VM, GCP, AWS, Azure, Linode) and Add Domain Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040. Rising IT Cities and Their Impact on the Economy, Environment, Infrastructure, and City Life in Future Internet Demand’s Evolution, Communication Impact, and 2035’s Alternative Pathways Rise of Cybercrime and its Effect in upcoming Future AI/ML and Predictiv

In [9]:
perform_text_analysis(txt)

(0.148,
 0.117,
 0.9905,
 0.4979332495461528,
 21.142857142857142,
 47.86780383795309,
 27.604264392324094,
 23.833333333333332,
 449,
 938,
 2.4840085287846483,
 1,
 7.5799573560767595)