In [9]:
import nltk

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
import requests
from bs4 import BeautifulSoup
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from heapq import nlargest

def get_text_from_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    text = soup.get_text()
    return text

def tokenize_and_count_words(text):
    # Tokenization
    words = re.findall(r'\b\w+\b', text.lower())

    # Remove stopwords and punctuations
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.isalpha() and word not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    # Word count
    word_count = Counter(words)
    return word_count

def perform_text_summarization(text, num_sentences=3):
    # Remove extra whitespaces and newlines
    text = re.sub(r'\s+', ' ', text)

    # Split text into sentences
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)

    # Calculate sentence scores based on word frequency
    word_count = tokenize_and_count_words(text)
    sentence_scores = {sentence: sum(word_count[word] for word in re.findall(r'\b\w+\b', sentence.lower()) if word.isalpha()) for sentence in sentences}

    # Select top-ranked sentences for summary
    summary_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)
    return summary

# URL of the social media web page to scrape
url = 'https://www.geeksforgeeks.org/python-program-crawl-web-page-get-frequent-words/'  # Replace with the actual URL

# Get text from the web page
text = get_text_from_url(url)

# Tokenization and word count
word_count = tokenize_and_count_words(text)
print("Word Count:")
wcount = 0
for word, count in word_count.items():
    wcount+=1
    print(f"{word}: {count}")

# Text summarization
summary = perform_text_summarization(text)
print ("Wordcount")
print(wcount)
print("\nSummary:")
print(summary)


Word Count:
python: 7
program: 11
crawl: 5
web: 15
page: 7
get: 8
frequent: 10
word: 23
geeksforgeek: 4
skip: 1
content: 7
coursesdata: 1
structur: 9
algorithmsdsa: 1
interview: 14
preparationdsa: 1
live: 14
work: 2
professionalsdsa: 1
self: 7
pace: 6
c: 5
javadsa: 1
pythondsa: 1
javascriptdsa: 1
cfor: 1
professionalsdata: 1
algorithm: 6
class: 48
system: 2
design: 7
devop: 2
data: 24
javascriptexplor: 1
coursesfor: 1
studentsinterview: 1
prepar: 4
coursedata: 3
scienc: 18
gate: 6
cs: 16
javascriptdata: 1
java: 2
pythonexplor: 1
coursesprogram: 1
languagesc: 1
beginn: 3
advancedjava: 1
advancedc: 1
advancedweb: 1
developmentful: 1
stack: 2
develop: 7
react: 3
node: 2
js: 2
backend: 4
android: 2
app: 3
kotlin: 2
django: 2
machin: 3
learn: 11
sciencecomplet: 1
master: 1
analyticsnew: 1
coursespython: 1
engin: 2
plan: 1
productionschool: 1
coursescbs: 1
comput: 2
scienceschool: 1
guideal: 1
coursestutorialsdsadata: 1
structuresarrayslink: 3
liststackqueuebinari: 3
treebinari: 3
search: 3
