### Q.1 Open the wikipedia page and scrapt the data: https://en.wikipedia.org/wiki/Saturn

In [29]:
import requests
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/wiki/Saturn'

headers = {
    'User-Agent': 'Custom-Script/1.9 (shivam@gmail.com)',
    'Accept-Language': 'en-US, en;q=0.5'
}

response = requests.get(url, headers = headers)
soup = BeautifulSoup(response.text, 'html.parser')

In [30]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from nltk import pos_tag

tokens = word_tokenize(soup.get_text())

stop_words = stopwords.words('english')
words = {token.lower() for token in tokens if token not in stop_words and token not in string.punctuation}

pos_tags = pos_tag(words)

In [3]:
from nltk import FreqDist

### 1. Find top 10 nouns from the page.

nouns = [word[0] for word in pos_tags if word[1].startswith('N')]

frequencies = FreqDist(nouns)
frequencies.most_common(10)

[('default', 1),
 ('r', 1),
 ('intelligence', 1),
 ('jr.', 1),
 ('phase', 1),
 ('result', 1),
 ('a8', 1),
 ('schmude', 1),
 ('night', 1),
 ('opposite', 1)]

In [4]:
from nltk.stem import WordNetLemmatizer
import re

# 2. Print all the unique verbs in their root form.

verbs = [word[0] for word in pos_tags if word[1].startswith('V') and re.match(r'^[a-z]+$', word[0])]

lemmatizer = WordNetLemmatizer()

lammatized_verbs = [lemmatizer.lemmatize(verb, pos='v') for verb in verbs]

print('Verbs in root form\n:', lammatized_verbs)

Verbs in root form
: ['galaxy', 'saw', 'lightning', 'ping', 'require', 'hilton', 'estimate', 'pass', 'include', 'doubleday', 'oblate', 'reveal', 'mccarthy', 'rotate', 'nemesis', 'light', 'incline', 'number', 'associate', 'dynamic', 'place', 'come', 'munsell', 'take', 'discover', 'mourn', 'assume', 'remain', 'carry', 'write', 'ganymede', 'extend', 'descend', 'do', 'encyclopedia', 'length', 'lovett', 'depict', 'occur', 'https', 'spectator', 'nbc', 'occult', 'celebrate', 'georgia', 'fifth', 'discoveries', 'stamatios', 'extend', 'leave', 'pass', 'unreported', 'lock', 'scale', 'strip', 'identify', 'round', 'model', 'twentieth', 'compute', 'date', 'publish', 'drag', 'cast', 'occur', 'find', 'ascend', 'take', 'charon', 'receive', 'complete', 'calypso', 'anthony', 'perform', 'roche', 'vary', 'hestroffer', 'observe', 'support', 'learner', 'suspect', 'surround', 'control', 'retrieve', 'be', 'deimos', 'note', 'gunter', 'languages', 'eclipse', 'intermix', 'form', 'make', 'synestia', 'california', 

In [5]:
from nltk import RegexpParser

# 3. Extract all the <DT><JJ><NN> phrases from the article.

rule = 'DTJJNN : {<DT><JJ><NN>}'

parser = RegexpParser(rule)

tree = parser.parse(pos_tags)

tree.draw()

In [45]:
from nltk import sent_tokenize
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.text_rank import TextRankSummarizer

# 4. Summarize the text using TextRank Algorithm

sentences = soup.text.split('\n')
sentences = [sentence for sentence in sentences if len(sentence.split(' ')) > 3]
sentences = '\n'.join(sentences)

sentences_count = len(sent_tokenize(sentences))

text_parser = PlaintextParser.from_string(sentences, Tokenizer('english'))

summarizer = TextRankSummarizer() 
summary = summarizer(text_parser.document, sentences_count = 3)

for sentence in summary:
    print(sentence)

The combination of the bulge and the rotation rate means that the effective surface gravity along the equator, 8.96 m/s2, is 74% of what it is at the poles and is lower than the surface gravity of Earth.
The Moon's orbit is inclined by several degrees relative to Saturn's, so occultations will only occur when Saturn is near one of the points in the sky where the two planes intersect (both the length of Saturn's year and the 18.6-Earth-year nodal precession period of the Moon's orbit influence the periodicity).
(Audio help · More spoken articles) Saturn overview by NASA's Science Mission Directorate Saturn fact sheet Archived 29 May 2018 at the Wayback Machine at the NASA Space Science Data Coordinated Archive Saturnian System terminology by the IAU Gazetteer of Planetary Nomenclature Cassini-Huygens legacy website by the Jet Propulsion Laboratory Interactive 3D gravity simulation of the Cronian system Archived 17 August 2020 at the Wayback Machine Solar eclipses on Saturn The Day the E

In [48]:
from sumy.summarizers.lsa import LsaSummarizer
# 5. Summarize the text using extractive summarization

lsa_summarizer = LsaSummarizer()
text_parser = PlaintextParser.from_string(sentences, Tokenizer('english'))
lsa_summary = lsa_summarizer(text_parser.document, sentences_count=3)

for sentence in lsa_summary:
    print(sentence)

[47][48][49] Saturn has a hot interior, reaching 11,700 °C (21,100 °F) at its core, and radiates 2.5 times more energy into space than it receives from the Sun.
[114] Saturn's moon Enceladus, which seems similar in chemical makeup to comets,[115] has often been regarded as a potential habitat for microbial life.
[160] In 1899, William Henry Pickering discovered Phoebe, a highly irregular satellite that does not rotate synchronously with Saturn as the larger moons do.
