In [93]:
# Necessary Python Packages. 
import requests
import numpy as np
import pandas as pd
from urllib.request import urlopen
import re
import unicodedata
import json
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")

In [94]:
def find_common_words_from_wiki_page(n: int, page_id: int)->dict:
    """
    The function takes a page id of wiki and return the n top words with title. 
    
    Input Parms:
    n (int): number of top words.
    page_id (int): page id. 
    
    Returns:
    web_scrapped_list (dict): the dict containing all top n word count and Title.
    """
    
    # Building the URL. 
    URL = f"https://en.wikipedia.org/w/api.php?action=query&prop=extracts&pageids={page_id}&explaintext&format=json"
    raw_text = requests.get(URL).text
    
    # Getting the title of the page. 
    string_to_json = json.loads(raw_text)
    title_of_wiki_page = string_to_json['query']['pages']['21721040']['title']
    
    # preprocessing of the raw text. 
    # step1: Removing the URLs if present.
    processed_text = re.sub(r'https?://\S+|www\.\S+', '', raw_text)
    
    # step2: Removing the acceted characters. 
    processed_text = unicodedata.normalize('NFKD', processed_text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    
    # step3: Removing the punctutation from the string.
    processed_text = re.sub(r'[^a-zA-Z0-9]', ' ', processed_text)
    
    # step4: Removing the numbers and special characters.
    processed_text = re.sub(r'[^a-zA-Z]', ' ', processed_text)
    
    # step5: Removing the white space from the string. 
    processed_text = re.sub(r'^\s*|\s\s*', ' ', processed_text).strip()
    
    # step6: Removing the stop words.
    stp_words = stopwords.words("english")
    pattern = re.compile(r'\b(' + r'|'.join(stp_words) + r')\b\s*')
    processed_text = pattern.sub('', processed_text)
    
    # step7: Removing single letters.
    processed_text = re.sub('(\\b[A-Za-z] \\b|\\b [A-Za-z]\\b)', '', processed_text)
    
    # split() returns list of all the words in the string
    word_split = processed_text.split()

    # Pass the split_it list to instance of Counter class.
    count_of_words = Counter(word_split)

    # most_common() produces l frequently encountered
    most_occur = Counters_found.most_common(n)
    
    # dict to hold the results.
    final_output = defaultdict(list)
    for word in most_occur:
        final_output[word[1]].append(word[0])
    final_output = dict(final_output)
    final_output['Title'] = title_of_wiki_page
    
    # returning the final dict.
    return final_output

In [95]:
# Function call and printing the results. 
final_dict = find_common_words_from_wiki_page(10, 21721040)
print(final_dict)

{19: ['questions'], 17: ['Overflow'], 16: ['Stack'], 11: ['users'], 10: ['question'], 8: ['answer', 'site'], 7: ['website', 'answers'], 6: ['The'], 'Title': 'Stack Overflow'}
