In [29]:
import concurrent.futures
import re
import requests
import nltk
from nltk import tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import time

MAX_WORKERS = 16

def get_wikipedia_article(wiki_article, timeout=10):
    '''
    Method that gets a Wikipedia page.
    '''
    wiki_page_url = 'https://en.wikipedia.org/wiki/' + wiki_article
    response = requests.get(url=wiki_page_url, timeout=timeout)
    article = response.text
    return article

def get_wikipedia_articles(wiki_articles):
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(get_wikipedia_article, wiki_article=article) for article in wiki_articles]
        articles = [future.result() for future in concurrent.futures.as_completed(futures)]
    return articles

# Functions to clean HTML, punctuation marks and symbols from the original articles
def clean_html(raw_html):
    cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    cleantext = re.sub(cleanr, ' ', raw_html)
    return cleantext

def clean_punctuation_marks(text):
    return re.sub(r'[^\w\s]', ' ', text)

def clean_return_and_tabs(text):
    return re.sub(r'[\n\t]', ' ', text)

def clean_article(data):
    cdata = clean_html(data)
    cdata = clean_punctuation_marks(cdata)
    cdata = clean_return_and_tabs(cdata)
    return cdata

def remove_stopwords(document, stopwords):
    return [word for word in document if word not in stopwords]

def remove_numbers(document):
    return [word for word in document if not any(char.isdigit() for char in word)]

def remove_wiki_and_wg_words(document):
    return [word for word in document if not word.startswith('wg') and not word.startswith('wiki')]

def process_article(article, stopwords, stemmer):
    article = article.lower()
    article = clean_article(article)
    tokens = tokenize.word_tokenize(article)
    tokens = remove_stopwords(tokens, stopwords)
    tokens = list(map(stemmer.stem, tokens))
    tokens = remove_numbers(tokens)
    tokens = remove_wiki_and_wg_words(tokens)
    return tokens

def map_phase(wiki_articles, stopwords, stemmer):
    articles = get_wikipedia_articles(wiki_articles)
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        processed_articles = list(executor.map(lambda article: process_article(article, stopwords, stemmer), articles))
    return processed_articles

def reduce_phase(processed_articles):
    # Combine all processed words into a single list
    all_words = [word for article in processed_articles for word in article]

    # Convert the list of words into space-separated strings for TfidfVectorizer
    corpus = [' '.join(article) for article in processed_articles]

    # Initialize TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer()

    # Compute TF-IDF scores
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

    # Get feature names (words)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Create a dictionary to store top 20 keywords for each article
    top_keywords = {}

    # Iterate over each article
    for i, article in enumerate(processed_articles):
        # Get the TF-IDF scores for the current article
        article_tfidf_scores = tfidf_matrix[i].toarray()[0]

        # Create a dictionary to store TF-IDF scores for each word
        word_tfidf_scores = {word: score for word, score in zip(feature_names, article_tfidf_scores)}

        # Sort words by their TF-IDF scores in descending order
        sorted_words = sorted(word_tfidf_scores.items(), key=lambda x: x[1], reverse=True)

        # Extract top 20 keywords
        top_keywords[f'Article_{i+1}'] = [word for word, score in sorted_words[:20]]

    return top_keywords, tfidf_matrix

def calculate_similarity(tfidf_matrix):
    # Calculate cosine similarity between all articles
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return similarity_matrix

# Main workflow
if __name__ == "__main__":
    wiki_articles = ['lion', 'beetle', 'tiger']

    stopwords = set(['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during',
                     'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours',
                     'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as',
                     'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your',
                     'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should',
                     'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when',
                     'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does',
                     'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not',
                     'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which',
                     'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by',
                     'doing', 'it', 'how', 'further', 'was', 'here', 'than', '_s', '_j', '_m', '_a'])

    stemmer = SnowballStemmer('english')

    start = time.perf_counter()

    # Map phase
    processed_articles = map_phase(wiki_articles, stopwords, stemmer)

    # Reduce phase
    top_keywords, tfidf_matrix = reduce_phase(processed_articles)

    # Calculate similarity
    similarity_matrix = calculate_similarity(tfidf_matrix)

    stop = time.perf_counter()
    print('The process took', stop - start, 'seconds')
    print('Top keywords for each article:', top_keywords)
    print('Similarity matrix:\n', similarity_matrix)


The process took 1.5914746999987983 seconds
Top keywords for each article: {'Article_1': ['lion', 'mw', 'parser', 'output', 'toc', 'vector', 'panthera', 'africa', 'doi', 'pride', 'leo', 'nation', 'cat', 'list', 'li', 'male', 'origin', 'archiv', 'id', 'african'], 'Article_2': ['tiger', 'mw', 'parser', 'output', 'doi', 'tigri', 'panthera', 'toc', 'vector', 'cat', 'conserv', 'popul', 'li', 'rang', 'speci', 'list', 'hlist', 'thapar', 'pp', 'mi'], 'Article_3': ['beetl', 'mw', 'parser', 'insect', 'output', 'toc', 'vector', 'coleoptera', 'speci', 'doi', 'li', 'id', 'larva', 'list', 'class', 'level', 'edit', 'use', 'item', 'hlist']}
Similarity matrix:
 [[1.         0.46680648 0.25694226]
 [0.46680648 1.         0.2507467 ]
 [0.25694226 0.2507467  1.        ]]
