In [2]:
# Explicit %pip magic: does not rely on IPython automagic being enabled,
# and cannot be shadowed by a variable that happens to be named `pip`.
%pip install tensorflow-datasets

Note: you may need to restart the kernel to use updated packages.


In [4]:
import requests
from collections import Counter
import re

def load_and_preprocess_shakespeare(
    url="https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt",
    timeout=30,
):
    """
    Downloads and preprocesses the Shakespeare dataset:
    - Downloads text from a public URL
    - Raises on HTTP errors instead of tokenizing an error page
    - Converts to lowercase
    - Removes punctuation
    - Splits text into individual words

    Parameters:
    - url: Location of the plain-text corpus. Defaults to the public
      TensorFlow-hosted Shakespeare file.
    - timeout: Timeout (in seconds) passed to requests.get so a stalled
      connection cannot hang the notebook forever.

    Returns:
    - List of words in the dataset.

    Raises:
    - requests.exceptions.HTTPError: if the server answers with a 4xx/5xx
      status.
    - requests.exceptions.RequestException: for connection failures or
      timeouts raised by requests.get.
    """
    # Download the dataset; without a timeout requests waits indefinitely.
    response = requests.get(url, timeout=timeout)

    # Without this check an error page (404, 500, ...) would be silently
    # lowercased, split and returned as if it were the corpus.
    response.raise_for_status()
    text = response.text

    # Preprocess the text: lowercase and remove punctuation.
    text = text.lower()
    # Drops every character that is neither a word character nor whitespace.
    # Apostrophes are removed too ("o'er" -> "oer"); underscores are kept
    # because \w matches them.
    text = re.sub(r'[^\w\s]', '', text)

    # Split on any run of whitespace; an empty document yields [].
    words = text.split()

    return words

def get_top_n_words(words, n):
    """
    Ranks words by how often they occur and keeps the first 'n'.

    Parameters:
    - words: Iterable of words to tally.
    - n: How many of the highest-ranked entries to return.

    Returns:
    - List of (word, count) tuples, highest count first.
    """
    # Tally every occurrence into a fresh counter
    frequencies = Counter()
    frequencies.update(words)

    # Let the counter do the ranking and truncation
    ranked = frequencies.most_common(n)
    return ranked

# Example usage
# Single source of truth for how many words to report, so the printed label
# can never disagree with the number actually requested.
TOP_N = 10

words = load_and_preprocess_shakespeare()  # Load and preprocess the dataset
top_n_words = get_top_n_words(words, n=TOP_N)  # Find the TOP_N most frequent words
print(f"Top {TOP_N} most frequent words:", top_n_words)

Top 10 most frequent words: [('the', 6283), ('and', 5680), ('to', 4766), ('i', 4653), ('of', 3757), ('you', 3142), ('my', 3118), ('a', 2987), ('that', 2569), ('in', 2362)]
