In [None]:
!pip install nltk



In [None]:
import nltk
import heapq
import string
from collections import Counter
import os # For checking paths if needed

# --- Robust NLTK Resource Download and Verification ---
def download_and_verify_nltk_resources():
    """Ensure every NLTK resource this script needs can be located.

    Each resource is first looked up with ``nltk.data.find()``. When that
    lookup fails, ``nltk.download()`` is attempted and the lookup is repeated,
    so a download that reports completion but leaves nothing findable is
    still flagged. Progress and diagnostics are printed along the way.

    Returns:
        bool: True when every resource was found (either up front or after a
        download); False when at least one could not be made available.
    """
    # (identifier for nltk.download(), path for nltk.data.find()) pairs,
    # processed in this order.
    required_resources = (
        ('punkt', 'tokenizers/punkt'),
        ('stopwords', 'corpora/stopwords'),
        # Added in response to a lookup error seen at runtime (per the
        # original author's note); presumably needed by the tokenizers.
        ('punkt_tab', 'tokenizers/punkt_tab'),
    )

    def _is_locatable(path_suffix):
        """Return True when nltk.data.find() can resolve *path_suffix*."""
        try:
            nltk.data.find(path_suffix)
        except LookupError:
            return False
        return True

    everything_ok = True

    print("--- Checking and Downloading NLTK Resources ---")

    for resource_name, resource_path_suffix in required_resources:
        if _is_locatable(resource_path_suffix):
            print(f"NLTK resource '{resource_name}' (path: '{resource_path_suffix}') already found.")
            continue

        print(f"NLTK resource '{resource_name}' (path: '{resource_path_suffix}') not found. Attempting download...")
        try:
            # quiet=False keeps the downloader's own progress output visible.
            if not nltk.download(resource_name, quiet=False):
                print(f"nltk.download('{resource_name}') returned False or failed. Resource may not be available.")
                everything_ok = False
                continue

            print(f"nltk.download('{resource_name}') reported completion.")
            # Re-check immediately: a reported success does not guarantee the
            # resource is actually findable on NLTK's search path.
            if _is_locatable(resource_path_suffix):
                print(f"SUCCESS: '{resource_name}' downloaded and verified by nltk.data.find().")
            else:
                print(f"CRITICAL ERROR: '{resource_name}' download reported completion, BUT nltk.data.find() STILL CANNOT LOCATE IT.")
                print(f"NLTK is searching for '{resource_path_suffix}' in these paths: {nltk.data.path}")
                print("This can happen if download was incomplete or NLTK's path list is not updated.")
                everything_ok = False
        except Exception as e:
            # Any failure while downloading or re-checking only marks this
            # resource as unavailable; the remaining resources are still tried.
            print(f"An exception occurred during download or verification of '{resource_name}': {e}")
            everything_ok = False

    if not everything_ok:
        print("--- CRITICAL FAILURE: One or more NLTK resources could not be made available. ---")
        print("The script cannot proceed reliably.")
        print("Recommendations:")
        print("1. Runtime > Restart Runtime. Then run this cell again.")
        print("2. Check your internet connection.")
        print(f"NLTK's current data search paths: {nltk.data.path}")
    else:
        print("--- All necessary NLTK resources seem to be available. ---")
    return everything_ok

# --- Call the download and verification function at the start ---
# Runs as soon as this cell executes. The resulting boolean is read by
# generate_overview(), extract_keywords(), and the main execution section,
# all of which refuse to do NLTK work when it is False.
NLTK_RESOURCES_READY = download_and_verify_nltk_resources()


# --- Functions --- (definitions only; the main section calls them when NLTK_RESOURCES_READY is true)

def get_paragraph_colab():
    """Print usage instructions, then read one line of text from the user.

    Returns:
        str: Whatever the user typed at the prompt (empty string if they
        just pressed Enter).
    """
    instructions = (
        "\nPlease paste or type your paragraph below. When finished, press Enter.",
        "If pasting multiple lines, Colab's input() will handle it as one line break.",
        "To submit, just press Enter after your text.",
    )
    for line in instructions:
        print(line)
    return input("Enter your paragraph: ")

def generate_overview(text, num_sentences=3):
    """Build an extractive summary from the highest-scoring sentences.

    A sentence's score is the mean corpus frequency of its content words
    (lower-cased alphanumeric tokens that are not English stop words). The
    top ``num_sentences`` sentences are returned in their original order.

    Args:
        text: The paragraph to summarise.
        num_sentences: How many sentences the summary should contain.

    Returns:
        str: The summary; the unchanged ``text`` when it has no more than
        ``num_sentences`` sentences; or an explanatory message when NLTK
        resources are unavailable, the text has fewer than 10
        whitespace-separated words, or no content words remain.
    """
    if not NLTK_RESOURCES_READY:
        return "NLTK resources not ready. Cannot generate overview."
    if not text or len(text.split()) < 10:
        return "Input text is too short for a meaningful overview."

    sentences = nltk.sent_tokenize(text)  # needs the 'punkt' tokenizer data
    if len(sentences) <= num_sentences:
        return text

    ignored = set(nltk.corpus.stopwords.words('english'))  # needs 'stopwords'
    ignored.update(string.punctuation)

    def content_words(chunk):
        """Lower-cased alphanumeric tokens of *chunk* that are not ignored."""
        kept = []
        for token in nltk.word_tokenize(chunk):
            lowered = token.lower()
            if lowered not in ignored and token.isalnum():
                kept.append(lowered)
        return kept

    frequencies = Counter(content_words(text))
    if not frequencies:
        return "No content words found after removing stop words and punctuation."

    # Keyed by sentence text, so repeated sentences share a single entry.
    scores = {}
    for sentence in sentences:
        tokens = content_words(sentence)
        total = sum(frequencies[token] for token in tokens)
        # Normalise by length so long sentences are not favoured automatically.
        scores[sentence] = total / len(tokens) if tokens else 0

    best = heapq.nlargest(num_sentences, scores, key=scores.get)
    # Restore document order so the summary reads naturally.
    best.sort(key=sentences.index)
    return " ".join(best)

def extract_keywords(text, num_keywords=5):
    """Return the most frequent content words of ``text``.

    Content words are lower-cased alphanumeric tokens that are neither
    English stop words nor punctuation.

    Args:
        text: The text to analyse.
        num_keywords: Maximum number of keywords to return.

    Returns:
        list[str]: Keywords ordered from most to least frequent; an empty
        list when NLTK resources are unavailable, ``text`` is empty, or no
        content words are found.
    """
    if not (NLTK_RESOURCES_READY and text):
        return []

    excluded = set(nltk.corpus.stopwords.words('english')) | set(string.punctuation)  # needs 'stopwords'

    counts = Counter()
    for token in nltk.word_tokenize(text):  # needs the 'punkt' tokenizer data
        lowered = token.lower()
        if token.isalnum() and lowered not in excluded:
            counts[lowered] += 1

    # most_common() on an empty Counter yields [], covering the no-words case.
    return [word for word, _ in counts.most_common(num_keywords)]

# --- Main Execution ---
# Interactive driver: read a paragraph, then print its overview and keywords.
# Nothing is attempted when the NLTK resource check at the top of the cell failed.
if not NLTK_RESOURCES_READY:
    print("\nScript cannot proceed because NLTK resources were not properly downloaded or verified.")
    print("Please review the download messages above.")
    print("Try: 1. Check internet. 2. Runtime > Restart Runtime. 3. Run the cell again.")
else:
    paragraph = get_paragraph_colab()

    if paragraph:
        print("\n--- Processing ---")
        try:
            # Compute both results before printing anything, so a failure in
            # either leaves no partial report on screen.
            overview_text = generate_overview(paragraph, num_sentences=3)
            keyword_list = extract_keywords(paragraph, num_keywords=7)

            print("\n--- Overview ---")
            print(overview_text)

            print("\n--- Keywords ---")
            print(
                ", ".join(keyword_list)
                if keyword_list
                else "Could not extract keywords (perhaps the text was too short or only contained stop words)."
            )

        except LookupError as le:
            # A resource passed the up-front check but was still missing at use time.
            print(f"\nUNEXPECTED NLTK LOOKUP ERROR DURING PROCESSING: {le}")
            print("This indicates a required NLTK resource was still not found, despite earlier checks.")
            print("This is highly unusual if the initial checks passed and all listed resources were verified.")
            print(f"NLTK search paths for data: {nltk.data.path}")
            print("Please try restarting the Colab Runtime (Runtime > Restart Runtime) and run the cell again.")
            print("If the error persists for a specific resource (e.g., 'punkt_tab'), ensure it was listed in 'resources_to_check' at the top of the script.")
        except Exception as e:
            # Top-level boundary of the cell: report the problem instead of a traceback.
            print(f"\nAN UNEXPECTED ERROR OCCURRED DURING PROCESSING: {e}")
    else:
        print("\nNo input received. Exiting.")

--- Checking and Downloading NLTK Resources ---
NLTK resource 'punkt' (path: 'tokenizers/punkt') already found.
NLTK resource 'stopwords' (path: 'corpora/stopwords') already found.
NLTK resource 'punkt_tab' (path: 'tokenizers/punkt_tab') already found.
--- All necessary NLTK resources seem to be available. ---

Please paste or type your paragraph below. When finished, press Enter.
If pasting multiple lines, Colab's input() will handle it as one line break.
To submit, just press Enter after your text.
Enter your paragraph: The Spark SQL Engine is a core component that drives DataFrames and DataSets, with an emphasis on being SQL-first. It is responsible for connecting to the Apache Hive metastore to manage tables and for reading and writing data using various formats, converting them into tables. Spark SQL also provides a bridge to SQL data via JDBC/ODBC. Crucially, it generates optimized query plans and compact bytecode.

--- Processing ---

--- Overview ---
The Spark SQL Engine is a c