In [46]:
import pandas as pd
import re
import nltk
import time
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

# textstat is an optional dependency: when it cannot be imported, warn and bind
# the name to None so the rest of the file can test for its presence.
try:
    import textstat
except ImportError:
    print("Warning: 'textstat' library not found. Please install using 'pip install textstat'")
    textstat = None

# Fetch the NLTK resources used below (tokenizer models, stop-word list,
# WordNet data). The packages are requested in a fixed order.
print("Downloading NLTK data...")
for _nltk_package in ('punkt', 'stopwords', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(_nltk_package)


def calculate_readability(text):
    """
    Final Step: Check readability score using Flesch-Kincaid Grade Level.

    The score is meant to be computed on the RAW text (before punctuation is
    removed) so that the sentence count stays accurate.

    Args:
        text: The raw review text.

    Returns:
        float: The Flesch-Kincaid grade reported by ``textstat``.
        Returns ``nan`` when ``text`` is not a string (e.g. the NaN pandas
        produces for a missing cell) so that such rows are skipped by
        aggregations like ``Series.mean()`` instead of raising.
        Returns ``0.0`` when the optional ``textstat`` library is unavailable.
    """
    # Guard first: missing cells arrive as float NaN / None, not as text.
    if not isinstance(text, str):
        return float('nan')

    if textstat is not None:
        return textstat.flesch_kincaid_grade(text)
    return 0.0

# Patterns are compiled once instead of being looked up on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')
_NON_ALPHA_RE = re.compile(r'[^a-zA-Z\s]')

# Lazily-filled cache for the NLTK resources, so the stop-word corpus is read
# and the lemmatizer is built once rather than once per review.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the cached (stop_words, lemmatizer) pair, building it on first use."""
    if not _NLP_RESOURCES:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Publish both entries together so a failure above leaves the cache empty.
        _NLP_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def nlp_pipeline(text):
    """
    Complete NLP Pipeline following Slide Page 42 & 50:
    1. Basic Cleaning (HTML)
    2. Sentence Tokenization
    3. Remove Special Chars (Regex)
    4. Word Tokenization
    5. Lowercasing
    6. Stop word removal
    7. Lemmatization

    Args:
        text: The raw review text.

    Returns:
        list[str]: Lower-cased, lemmatized tokens with English stop words
        removed, in their original order. An empty list is returned when
        ``text`` is empty or is not a string (e.g. a missing cell).
    """
    # Missing cells (None / NaN) and empty strings have nothing to tokenize.
    if not isinstance(text, str) or not text:
        return []

    # Replace tags with a space so words on either side of a tag stay separate.
    text = _HTML_TAG_RE.sub(' ', text)

    stop_words, lemmatizer = _get_nlp_resources()

    processed_tokens = []
    for sentence in sent_tokenize(text):
        # Delete every character that is neither an ASCII letter nor whitespace.
        sentence_clean = _NON_ALPHA_RE.sub('', sentence)

        for word in word_tokenize(sentence_clean):
            lower_word = word.lower()
            if lower_word in stop_words:
                continue

            lemma_word = lemmatizer.lemmatize(lower_word)
            if lemma_word:
                processed_tokens.append(lemma_word)

    return processed_tokens

def main():
    """
    Run the workshop end to end and print the report.

    Loads 'IMDB Dataset.csv', optionally scores each raw review with
    calculate_readability (only when textstat was imported), runs
    nlp_pipeline over every review, and prints the average grade, an
    explanation of the cleaning regex, a first-row sample, and the elapsed
    time. Returns early (None) after printing an error if the CSV file is
    not found.
    """
    narrow_rule = "=" * 50
    wide_rule = "=" * 60

    print("\n" + narrow_rule)
    print("STARTING WORKSHOP PIPELINE")
    print(narrow_rule)

    print("Loading FULL dataset...")
    try:
        df = pd.read_csv('IMDB Dataset.csv')
    except FileNotFoundError:
        print("Error: File 'IMDB Dataset.csv' not found. Please check the file path.")
        return
    else:
        print(f"Data loaded successfully: {len(df)} rows found.")

    print("Processing...")
    # The timer covers both the readability pass and the token pipeline.
    started = time.time()

    if not textstat:
        avg_grade = "N/A (Library not installed)"
    else:
        print("Calculating Readability Scores (Flesch-Kincaid)...")
        # Scored on the untouched review text, before any cleaning.
        df['readability_score'] = df['review'].apply(calculate_readability)
        avg_grade = df['readability_score'].mean()

    print("Running NLP Pipeline (Cleaning -> Tokenizing -> Lemmatizing)...")
    df['processed_tokens'] = df['review'].apply(nlp_pipeline)

    runtime = time.time() - started

    print("\n" + wide_rule)
    print("WORKSHOP REPORT (Week 4)")
    print(wide_rule)

    print(f"\n[1] Average Flesch-Kincaid Grade Level: {avg_grade}")
    print("    (Note: Calculated on original text. Normal range for general text is 8-12)")

    regex_section = (
        "\n[2] Regex Explanation (Step 1)",
        "    Regex Used: r'[^a-zA-Z\\s]'",
        "    Explanation:",
        "      1. [ ... ] : Defines a Character Set.",
        "      2. ^       : Inside brackets, it means 'Negation' (NOT).",
        "      3. a-zA-Z  : Matches all English alphabets (lowercase and uppercase).",
        "      4. \\s      : Matches whitespace characters (spaces, tabs, newlines).",
        "    Logic Translation:",
        '      "Find any single character that is NOT an alphabet AND NOT a space,',
        '       then replace it with an empty string (delete it)."',
    )
    for report_line in regex_section:
        print(report_line)

    print("\n[3] Sample Comparison (First Row)")
    first_review = df['review'].iloc[0]
    print(f"    Original Text : {first_review[:80]}...")
    first_tokens = df['processed_tokens'].iloc[0]
    print(f"    Final Tokens  : {first_tokens[:15]}")

    print("\n" + wide_rule)
    print(f"Total Processing Runtime: {runtime:.2f} seconds ({runtime/60:.2f} minutes)")
    print(wide_rule)

# Run the workshop only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

Downloading NLTK data...


[nltk_data] Downloading package punkt to C:\Users\Nitro
[nltk_data]     V15\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Nitro
[nltk_data]     V15\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Nitro
[nltk_data]     V15\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Nitro
[nltk_data]     V15\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Nitro
[nltk_data]     V15\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!



STARTING WORKSHOP PIPELINE
Loading FULL dataset...
Data loaded successfully: 50000 rows found.
Processing...
Calculating Readability Scores (Flesch-Kincaid)...
Running NLP Pipeline (Cleaning -> Tokenizing -> Lemmatizing)...

WORKSHOP REPORT (Week 4)

[1] Average Flesch-Kincaid Grade Level: 9.015685525721471
    (Note: Calculated on original text. Normal range for general text is 8-12)

[2] Regex Explanation (Step 1)
    Regex Used: r'[^a-zA-Z\s]'
    Explanation:
      1. [ ... ] : Defines a Character Set.
      2. ^       : Inside brackets, it means 'Negation' (NOT).
      3. a-zA-Z  : Matches all English alphabets (lowercase and uppercase).
      4. \s      : Matches whitespace characters (spaces, tabs, newlines).
    Logic Translation:
      "Find any single character that is NOT an alphabet AND NOT a space,
       then replace it with an empty string (delete it)."

[3] Sample Comparison (First Row)
    Original Text : One of the other reviewers has mentioned that after watching ju