# NLP Pipeline for Task Extraction and Categorization

This notebook demonstrates an NLP pipeline that extracts tasks from unstructured text and categorizes them using clustering and topic modeling. The pipeline includes:

1. **Preprocessing:** Clean and segment input text into sentences.
2. **Task Extraction:** Identify task sentences using heuristics and extract details such as performer and deadline.
3. **Clustering:** Compute sentence embeddings, then determine the optimal number of clusters using either the _Elbow Method_ or the _Silhouette Score_.
4. **Categorization:** Use LDA topic modeling on each cluster to derive a category label.

Run the cells below in order to execute the pipeline on the sample text (or upload your own file).

In [None]:
# Install required packages
!pip install spacy sklearn gensim matplotlib
!python -m spacy download en_core_web_md

In [None]:
# Import required libraries and load the spaCy model
import json
import re
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import spacy
from gensim import corpora, models
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from spacy.lang.en.stop_words import STOP_WORDS

# Load the medium-sized English spaCy model (with word vectors) once at module
# level. The functions below share this `nlp` object for sentence segmentation,
# named entities, lemmas, and document vectors.
nlp = spacy.load('en_core_web_md')

##############################################
# Utility Functions
##############################################

def load_text_from_file(filepath):
    """
    Read a UTF-8 text file and return its contents without surrounding whitespace.

    Args:
        filepath: Path of the file to read.

    Returns:
        The file contents as a string, with leading/trailing whitespace stripped.

    Raises:
        IOError: If the file cannot be opened, read, or decoded as UTF-8.
            The underlying exception is attached as ``__cause__``.
    """
    # Keep the try body limited to the I/O itself. OSError covers missing files
    # and permission problems; ValueError covers UTF-8 decoding failures
    # (UnicodeDecodeError) and invalid paths such as embedded NUL bytes.
    try:
        with open(filepath, "r", encoding="utf-8") as file:
            text = file.read()
    except (OSError, ValueError) as e:
        raise IOError(f"Error reading file '{filepath}': {e}") from e
    return text.strip()

def pretty_print_tasks(tasks, title="Tasks:"):
    """
    Print a heading followed by the task list rendered as indented JSON.

    A blank line is emitted before the title; the tasks are serialized with a
    four-space indent.
    """
    rendered = json.dumps(tasks, indent=4)
    print(f"\n{title}", rendered, sep="\n")

##############################################
# Preprocessing Functions
##############################################

def clean_text(text):
    """
    Return the input text with leading and trailing whitespace removed.
    """
    stripped = text.strip()
    return stripped

##############################################
# Task Identification and Extraction
##############################################

# Phrases that signal a task. Compiled once, with word boundaries, so that
# "must" does not fire on "mustard", "should" on "shoulder", or "please" on
# "pleased". Both straight and curly apostrophes are accepted in "don't".
_TASK_KEYWORD_RE = re.compile(
    r"\b(?:has to|need to|needs to|should|must|please|don['\u2019]t forget)\b"
)


def is_task_sentence(sentence):
    """
    Determine if a sentence likely represents a task.

    Uses two heuristics:
      1. The sentence starts with a base-form verb (imperative mood).
      2. The sentence contains a task-related keyword as a whole word/phrase
         (e.g., "has to", "please", "don't forget").

    Args:
        sentence: A spaCy sentence span (or any sequence of tokens exposing
            ``pos_``/``tag_`` that also has a ``text`` attribute).

    Returns:
        True if either heuristic matches, otherwise False. An empty span is
        never a task.
    """
    # Guard against empty spans so that indexing the first token cannot fail.
    if len(sentence) == 0:
        return False

    # Heuristic 1: the first token is a base-form verb.
    first_token = sentence[0]
    if first_token.pos_ == 'VERB' and first_token.tag_ == 'VB':
        return True

    # Heuristic 2: a task keyword appears as a whole word or phrase.
    sent_lower = sentence.text.strip().lower()
    return _TASK_KEYWORD_RE.search(sent_lower) is not None

def extract_deadline(sentence):
    """
    Build a deadline string from the sentence's temporal entities.

    The text of every entity labeled TIME or DATE is collected in order and
    joined with single spaces. Returns None when no such entity is present.
    """
    temporal_parts = [
        ent.text for ent in sentence.ents if ent.label_ in ("TIME", "DATE")
    ]
    return " ".join(temporal_parts) if temporal_parts else None

def extract_task_details(sentence):
    """
    Pull the key fields out of a task sentence.

    Returns a 3-tuple of:
      - the sentence text with surrounding whitespace removed,
      - the performer: text of the first PERSON entity, or None if there is none,
      - the deadline as produced by extract_deadline(), or None.
    """
    task_text = sentence.text.strip()

    # Lazily scan the entities and stop at the first PERSON.
    person_names = (ent.text for ent in sentence.ents if ent.label_ == "PERSON")
    performer = next(person_names, None)

    return task_text, performer, extract_deadline(sentence)

def process_text(text):
    """
    Turn raw text into a list of task records.

    The text is cleaned, parsed with the shared spaCy pipeline, and split into
    sentences. Each sentence that is_task_sentence() accepts becomes a dict
    with the keys "task", "performer", and "deadline".
    """
    field_names = ("task", "performer", "deadline")
    doc = nlp(clean_text(text))

    tasks = []
    for sent in doc.sents:
        if not is_task_sentence(sent):
            continue
        # extract_task_details returns (task_text, performer, deadline).
        tasks.append(dict(zip(field_names, extract_task_details(sent))))
    return tasks

##############################################
# Clustering and Categorization Functions
##############################################

def cluster_tasks(tasks, num_clusters):
    """
    Cluster tasks based on their sentence embeddings using KMeans.

    Each task's text is embedded with the shared spaCy pipeline
    (``doc.vector``) and the embeddings are clustered. A 'cluster' field
    (int) is added to every task dict in place.

    Args:
        tasks: List of task dicts, each with a "task" text entry.
        num_clusters: Number of clusters; must satisfy
            1 <= num_clusters <= len(tasks).

    Returns:
        A tuple ``(tasks, kmeans, X)``: the updated tasks, the fitted KMeans
        model, and the array of task vectors (one row per task).

    Raises:
        ValueError: If ``tasks`` is empty or ``num_clusters`` is outside the
            valid range.
    """
    # Validate up front so callers get a clear message instead of an obscure
    # failure from inside NumPy / scikit-learn.
    n_tasks = len(tasks)
    if n_tasks == 0:
        raise ValueError("cluster_tasks requires at least one task.")
    if not 1 <= num_clusters <= n_tasks:
        raise ValueError(
            f"num_clusters must be between 1 and {n_tasks} (the number of tasks); "
            f"got {num_clusters}."
        )

    X = np.array([nlp(task["task"]).vector for task in tasks])

    # Fixed random_state keeps cluster assignments reproducible across runs.
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(X)

    for task, label in zip(tasks, kmeans.labels_):
        task["cluster"] = int(label)  # plain int so the task stays JSON-serializable
    return tasks, kmeans, X

def _dominant_topic(lda_model, corpus, num_topics):
    """Return the id of the topic with the largest total weight over the corpus."""
    if num_topics <= 1:
        return 0
    totals = [0.0] * num_topics
    for bow in corpus:
        # minimum_probability=0.0 asks for (near-)all topics, not only the likely ones.
        for topic_id, weight in lda_model.get_document_topics(bow, minimum_probability=0.0):
            totals[topic_id] += float(weight)
    return max(range(num_topics), key=totals.__getitem__)


def label_clusters_with_lda(tasks, num_topics=1):
    """
    For each cluster, perform LDA topic modeling on the task texts to derive
    a category label from the dominant topic. Adds a 'category' field to each task.

    Args:
        tasks: List of task dicts that already carry "task" (text) and
            "cluster" (cluster id) entries.
        num_topics: Number of LDA topics fitted per cluster. With more than
            one topic, the label comes from the topic with the largest total
            weight across the cluster's sentences.

    Returns:
        A tuple ``(tasks, cluster_labels)`` where ``cluster_labels`` maps each
        cluster id to its label: the top three terms of the dominant topic
        joined by spaces, or "General" when a cluster has no usable tokens.
    """
    # Group task texts by cluster.
    cluster_tasks_dict = {}
    for task in tasks:
        cluster_tasks_dict.setdefault(task["cluster"], []).append(task["task"])

    cluster_labels = {}
    for cluster, sentences in cluster_tasks_dict.items():
        # Tokenize each sentence into lowercase lemmas, dropping stop words
        # and non-alphabetic tokens.
        texts = []
        for sentence in sentences:
            tokens = [
                token.lemma_.lower()
                for token in nlp(sentence)
                if token.is_alpha and token.text.lower() not in STOP_WORDS
            ]
            texts.append(tokens)

        dictionary = corpora.Dictionary(texts)
        if len(dictionary) == 0:
            # Nothing left to model after filtering; fall back to a generic label.
            cluster_labels[cluster] = "General"
            continue

        corpus = [dictionary.doc2bow(text) for text in texts]
        lda_model = models.LdaModel(
            corpus,
            num_topics=num_topics,
            id2word=dictionary,
            passes=10,
            random_state=42,  # reproducible topics
        )

        # Topic 0 is only guaranteed to be the dominant one when num_topics == 1.
        topic_id = _dominant_topic(lda_model, corpus, num_topics)
        topic_terms = lda_model.show_topic(topic_id, topn=3)
        cluster_labels[cluster] = " ".join(word for word, _prob in topic_terms)

    for task in tasks:
        task["category"] = cluster_labels[task["cluster"]]
    return tasks, cluster_labels

def determine_optimal_clusters_elbow(task_vectors, min_clusters=2, max_clusters=10):
    """
    Use the Elbow Method to display a plot of inertia for different cluster counts.

    Fits KMeans for every k in [min_clusters, max_clusters], plots inertia
    against k, and prints the inertia values. The range is clamped so that
    1 <= k <= number of samples.

    Args:
        task_vectors: Array of task embeddings, one row per task.
        min_clusters: Smallest cluster count to try.
        max_clusters: Largest cluster count to try.

    Returns:
        None. The results are shown as a plot and printed; if there are no
        samples, a warning is printed and nothing is plotted.
    """
    n_samples = task_vectors.shape[0]

    # KMeans cannot be fitted without data; bail out instead of crashing.
    if n_samples == 0:
        print("Warning: No samples provided; skipping the elbow analysis.")
        return

    if n_samples < min_clusters:
        print(f"Warning: Number of samples ({n_samples}) is less than the minimum clusters ({min_clusters}).")
        min_clusters = n_samples
    if n_samples < max_clusters:
        max_clusters = n_samples
    # KMeans needs at least one cluster.
    min_clusters = max(1, min_clusters)

    inertias = []
    cluster_range = range(min_clusters, max_clusters + 1)
    for k in cluster_range:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(task_vectors)
        inertias.append(kmeans.inertia_)

    plt.figure(figsize=(8, 4))
    plt.plot(list(cluster_range), inertias, marker="o")
    plt.title("Elbow Method For Optimal Clusters")
    plt.xlabel("Number of clusters")
    plt.ylabel("Inertia")
    plt.xticks(list(cluster_range))
    plt.grid(True)
    plt.show()

    print("Elbow Method Inertia Values:")
    for k, inertia in zip(cluster_range, inertias):
        print(f"Clusters: {k}, Inertia: {inertia}")

def determine_optimal_clusters_silhouette(task_vectors, min_clusters=2, max_clusters=10):
    """
    Compute the average silhouette score for different numbers of clusters
    and return the optimal number of clusters (with the highest score).

    The silhouette score is only defined for 2 <= k <= n_samples - 1, so the
    requested range is clamped to that interval. When no valid k remains
    (for example with fewer than three samples), the function falls back to
    a single cluster instead of failing.

    Args:
        task_vectors: Array of task embeddings, one row per task.
        min_clusters: Smallest cluster count to try.
        max_clusters: Largest cluster count to try.

    Returns:
        The cluster count (int) with the highest silhouette score, or 1 when
        no cluster count could be scored.

    Raises:
        ValueError: If ``task_vectors`` contains no samples.
    """
    n_samples = task_vectors.shape[0]
    if n_samples == 0:
        raise ValueError("Cannot determine the number of clusters: no samples were provided.")

    # Clamp the search range to where the silhouette score is defined.
    max_clusters = min(max_clusters, n_samples - 1)
    min_clusters = max(2, min(min_clusters, n_samples - 1))

    candidate_ks = []
    silhouette_scores = []
    for k in range(min_clusters, max_clusters + 1):
        kmeans = KMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(task_vectors)
        # Duplicate vectors can collapse into a single cluster, for which the
        # silhouette score is undefined; skip such candidates.
        if np.unique(labels).size < 2:
            print(f"Clusters: {k}, Silhouette Score: undefined (fewer than 2 distinct clusters)")
            continue
        score = silhouette_score(task_vectors, labels)
        candidate_ks.append(k)
        silhouette_scores.append(score)
        print(f"Clusters: {k}, Silhouette Score: {score:.3f}")

    if not candidate_ks:
        print(
            f"\nNo cluster count could be scored for {n_samples} sample(s); "
            "defaulting to 1 cluster."
        )
        return 1

    # argmax returns the first maximum, so ties resolve to the smallest k.
    optimal_k = candidate_ks[int(np.argmax(silhouette_scores))]
    print(f"\nOptimal number of clusters based on silhouette score: {optimal_k}")
    return optimal_k


## Running the Pipeline

In the cell below, we define the input text (or you can upload a file) and run the complete pipeline:

1. **Task Extraction:** Process the input text to extract tasks.
2. **Vectorization:** Compute sentence embeddings for each extracted task.
3. **Optimal Cluster Determination:** Use the Silhouette Score (or Elbow Method) to choose the number of clusters.
4. **Clustering and Categorization:** Cluster the tasks and assign category labels using LDA.
5. **Output:** Display the extracted tasks along with their cluster and category information.


In [None]:
# Sample input for the pipeline (in Google Colab you can upload a file instead; see below)
input_text = """
Rahul wakes up early every day. He goes to college in the morning and comes back at 3 pm. At present, Rahul is outside. He has to buy the snacks for all of us. Please review the document by tomorrow. John should clean the room by 5 pm today. Don't forget to call Sarah. Mark must submit the report by next Monday.
"""

# Optional: uncomment to read the input from an uploaded file in Google Colab
# from google.colab import files
# uploaded = files.upload()
# for filename in uploaded.keys():
#     input_text = load_text_from_file(filename)

# Step 1: extract the task sentences and show them
tasks = process_text(input_text)
pretty_print_tasks(tasks, title="Extracted Tasks:")

if tasks:
    # Step 2: embed every task sentence with the shared spaCy pipeline
    task_vectors = []
    for task in tasks:
        doc = nlp(task["task"])
        task_vectors.append(doc.vector)
    task_vectors = np.array(task_vectors)

    # Step 3: pick the number of clusters; set method to "elbow" for the manual approach
    method = "silhouette"  # Change to "elbow" if desired

    if method != "elbow":
        print("\nDetermining the optimal number of clusters using the Silhouette Score...")
        num_clusters = determine_optimal_clusters_silhouette(task_vectors, min_clusters=2, max_clusters=10)
    else:
        print("\nDetermining the optimal number of clusters using the Elbow Method...")
        determine_optimal_clusters_elbow(task_vectors, min_clusters=2, max_clusters=10)
        # The elbow plot is read by eye, so the cluster count is typed in by hand
        num_clusters = int(input("Based on the elbow plot, enter the desired number of clusters: "))

    # Step 4: cluster the tasks, then derive a category label per cluster via LDA
    tasks, kmeans, _ = cluster_tasks(tasks, num_clusters)
    tasks, cluster_labels = label_clusters_with_lda(tasks, num_topics=1)

    # Step 5: show the final, categorized tasks
    pretty_print_tasks(tasks, title="Tasks with Categories:")
else:
    print("No tasks found in the input.")
