# Task Extraction and Categorization using NLP

This notebook processes text to extract and categorize task-related information using Natural Language Processing (NLP) techniques.

### Steps Included:
1. **Preprocessing** - Cleaning text by lowercasing it and removing punctuation and stop words.
2. **Tokenization & POS Tagging** - Splitting text into sentences and tagging parts of speech.
3. **Task Identification** - Extracting task-related sentences.
4. **Task Categorization** - Grouping extracted tasks using NLP models.
5. **Extracting Task Details** - Identifying responsible persons and deadlines.

In [None]:
import re
import nltk
import string
import gensim
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from gensim.models import Word2Vec
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

# Download the NLTK data packages this notebook requests: tokenizer data,
# the English stop-word list, and POS-tagger data.
# NOTE(review): 'punkt'/'punkt_tab' and 'averaged_perceptron_tagger'/
# 'averaged_perceptron_tagger_eng' look like older and newer names for the
# same tokenizer and tagger resources — presumably both are requested so the
# notebook works across NLTK versions; confirm against the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [None]:
"""
This function cleans the input text by:
1. Converting it to lowercase.
2. Removing punctuation.
3. Removing stop words.
"""

def preprocess_text(text):
    """Lowercase the text, strip punctuation, and drop English stop words.

    Args:
        text: Raw input string.

    Returns:
        A ``(cleaned_text, words)`` tuple. ``words`` is the token list of the
        lowercased, punctuation-free text (stop words still included);
        ``cleaned_text`` is those tokens with stop words removed, joined by
        single spaces.
    """
    text = text.lower()
    # Delete punctuation with a translation table instead of an unescaped
    # regex character class: inside "[...]" the "\]" pair that occurs in
    # string.punctuation is parsed as an escaped "]", so backslashes were
    # never part of the class and survived the cleaning step.
    text = text.translate(str.maketrans("", "", string.punctuation))
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    cleaned_text = " ".join(word for word in words if word not in stop_words)
    return cleaned_text, words

In [None]:
def tokenize_and_tag(text):
    """Return, for each sentence in ``text``, its list of (token, POS tag) pairs."""
    tagged = []
    for sent in sent_tokenize(text):
        # Tokenize the sentence into words, then tag each token.
        tokens = word_tokenize(sent)
        tagged.append(pos_tag(tokens))
    return tagged

In [None]:
def identify_tasks(text):
    """Return the sentences of ``text`` that look like tasks.

    A sentence is kept when it contains an obligation cue ("must", "should",
    or "has"/"have" directly followed by "to") or a base-form verb (POS tag
    ``VB``), which covers imperatives and infinitives such as "to buy".

    Args:
        text: Raw input text; it is split into sentences with ``sent_tokenize``.

    Returns:
        List of the matching sentences, in their original order.
    """
    modal_cues = {"must", "should"}
    have_forms = {"has", "have"}
    task_sentences = []

    for sentence in sent_tokenize(text):
        words = word_tokenize(sentence)
        # Compare cues case-insensitively so "Must ..." / "Should ..." match too.
        lowered = [word.lower() for word in words]

        has_modal = any(word in modal_cues for word in lowered)
        has_have_to = any(
            current in have_forms and following == "to"
            for current, following in zip(lowered, lowered[1:])
        )
        # Only base-form verbs count. Matching every tag that starts with "VB"
        # (VBZ, VBD, ...) flagged almost any sentence containing a verb, e.g.
        # plain statements like "Rahul wakes up early every day."
        has_base_verb = any(tag == "VB" for _, tag in pos_tag(words))

        if has_modal or has_have_to or has_base_verb:
            task_sentences.append(sentence)

    return task_sentences

In [None]:
def categorize_tasks(task_sentences, num_topics=3, passes=10, random_state=42):
    """Assign each task sentence to its dominant LDA topic.

    Args:
        task_sentences: List of task sentences (strings).
        num_topics: Number of LDA topics, i.e. the number of categories.
        passes: Number of training passes over the corpus.
        random_state: Seed passed to ``LdaModel`` so that repeated runs give
            the same categories.

    Returns:
        Dict mapping each sentence to a label of the form ``"Category N"``
        (1-based topic number). Empty when there is nothing to categorize.
    """
    # LDA cannot be trained on an empty collection, so return early.
    if not task_sentences:
        return {}

    tokenized_tasks = [word_tokenize(sentence) for sentence in task_sentences]

    # Note: a Word2Vec model used to be trained here, but its result was never
    # read, so it is no longer built.
    dictionary = Dictionary(tokenized_tasks)
    if len(dictionary) == 0:
        # No tokens at all (e.g. only empty strings): nothing to model.
        return {}

    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_tasks]
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )

    categorized_tasks = {}
    # Reuse the bag-of-words vectors built above instead of re-tokenizing.
    for sentence, bow_vector in zip(task_sentences, corpus):
        # Ask for all topics so the list is never empty, even when every
        # probability falls below the model's default reporting threshold.
        topic_scores = lda_model.get_document_topics(bow_vector, minimum_probability=0.0)
        topic = max(topic_scores, key=lambda pair: pair[1])[0]  # dominant topic
        categorized_tasks[sentence] = f"Category {topic+1}"

    return categorized_tasks

In [None]:
"""
This function extracts details from task-related sentences, such as:
1. Responsible person.
2. Deadline.
"""
def _is_punctuation_token(token):
    """Return True when ``token`` consists only of punctuation characters."""
    return all(char in string.punctuation for char in token)


def extract_task_details(task_sentences):
    """Extract the responsible person and the deadline from each task sentence.

    Heuristics:
        * Deadline: up to two non-punctuation tokens following the last
          occurrence of "by", "before" or "on" (matched case-insensitively).
        * Person: the first run of consecutive proper nouns (POS tag ``NNP``)
          that is not part of a deadline phrase.

    Args:
        task_sentences: List of task sentences (strings).

    Returns:
        List of dicts with the keys ``"task"``, ``"person"`` and
        ``"deadline"``. ``"person"`` falls back to ``"Unknown"`` and
        ``"deadline"`` to ``"No deadline specified"``.
    """
    deadline_cues = {"by", "before", "on"}
    task_details = []

    for sentence in task_sentences:
        words = word_tokenize(sentence)
        pos_tags = pos_tag(words)

        # Find deadline phrases first so that proper nouns inside them
        # (e.g. "Friday" in "by Friday") are not mistaken for a person.
        deadline = None
        deadline_positions = set()
        for i, word in enumerate(words):
            if word.lower() in deadline_cues and i + 1 < len(words):
                span = range(i + 1, min(i + 3, len(words)))
                deadline_positions.update(span)
                # Drop punctuation-only tokens so a sentence-final "." is not
                # glued onto the deadline ("Friday ." -> "Friday").
                phrase = [words[j] for j in span if not _is_punctuation_token(words[j])]
                if phrase:
                    deadline = " ".join(phrase)  # a later cue overrides an earlier one

        # Take the first run of proper nouns rather than the last single one,
        # so a later proper noun cannot overwrite the name found earlier and
        # multi-word names stay together.
        name_tokens = []
        for i, (word, tag) in enumerate(pos_tags):
            if tag == "NNP" and i not in deadline_positions:
                name_tokens.append(word)
            elif name_tokens:
                break
        person = " ".join(name_tokens) if name_tokens else None

        task_details.append({
            "task": sentence,
            "person": person if person else "Unknown",
            "deadline": deadline if deadline else "No deadline specified"
        })

    return task_details

In [None]:
# Run the whole pipeline on user-supplied text: preprocess it, POS-tag its
# sentences, identify task sentences, categorize them, and extract the
# responsible person and deadline for each task.
user_input = input("Enter text: ")
cleaned_text, words = preprocess_text(user_input)
tagged_sentences = tokenize_and_tag(user_input)
task_sentences = identify_tasks(user_input)
# The categorization models cannot be trained on an empty list, so skip that
# step when no task sentence was found.
categorized_tasks = categorize_tasks(task_sentences) if task_sentences else {}
task_details = extract_task_details(task_sentences)

# Output structured results
print("\n--- Cleaned Text ---\n", cleaned_text)
print("\n--- POS Tagged Sentences ---\n", tagged_sentences)
print("\n--- Identified Tasks ---\n", task_sentences)
print("\n--- Categorized Tasks ---")
for task, category in categorized_tasks.items():
    print(f"{task}: {category}")

print("\n--- Structured Task List ---")
for task in task_details:
    print(f"Task: {task['task']}, Assigned To: {task['person']}, Deadline: {task['deadline']}")

Enter text: Rahul wakes up early every day. He goes to college in the morning and comes back at 3 pm. At present, Rahul is outside. He has to buy the snacks for all of us.

--- Cleaned Text ---
 rahul wakes early every day goes college morning comes back 3 pm present rahul outside buy snacks us

--- POS Tagged Sentences ---
 [[('Rahul', 'NNP'), ('wakes', 'VBZ'), ('up', 'RP'), ('early', 'JJ'), ('every', 'DT'), ('day', 'NN'), ('.', '.')], [('He', 'PRP'), ('goes', 'VBZ'), ('to', 'TO'), ('college', 'NN'), ('in', 'IN'), ('the', 'DT'), ('morning', 'NN'), ('and', 'CC'), ('comes', 'VBZ'), ('back', 'RB'), ('at', 'IN'), ('3', 'CD'), ('pm', 'NN'), ('.', '.')], [('At', 'IN'), ('present', 'JJ'), (',', ','), ('Rahul', 'NNP'), ('is', 'VBZ'), ('outside', 'JJ'), ('.', '.')], [('He', 'PRP'), ('has', 'VBZ'), ('to', 'TO'), ('buy', 'VB'), ('the', 'DT'), ('snacks', 'NNS'), ('for', 'IN'), ('all', 'DT'), ('of', 'IN'), ('us', 'PRP'), ('.', '.')]]

--- Identified Tasks ---
 ['Rahul wakes up early every day.', '