In [21]:
# Importing all necessary libraries and packages
import re
import nltk
import spacy
import gensim
import string
import numpy as np
from gensim.models import Word2Vec
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords

In [23]:
# Download the NLTK resources used by the cells below.
# nltk.download() skips packages that are already up to date, so this cell
# is safe to re-run.
# Tokenizer data for sent_tokenize / word_tokenize.
nltk.download('punkt')
# Stop-word lists read by stopwords.words('english') in preprocess_text().
nltk.download('stopwords')
# Tokenizer tables; presumably what newer NLTK releases load in place of
# 'punkt' -- verify against the installed NLTK version.
nltk.download('punkt_tab')
# English model for pos_tag(), used by extract_tasks().
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pavan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pavan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\pavan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\pavan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [25]:
# Load spaCy's small English pipeline; extract_entities() uses it for
# Named Entity Recognition (NER).
# NOTE(review): assumes the "en_core_web_sm" model package is installed in
# this environment -- confirm before running on a fresh kernel.
nlp = spacy.load("en_core_web_sm")

In [27]:
# Function to preprocess the text
def preprocess_text(text):
    """Normalise raw text into a space-separated string of content words.

    Steps, in order: lowercase, drop digit runs, drop ASCII punctuation
    (``string.punctuation``), tokenize with NLTK, and remove English
    stop words.

    Args:
        text: The raw input string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation

    # Load the stop-word list once, as a set. Calling stopwords.words()
    # inside the filter would rebuild the whole list for every token and
    # then scan it linearly.
    stop_words = set(stopwords.words('english'))

    return " ".join(word for word in word_tokenize(text) if word not in stop_words)

In [29]:
# Function to identify tasks in the given text
def extract_tasks(text):
    """Return the sentences of ``text`` that look like tasks.

    A sentence is treated as a task when either heuristic holds:

    1. It contains an obligation phrase ('must', 'should', 'has to',
       'needs to', 'required to') as whole words, case-insensitively.
    2. Its first token is tagged as a base-form / present-tense verb
       ('VB' or 'VBP'), i.e. it reads like an imperative.

    Args:
        text: The raw (unprocessed) input string; sentence punctuation is
            needed for sentence splitting.

    Returns:
        A list of task sentences in document order. Each sentence appears
        at most once and has surrounding whitespace stripped.
    """
    # Word boundaries stop 'must' matching 'mustard' or 'should' matching
    # 'shoulder'; \s+ lets multi-word phrases span a line break.
    obligation = re.compile(
        r'\b(?:must|should|has\s+to|needs\s+to|required\s+to)\b',
        re.IGNORECASE,
    )

    tasks = []
    for sent in sent_tokenize(text):
        # Strip leading/trailing whitespace so newlines from the source
        # text do not leak into the output.
        sent = sent.strip()
        if not sent:
            continue

        # Heuristic 1: obligation phrase. Skip heuristic 2 on a match so a
        # sentence satisfying both is not appended twice.
        if obligation.search(sent):
            tasks.append(sent)
            continue

        # Heuristic 2: sentence starts with a verb in imperative form.
        pos_tags = pos_tag(word_tokenize(sent))
        if pos_tags and pos_tags[0][1] in ('VB', 'VBP'):
            tasks.append(sent)

    return tasks

In [31]:
# Function to extract entities from a task sentence: who has to do the task and the deadline
def extract_entities(sentence):
    """Extract the assignee and deadline from a task sentence using spaCy NER.

    Args:
        sentence: A single task sentence.

    Returns:
        A ``(person, deadline)`` tuple of strings. ``person`` is the text of
        the first PERSON entity in the sentence; ``deadline`` is the text of
        the last DATE or TIME entity. Either element is ``None`` when no
        matching entity is found.
    """
    doc = nlp(sentence)
    person = None
    deadline = None

    for ent in doc.ents:
        if ent.label_ == "PERSON":
            # Keep the first person mentioned. In "Aisha should email
            # Michael", the assignee is the subject (Aisha); letting later
            # matches overwrite it would assign the task to Michael.
            if person is None:
                person = ent.text
        elif ent.label_ in ("TIME", "DATE"):
            # Later date/time mentions overwrite earlier ones.
            deadline = ent.text

    return person, deadline

In [33]:
# Function to categorize tasks using Topic Modeling
def categorize_tasks(tasks):
    """Assign each task sentence to one of three categories via LDA.

    The tasks are vectorized with a bag-of-words ``CountVectorizer``, a
    3-topic ``LatentDirichletAllocation`` model is fitted on them, and each
    task is labelled with the name mapped to its highest-weight topic.

    Args:
        tasks: A sequence of task sentences (strings).

    Returns:
        A list of ``(task, category_name)`` tuples in input order; an empty
        list when ``tasks`` is empty.
    """
    tasks = list(tasks)
    # Nothing to fit: CountVectorizer cannot build a vocabulary from zero
    # documents, so return early instead of raising.
    if not tasks:
        return []

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(tasks)
    lda = LatentDirichletAllocation(n_components=3, random_state=42)  # 3 categories
    lda.fit(X)
    categories = lda.transform(X).argmax(axis=1)

    # NOTE(review): the topics are learned without labels, so pairing topic
    # index 0/1/2 with these names looks arbitrary -- confirm the labels
    # are meaningful for the intended data.
    category_names = {0: 'Personal Tasks', 1: 'Work Tasks', 2: 'Miscellaneous'}

    return [(task, category_names[topic]) for task, topic in zip(tasks, categories)]

In [35]:
# Main function to process text and generate the required output
def process_text(text):
    """Run the full pipeline: find tasks, categorize them, extract entities.

    Args:
        text: The raw input text.

    Returns:
        A list of dicts, one per detected task, with the keys ``"task"``,
        ``"category"``, ``"assigned_to"`` (``"Unknown"`` when no person is
        found) and ``"deadline"`` (``"No deadline mentioned"`` when no
        date/time is found). Returns an empty list when no task is detected.
    """
    # Work on the raw text: task extraction needs the original punctuation
    # and casing. The previous preprocess_text() call was removed because
    # its result was never used.
    extracted_tasks = extract_tasks(text)

    # With no tasks there is nothing to categorize; returning early avoids
    # fitting the topic model on an empty corpus.
    if not extracted_tasks:
        return []

    categorized_tasks = categorize_tasks(extracted_tasks)

    structured_output = []
    for task, category in categorized_tasks:
        person, deadline = extract_entities(task)
        structured_output.append({
            "task": task,
            "category": category,
            "assigned_to": person if person else "Unknown",
            "deadline": deadline if deadline else "No deadline mentioned"
        })

    return structured_output

In [37]:
# Sample input for a quick end-to-end check: three sentences, each using a
# different obligation phrase ("needs to", "should", "must").
text = """
Aisha needs to submit the assignment by Friday. 
Michael should call the client before noon. 
The team must complete the project presentation by next Monday.
"""

# Run the pipeline and print one structured record (dict) per detected task.
output = process_text(text)
for item in output:
    print(item)

{'task': '\nAisha needs to submit the assignment by Friday.', 'category': 'Miscellaneous', 'assigned_to': 'Aisha', 'deadline': 'Friday'}
{'task': 'Michael should call the client before noon.', 'category': 'Work Tasks', 'assigned_to': 'Michael', 'deadline': 'noon'}
{'task': 'The team must complete the project presentation by next Monday.', 'category': 'Work Tasks', 'assigned_to': 'Unknown', 'deadline': 'next Monday'}
