In [1]:
text = input("Enter the text : ") 


# installing the requirements

In [2]:
import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')

import spacy
#spacy.cli.download("en_core_web_lg")


# Preprocessing

In [3]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string
from nltk.tokenize import sent_tokenize

# Load English language model
nlp = spacy.load("en_core_web_lg")


def preprocess_text(text):
    
    # Convert text to lowercase
    text = text.lower()
    cleaned_text = " "
    pos_tags_str = " "
    
    # Process text using spaCy
    sentences = sent_tokenize(text)
    for sentence in sentences:
        doc = nlp(sentence)
        # Remove stop words and punctuation
        cleaned_tokens = [token.text for token in doc if token.text not in string.punctuation]
    
        # Perform POS tagging
        pos_tags = [f"{token.text}({token.pos_})" for token in doc if token.text not in string.punctuation]
    
        # Convert lists to strings
        cleaned_text += str(cleaned_tokens)
        pos_tags_str += str(pos_tags)
    
    return cleaned_text, pos_tags_str

# Preprocess text
cleaned_text, pos_tags_str = preprocess_text(text)

# Print results
print("Cleaned Text:", cleaned_text)
print("POS Tags:", pos_tags_str)


Cleaned Text:  ['rahul', 'should', 'clean', 'his', 'room', 'by', '5', 'pm', 'today']
POS Tags:  ['rahul(PROPN)', 'should(AUX)', 'clean(VERB)', 'his(PRON)', 'room(NOUN)', 'by(ADP)', '5(NUM)', 'pm(NOUN)', 'today(NOUN)']


In [4]:
from spacy import tokens

# Load spaCy's English model
nlp = spacy.load("en_core_web_lg")

# Process the text
doc = nlp(text)
print(type(doc))
# Tokenization and POS tagging
for token in doc:
    print(f"Word: {token}, POS Tag: {token.pos_}")


<class 'spacy.tokens.doc.Doc'>
Word: Rahul, POS Tag: PROPN
Word: should, POS Tag: AUX
Word: clean, POS Tag: VERB
Word: his, POS Tag: PRON
Word: room, POS Tag: NOUN
Word: by, POS Tag: ADP
Word: 5, POS Tag: NUM
Word: pm, POS Tag: NOUN
Word: today, POS Tag: NOUN
Word: ., POS Tag: PUNCT


# Extracting and identifying tasks

In [5]:
from nltk.tokenize import sent_tokenize, word_tokenize
def identify_tasks(sentences):
    tasks = []
    modal_verbs = {"should", "must", "have to", "needs to", "will", "can", "could", "may", "might", "ought to"}
    sentences = sent_tokenize(sentences)
    for sentence in sentences:
        # Tokenize and process each sentence
        doc = nlp(sentence)
        print(doc)
        verbs = []
        
        # Manually check for verb phrases (e.g., "needs to", "have to")
        words = sentence.lower().split()
        
        # Check for specific two-word phrases like "needs to", "have to"
        for i in range(len(words) - 1):
            if words[i] == "needs" and words[i+1] == "to":
                verbs.append("needs to")
            if words[i] == "have" and words[i+1] == "to":
                verbs.append("have to")
        
        # If no verb phrase was found, check for regular verbs in the sentence
        if not verbs:
            verbs = [token for token in doc if token.pos_ == "VERB" or token.text.lower() in modal_verbs]
        
        # If we found verbs or modal verbs, assume this sentence contains a task
        if verbs:
            tasks.append(sentence)
    
    return tasks

tasks = identify_tasks(cleaned_text)
print("Identified Tasks:", tasks)

 ['rahul', 'should', 'clean', 'his', 'room', 'by', '5', 'pm', 'today']
Identified Tasks: [" ['rahul', 'should', 'clean', 'his', 'room', 'by', '5', 'pm', 'today']"]


# implementing LDA

In [10]:
from gensim import corpora
from gensim.models import LdaModel

# Tokenize and create a dictionary and corpus
texts = [[word for word in doc.lower().split()] for doc in tasks]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the LDA model
lda = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

# Print the topics
topics = lda.print_topics(num_words=4)
for topic in topics:
    print(topic)


(0, '0.111*"\'5\'," + 0.111*"\'today\']" + 0.111*"\'clean\'," + 0.111*"[\'rahul\',"')
(1, '0.111*"\'by\'," + 0.111*"\'5\'," + 0.111*"\'room\'," + 0.111*"\'today\']"')
(2, '0.111*"\'today\']" + 0.111*"\'by\'," + 0.111*"\'pm\'," + 0.111*"[\'rahul\',"')
(3, '0.111*"\'his\'," + 0.111*"\'should\'," + 0.111*"\'room\'," + 0.111*"\'pm\',"')
(4, '0.111*"\'today\']" + 0.111*"[\'rahul\'," + 0.111*"\'clean\'," + 0.111*"\'by\',"')


# Categorizing

In [11]:
import re
# Extract names and deadlines
def extract_task_info(text):
    doc = nlp(text)
    
    # Extract responsible person's name (Proper Nouns)
    names = [ent.text for ent in doc if ent.ent_type_ == "PERSON"]
    responsible_person = names[0] if names else "Not specified"
    
    # Extract deadline (words like "tomorrow", "by Friday", etc.)
    deadline_patterns = r'\b(tomorrow|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|next week|next month|by \w+ \d*|at \d+ (AM|PM))\b'
    deadline_match = re.search(deadline_patterns, text, re.IGNORECASE)
    deadline = deadline_match.group() if deadline_match else "No deadline"

    return responsible_person, deadline

In [12]:
# Assign categories to tasks
def categorize_task(task_text):
    bow_vector = dictionary.doc2bow(task_text.lower().split())
    topic_distribution = lda[bow_vector]
    
    # Get the highest probability topic
    topic_num = max(topic_distribution, key=lambda x: x[1])[0]
    
    category_mapping = {
        0: "Meetings & Reviews",
        1: "Housekeeping & Maintenance",
        2: "Project & Client Work"
    }
    
    return category_mapping.get(topic_num, "Uncategorized")

# Generate structured task list
structured_tasks = []

for task in tasks:
    responsible_person, deadline = extract_task_info(task)
    category = categorize_task(task)
    structured_tasks.append({
        "Task": task,
        "Category": category,
        "Assigned To": responsible_person,
        "Deadline": deadline
    })

# Print structured task list
for task_info in structured_tasks:
    print(f"Task: {task_info['Task']}")
    print(f"Category: {task_info['Category']}")
    print(f"Assigned To: {task_info['Assigned To']}")
    print(f"Deadline: {task_info['Deadline']}\n")

Task:  ['rahul', 'should', 'clean', 'his', 'room', 'by', '5', 'pm', 'today']
Category: Uncategorized
Assigned To: Not specified
Deadline: No deadline

