In [20]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 8.3 MB/s eta 0:00:02
     -------- ------------------------------- 2.6/12.8 MB 8.9 MB/s eta 0:00:02
     ---------- ----------------------------- 3.4/12.8 MB 6.3 MB/s eta 0:00:02
     ----------------- ---------------------- 5.5/12.8 MB 7.6 MB/s eta 0:00:01
     ----------------------- ---------------- 7.6/12.8 MB 8.2 MB/s eta 0:00:01
     --------------------------- ------------ 8.9/12.8 MB 8.0 MB/s eta 0:00:01
     ----------------------------------- ---- 11.3/12.8 MB 8.4 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 8.4 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m[+] Download and in


[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [22]:
# Cell 1 - Import libraries for Part A
import spacy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from datetime import datetime
import re


In [23]:
nlp = spacy.load('en_core_web_sm')

In [54]:

class TaskExtractor:
    def __init__(self):
        # Expanded action verbs with more task-related terms
        self.action_verbs = set([
            'need', 'must', 'should', 'have to', 'has to', 'required',
            'schedule', 'complete', 'review', 'submit', 'prepare',
            'create', 'update', 'finish', 'deliver', 'implement',
            'organize', 'coordinate', 'develop', 'ensure', 'plan'
        ])
        
        # Expanded time indicators
        self.time_indicators = set([
            'today', 'tomorrow', 'tonight', 'next', 'by', 'before', 'after',
            'monday', 'tuesday', 'wednesday', 'thursday', 'friday',
            'saturday', 'sunday', 'week', 'month', 'asap', 'soon',
            'morning', 'afternoon', 'evening', 'noon', 'midnight'
        ])
        
        # Task categories with associated keywords
        self.categories = {
            'Work': ['report', 'project', 'document', 'review', 'develop', 'implement'],
            'Meeting': ['meet', 'schedule', 'discuss', 'call', 'presentation'],
            'Administrative': ['submit', 'file', 'organize', 'update', 'process'],
            'Personal': ['buy', 'get', 'bring', 'clean', 'pick'],
            'Deadline': ['urgent', 'asap', 'immediately', 'priority']
        }

    def preprocess_text(self, text):
        """Clean and preprocess the input text with improved error handling."""
        try:
            # Remove extra whitespace and normalize
            text = re.sub(r'\s+', ' ', text.strip())
            doc = nlp(text)
            
            # Extract meaningful sentences and filter out noise
            sentences = []
            for sent in doc.sents:
                cleaned_sent = sent.text.strip()
                if (len(cleaned_sent.split()) >= 3 and  # Minimum word requirement
                    any(token.pos_ in ['VERB', 'AUX'] for token in sent)):  # Contains verb
                    sentences.append(cleaned_sent)
            
            return sentences
        except Exception as e:
            print(f"Error in preprocessing: {str(e)}")
            return []

    def is_task(self, sentence):
        """Enhanced task detection with multiple heuristics."""
        if not sentence:
            return False
        
        try:
            doc = nlp(sentence.lower())
            
            # Multiple task indicators
            indicators = {
                'has_action_verb': any(token.lemma_ in self.action_verbs for token in doc),
                'starts_with_verb': doc[0].pos_ == 'VERB' if len(doc) > 0 else False,
                'has_future_tense': any(token.dep_ == 'aux' and token.lemma_ in ['will', 'shall'] 
                                      for token in doc),
                'has_modal_verb': any(token.dep_ == 'aux' and token.lemma_ in ['must', 'should', 'need'] 
                                    for token in doc),
                'has_to_infinitive': any(token.dep_ == 'aux' and token.head.lemma_ == 'have' 
                                       for token in doc),
                'has_deadline': any(word in sentence.lower() for word in self.time_indicators)
            }
            
            # Task confidence score based on indicators
            confidence_score = sum(indicators.values())
            return confidence_score >= 1  # Require at least one strong indicator
            
        except Exception as e:
            print(f"Error in task detection: {str(e)}")
            return False

    def extract_entity_and_deadline(self, sentence):
        """Improved entity and deadline extraction."""
        try:
            doc = nlp(sentence)
            
            # Entity extraction
            entities = []
            for ent in doc.ents:
                if ent.label_ == 'PERSON':
                    entities.append(ent.text)
            
            # Look for subject if no named entity found
            if not entities:
                for token in doc:
                    if token.dep_ == 'nsubj' and token.pos_ == 'PROPN':
                        entities.append(token.text)
            
            # Deadline extraction with context
            deadline = None
            doc_lower = sentence.lower()
            
            # Pattern matching for deadline phrases
            deadline_patterns = [
                r'by\s+([\w\s]+)',
                r'before\s+([\w\s]+)',
                r'due\s+(?:by|on)?\s+([\w\s]+)',
                r'until\s+([\w\s]+)'
            ]
            
            for pattern in deadline_patterns:
                match = re.search(pattern, doc_lower)
                if match:
                    deadline = match.group(1).strip()
                    break
            
            # Fallback to time indicators
            if not deadline:
                for token in doc:
                    if token.text.lower() in self.time_indicators:
                        deadline = ' '.join([t.text for t in token.subtree])
                        break
            
            return entities[0] if entities else None, deadline
            
        except Exception as e:
            print(f"Error in entity/deadline extraction: {str(e)}")
            return None, None

    def determine_category(self, task_text):
        """Determine task category based on keyword matching."""
        task_lower = task_text.lower()
        category_scores = {cat: 0 for cat in self.categories}
        
        for category, keywords in self.categories.items():
            for keyword in keywords:
                if keyword in task_lower:
                    category_scores[category] += 1
        
        # Return category with highest score, default to 'Work'
        max_score = max(category_scores.values())
        if max_score == 0:
            return 'Work'
        return max(category_scores.items(), key=lambda x: x[1])[0]

    def extract_tasks(self, text):
        """Main function to extract and categorize tasks with improved structure."""
        try:
            sentences = self.preprocess_text(text)
            tasks = []
            
            for sentence in sentences:
                if self.is_task(sentence):
                    entity, deadline = self.extract_entity_and_deadline(sentence)
                    category = self.determine_category(sentence)
                    
                    task = {
                        'task': sentence,
                        'assignee': entity,
                        'deadline': deadline,
                        'category': category,
                        'priority': 'High' if any(urgent in sentence.lower() 
                                                for urgent in ['urgent', 'asap', 'immediately']) 
                                  else 'Normal'
                    }
                    tasks.append(task)
            
            return tasks
            
        except Exception as e:
            print(f"Error in task extraction: {str(e)}")
            return []

In [55]:
# Initialize spacy
nlp = spacy.load('en_core_web_sm')

# Test the implementation
test_text = """
Rahul has to buy snacks for all of us by 5pm today. 
Sarah needs to review the quarterly report before Monday. 
The weather is nice outside. 
James must schedule the team meeting for next week.
We should complete the project documentation by Friday.
Please submit the expense reports urgently.
"""

# Initialize and test
extractor = TaskExtractor()
tasks = extractor.extract_tasks(test_text)

# Print results
print("\nExtracted Tasks:")
for task in tasks:
    print(f"\nTask: {task['task']}")
    print(f"Assignee: {task['assignee']}")
    print(f"Deadline: {task['deadline']}")
    print(f"Category: {task['category']}")
    print(f"Priority: {task['priority']}")


Extracted Tasks:

Task: Rahul has to buy snacks for all of us by 5pm today.
Assignee: Rahul
Deadline: 5pm today
Category: Personal
Priority: Normal

Task: Sarah needs to review the quarterly report before Monday.
Assignee: Sarah
Deadline: monday
Category: Work
Priority: Normal

Task: James must schedule the team meeting for next week.
Assignee: James
Deadline: next
Category: Meeting
Priority: Normal

Task: We should complete the project documentation by Friday.
Assignee: None
Deadline: friday
Category: Work
Priority: Normal

Task: Please submit the expense reports urgently.
Assignee: None
Deadline: None
Category: Work
Priority: High
