In [43]:
import pandas as pd
import random
from datetime import datetime, timedelta
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

random.seed(42)
np.random.seed(42)

users = ['alice','bob','charlie','diana']
titles = [
    "Fix login bug", "Design landing page", "Prepare monthly report",
    "Deploy microservice", "Update dependencies", "Customer email reply",
    "Optimize DB queries", "Write unit tests", "Research new lib", "Plan sprint"
]
descriptions = [
    "User cannot login with social auth. Error 500 observed on POST /auth.",
    "Create responsive landing page for product X using Figma mockups.",
    "Gather sales data and prepare report for stakeholders by Friday.",
    "Deploy new service to production cluster and monitor for errors.",
    "Update package.json dependencies and run compatibility tests.",
    "Respond to important client email about billing dispute.",
    "Optimize slow queries causing high CPU usage on replica DB.",
    "Write unit tests for payment module to improve coverage.",
    "Investigate new ML library for text embeddings and feasibility.",
    "Plan sprint items and assign tasks to frontend/backend teams."
]

def rand_date(start, end):
    diff = end - start
    return start + timedelta(days=random.randint(0, diff.days))

rows = []
start = datetime.now() - timedelta(days=60)
end = datetime.now() + timedelta(days=30)
for i in range(1000):
    title = random.choice(titles)
    desc = random.choice(descriptions)
    created = rand_date(start, datetime.now())
    due = created + timedelta(days=random.randint(1, 20))
    creator = random.choice(users)
    assignee = random.choice(users)
    # Priority synthetic label (0 low,1 medium,2 high) using heuristics
    days_to_due = (due - created).days
    if "bug" in title.lower() or "deploy" in title.lower():
        priority = 2
    elif days_to_due <= 3:
        priority = 2
    elif days_to_due <= 7:
        priority = 1
    else:
        priority = 0
    workload_hours = random.choice([1,2,3,4,6,8,12])
    # user past behaviour features (synthetic)
    user_completed = random.randint(5,200)
    avg_completion_days = round(random.uniform(0.5,5.0),2)
    rows.append({
        "task_id": i+1,
        "title": title,
        "description": desc,
        "created_at": created,
        "due_date": due,
        "creator": creator,
        "assignee": assignee,
        "priority": priority,
        "workload_hours": workload_hours,
        "assignee_completed_tasks": user_completed,
        "assignee_avg_completion_days": avg_completion_days,
        "status": random.choice(["open","in_progress","done"])
    })

df = pd.DataFrame(rows)
df.to_csv("tasks_synthetic.csv", index=False)
print("Saved tasks_synthetic.csv, shape:", df.shape)


Saved tasks_synthetic.csv, shape: (1000, 12)


In [28]:
df = pd.read_csv('tasks_synthetic.csv')

In [29]:
df.head()


Unnamed: 0,task_id,title,description,created_at,due_date,creator,assignee,priority,workload_hours,assignee_completed_tasks,assignee_avg_completion_days,status
0,1,Design landing page,User cannot login with social auth. Error 500 ...,2025-09-30 22:23:02.083979,2025-10-09 22:23:02.083979,bob,bob,0,2,193,0.96,done
1,2,Research new lib,Create responsive landing page for product X u...,2025-09-20 22:23:02.083979,2025-10-04 22:23:02.083979,alice,alice,0,1,60,1.55,done
2,3,Fix login bug,Investigate new ML library for text embeddings...,2025-08-26 22:23:02.083979,2025-09-13 22:23:02.083979,diana,bob,2,4,155,1.75,open
3,4,Prepare monthly report,Optimize slow queries causing high CPU usage o...,2025-09-04 22:23:02.083979,2025-09-13 22:23:02.083979,bob,bob,0,12,91,0.96,in_progress
4,5,Design landing page,Respond to important client email about billin...,2025-10-07 22:23:02.083979,2025-10-19 22:23:02.083979,charlie,alice,0,8,122,2.91,in_progress


In [31]:
df.shape

(1000, 12)

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   task_id                       1000 non-null   int64  
 1   title                         1000 non-null   object 
 2   description                   1000 non-null   object 
 3   created_at                    1000 non-null   object 
 4   due_date                      1000 non-null   object 
 5   creator                       1000 non-null   object 
 6   assignee                      1000 non-null   object 
 7   priority                      1000 non-null   int64  
 8   workload_hours                1000 non-null   int64  
 9   assignee_completed_tasks      1000 non-null   int64  
 10  assignee_avg_completion_days  1000 non-null   float64
 11  status                        1000 non-null   object 
dtypes: float64(1), int64(4), object(7)
memory usage: 93.9+ KB


In [33]:
df.duplicated().sum()

0

In [34]:
df.describe()

Unnamed: 0,task_id,priority,workload_hours,assignee_completed_tasks,assignee_avg_completion_days
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,0.722,5.322,102.865,2.71938
std,288.819436,0.881756,3.650705,56.331306,1.293874
min,1.0,0.0,1.0,5.0,0.5
25%,250.75,0.0,2.0,53.0,1.57
50%,500.5,0.0,4.0,107.0,2.72
75%,750.25,2.0,8.0,150.25,3.82
max,1000.0,2.0,12.0,200.0,4.99


In [37]:
df.isnull().sum()

task_id                         0
title                           0
description                     0
created_at                      0
due_date                        0
creator                         0
assignee                        0
priority                        0
workload_hours                  0
assignee_completed_tasks        0
assignee_avg_completion_days    0
status                          0
dtype: int64

In [38]:
# Derived features
df['days_to_due'] = (df['due_date'] - df['created_at']).dt.days
df['is_overdue'] = np.where((df['status'] != 'done') & (df['due_date'] < pd.Timestamp.now()), 1, 0)

In [39]:
#EDA Plot
plt.figure(figsize=(6,4))
sns.countplot(x='priority', data=df)
plt.title("Priority distribution")
plt.savefig("priority_dist.png")
plt.close()

In [55]:
plt.figure(figsize=(6,4))
sns.histplot(df['workload_hours'], bins=10, kde=False)
plt.title("Workload hours distribution")
plt.savefig("workload_dist.png")
plt.close()

In [40]:
# Correlation with numeric features
num_cols = ['priority','workload_hours','assignee_completed_tasks','assignee_avg_completion_days','days_to_due','is_overdue']
corr = df[num_cols].corr()
print("Correlation matrix:\n", corr)


Correlation matrix:
                               priority  workload_hours  \
priority                      1.000000        0.013532   
workload_hours                0.013532        1.000000   
assignee_completed_tasks      0.015789        0.053599   
assignee_avg_completion_days  0.033813        0.012228   
days_to_due                  -0.574874        0.011380   
is_overdue                    0.094929        0.006601   

                              assignee_completed_tasks  \
priority                                      0.015789   
workload_hours                                0.053599   
assignee_completed_tasks                      1.000000   
assignee_avg_completion_days                 -0.003735   
days_to_due                                  -0.012581   
is_overdue                                    0.016839   

                              assignee_avg_completion_days  days_to_due  \
priority                                          0.033813    -0.574874   
workload_hours 

In [42]:
 #Save cleaned version for next steps (drop duplicates)
df_clean = df.drop_duplicates(subset=['task_id']).reset_index(drop=True)
df_clean.to_csv("tasks_cleaned.csv", index=False)
print("Cleaned saved:", df_clean.shape)

Cleaned saved: (1000, 14)


### NLP preprocessing

In [48]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nirap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nirap\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nirap\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nirap\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [52]:
df = pd.read_csv("tasks_cleaned.csv")
ps = PorterStemmer()
wn = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [53]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nirap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nirap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [54]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+','', text)              
    text = re.sub(r'[^a-z0-9\s]',' ', text)     
    text = re.sub(r'\s+',' ', text).strip()
    return text

def preprocess(text):
    t = clean_text(text)
    tokens = word_tokenize(t)
    tokens = [w for w in tokens if w not in stop_words and len(w)>1]
    stems = [ps.stem(w) for w in tokens]
    lemmas = [wn.lemmatize(w) for w in tokens]
    return {
        "tokens": tokens,
        "stems": " ".join(stems),
        "lemmas": " ".join(lemmas)
    }

# Apply preprocessing on description + title
df['text_raw'] = df['title'] + ". " + df['description']
proc = df['text_raw'].apply(preprocess)
df['tokens'] = proc.apply(lambda x: x['tokens'])
df['stems'] = proc.apply(lambda x: x['stems'])
df['lemmas'] = proc.apply(lambda x: x['lemmas'])

df.to_csv("tasks_preprocessed.csv", index=False)
print("Preprocessing done. Saved tasks_preprocessed.csv")

Preprocessing done. Saved tasks_preprocessed.csv
