In [1]:
# Run this cell only once if needed
!pip install pandas scikit-learn numpy joblib



In [2]:
import pandas as pd

df = pd.read_csv("tasks_dataset.csv")
print(df.head())

                    text priority    due_date  category created_date  \
0  Submit project report     High  2025-07-20      Work   2025-07-15   
1          Buy groceries      Low  2025-07-22  Personal   2025-07-20   
2  Finalize presentation   Medium  2025-07-25      Work   2025-07-21   
3   Pay electricity bill     High  2025-07-18     Bills   2025-07-15   

  completed_date  
0     2025-07-19  
1     2025-07-21  
2     2025-07-26  
3     2025-07-19  


In [3]:
import os
os.makedirs("ml", exist_ok=True)

In [11]:
# tasknest_ml.ipynb

# 0. Setup: Install required libraries if needed
# Uncomment and run the below if missing
# !pip install pandas scikit-learn numpy joblib

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error
import numpy as np
import joblib
from datetime import timedelta

In [13]:
# 1. Load Task Dataset (replace with your data export from TaskNest)


In [14]:
df = pd.read_csv('tasks_dataset.csv')

In [15]:
# expect fields: text, priority, due_date (YYYY-MM-DD), category, created_date, completed_date

In [16]:
# Simple engineering

In [17]:
df['text'] = df['text'].astype(str)
df['priority'] = df['priority'].fillna('Low')

In [18]:
# 2. Priority Prediction (classification)

In [19]:
vectorizer_prio = TfidfVectorizer(max_features=1000, ngram_range=(1,2))
X_prio = vectorizer_prio.fit_transform(df['text'])
y_prio = df['priority']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_prio, y_prio, test_size=0.2, random_state=42)
clf_prio = LogisticRegression(max_iter=200)
clf_prio.fit(X_train, y_train)
y_pred = clf_prio.predict(X_test)
print("Priority Accuracy:", accuracy_score(y_test, y_pred)

_IncompleteInputError: incomplete input (3469349068.py, line 5)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_prio, y_prio, test_size=0.2, random_state=42)
clf_prio = LogisticRegression(max_iter=200)
clf_prio.fit(X_train, y_train)
y_pred = clf_prio.predict(X_test)
print("Priority Accuracy:", accuracy_score(y_test, y_pred))

Priority Accuracy: 0.0


In [22]:
joblib.dump(vectorizer_prio, 'ml/vectorizer_priority.pkl')
joblib.dump(clf_prio, 'ml/model_priority.pkl')

['ml/model_priority.pkl']

In [23]:
# 3. Deadline Estimation (regression)

In [24]:
df_d = df.dropna(subset=['due_date'])
df_d['due_days'] = (pd.to_datetime(df_d['due_date']) - pd.to_datetime(df_d['created_date'])).dt.days
X_dead = vectorizer_prio.transform(df_d['text'])
y_dead = df_d['due_days']

In [25]:
X_train, X_test, y_train_d, y_test_d = train_test_split(X_dead, y_dead, test_size=0.2, random_state=42)
reg_dead = LinearRegression()
reg_dead.fit(X_train, y_train_d)
y_pred_d = reg_dead.predict(X_test)
print("Deadline MAE (days):", mean_absolute_error(y_test_d, y_pred_d))

Deadline MAE (days): 2.0


In [26]:
joblib.dump(reg_dead, 'ml/model_deadline.pkl')

['ml/model_deadline.pkl']

In [27]:
# 4. Overdue Risk Prediction (binary classification)

In [28]:
df_o = df.dropna(subset=['due_date','completed_date'])
df_o['overdue'] = (
    pd.to_datetime(df_o['completed_date']) > pd.to_datetime(df_o['due_date'])
).astype(int)
X_over = vectorizer_prio.transform(df_o['text'])
y_over = df_o['overdue']

In [29]:
X_train, X_test, y_train_o, y_test_o = train_test_split(X_over, y_over, test_size=0.2, random_state=42)
clf_over = LogisticRegression(max_iter=200)
clf_over.fit(X_train, y_train_o)
print("Overdue Risk Accuracy:", accuracy_score(y_test_o, clf_over.predict(X_test)))

Overdue Risk Accuracy: 0.0


In [30]:
joblib.dump(clf_over, 'ml/model_overdue.pkl')

['ml/model_overdue.pkl']

In [31]:
# 5. Smart Categorization

In [32]:
from sklearn.preprocessing import LabelEncoder

In [33]:
le_cat = LabelEncoder()
y_cat = le_cat.fit_transform(df['category'].astype(str))
X_cat = vectorizer_prio.transform(df['text'])


In [34]:
X_train, X_test, y_train_c, y_test_c = train_test_split(X_cat, y_cat, test_size=0.2, random_state=42)
clf_cat = LogisticRegression(max_iter=200)
clf_cat.fit(X_train, y_train_c)
print("Category Prediction Accuracy:", accuracy_score(y_test_c, clf_cat.predict(X_test)))

Category Prediction Accuracy: 0.0


In [35]:
joblib.dump(le_cat, 'ml/label_encoder_cat.pkl')
joblib.dump(clf_cat, 'ml/model_category.pkl')

['ml/model_category.pkl']