imports

In [22]:
import csv
import random
from datetime import datetime,timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from faker import Faker
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

## Goal: generate a list of tasks, each with the people assigned to it and its deadline

### First let's just try to determine the duration of a task

As no dataset is available, let's generate our own.

In [None]:
#task : desc,type,start,deadline,technologies,team,priority,duration(the output that we try to predict)
#start : date of the task creation
#type : "bug", "feature", "refactor", "test", "documentation"
#technologies : one of "python", "java", "c++", "c#", "web", "sql", "nosql" ("doc" for documentation tasks)
#team : number of people in the team
#priority : 0,1,2 (0: "low", 1: "medium", 2: "high")
#duration : in days
#deadline : a date in the future
#desc : a string

#generate a random but credible dataset of tasks

def generate_dataset(sample, path="./data/tasks.csv", seed=None):
    """Generate a synthetic task dataset, write it to a CSV file and return it.

    Every task is a dict with the keys ``desc``, ``type``, ``start``,
    ``technologies``, ``team``, ``priority``, ``deadline`` and ``duration``.
    ``start`` and ``deadline`` are ``YYYY-MM-DD`` strings, ``technologies`` is
    a one-element list and ``duration`` is a number of days.

    Args:
        sample: Number of tasks to generate.
        path: Destination CSV file. Its parent directory is created when it
            does not exist yet.
        seed: Optional seed. When given, the global ``random`` module and the
            Faker instance are seeded so the random draws are repeatable.
            Dates stay relative to ``datetime.now()``.

    Returns:
        The list of generated task dicts (the same rows written to ``path``).
    """
    if seed is not None:
        random.seed(seed)
    fake = Faker()
    if seed is not None:
        fake.seed_instance(seed)

    # Extra days allowed on top of min_duration: the higher the priority
    # (0 low, 1 medium, 2 high), the tighter the window.
    slack_by_priority = {0: 15, 1: 10, 2: 5}

    now = datetime.now()
    tasks = []
    for _ in range(sample):
        min_duration = 1
        task = {}
        task["desc"] = fake.text()
        task["type"] = random.choice(["bug", "feature", "refactor", "test", "documentation"])
        start_date = now + timedelta(days=random.randint(0, 3))
        task["start"] = start_date.strftime("%Y-%m-%d")
        # documentation tasks are only about documentation
        if task["type"] == "documentation":
            task["technologies"] = ["doc"]
        else:
            task["technologies"] = random.sample(["python", "java", "c++", "c#", "web", "sql", "nosql"], k=1)
        task["team"] = random.randint(1, 10)
        task["priority"] = random.randint(0, 2)

        if "c++" in task["technologies"]:
            min_duration += 1  # c++ tasks take a bit longer
        if task["type"] != "documentation":
            min_duration += 5  # documentation is easier than every other type
        if len(task["desc"]) > 100:
            min_duration += len(task["desc"]) // 30  # longer description = more time

        max_duration = min_duration + slack_by_priority[task["priority"]]

        # The deadline window is anchored on the task's own start date, so the
        # deadline is always at least min_duration days after the start.
        min_deadline_date = start_date + timedelta(days=min_duration)
        max_deadline_date = start_date + timedelta(days=max_duration)
        task["deadline"] = fake.date_time_between_dates(
            datetime_start=min_deadline_date, datetime_end=max_deadline_date
        ).strftime("%Y-%m-%d")
        task["duration"] = random.randint(min_duration, max_duration)
        tasks.append(task)

    out_path = Path(path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # newline="" is required by the csv module to avoid blank rows on Windows
    with out_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=["desc", "type", "start", "deadline", "technologies", "team", "priority", "duration"],
        )
        writer.writeheader()
        writer.writerows(tasks)
    return tasks

# Seed both random sources (stdlib random and Faker) so the random draws are
# the same on every run; dates are still relative to today.
random.seed(42)
Faker.seed(42)
tasks = generate_dataset(5000)
# Show only the row count instead of dumping every generated task
len(tasks)

### Explore the dataset

In [None]:
# Read the generated tasks back from disk and summarise the numeric columns
df = pd.read_csv(filepath_or_buffer="./data/tasks.csv")
df.describe()

Encode the data

In [None]:
# Work on a copy so the raw frame loaded from the CSV stays untouched
df_encoded = df.copy()

# One-hot encode the categorical columns, one after the other
# (type first, then technologies)
for categorical_column in ("type", "technologies"):
    df_encoded = pd.get_dummies(df_encoded, columns=[categorical_column])

# Turn both date columns into integer ordinals (day counts) so they are numeric
for date_column in ("start", "deadline"):
    parsed_dates = pd.to_datetime(df_encoded[date_column])
    df_encoded[date_column] = pd.to_numeric(parsed_dates.map(datetime.toordinal))

df_encoded

In [None]:
# Correlation matrix of every encoded column; the free-text description is
# dropped because it cannot take part in a correlation.
df_without_desc = df_encoded.drop(columns=["desc"])
corr_matrix = df_without_desc.corr()
print(corr_matrix)

# Same matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

Correlation of each feature with the output (duration)

In [None]:
# Correlation of every feature with the duration, strongest positive first;
# the duration's correlation with itself is removed.
correlation = (
    corr_matrix["duration"]
    .sort_values(ascending=False)
    .drop(labels="duration")
)
print(correlation)

### Try a model to determine the duration

Decision Tree Regression

In [31]:
# Split data between training and test sets.
# The prediction columns this notebook writes back into df_encoded are dropped
# as well (when present), so re-running this cell does not leak them into the
# features.
prediction_columns = ["predicted_duration_with_tree", "diff_with_tree"]
X = (
    df_encoded
    .drop(columns=["duration", "desc"])
    .drop(columns=prediction_columns, errors="ignore")
)
y = df_encoded["duration"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Decision tree regressor; a fixed random_state keeps the fitted tree
# reproducible between runs.
tree_reg = DecisionTreeRegressor(random_state=42)



# Scoring helper used by GridSearchCV and the cross-validation: predictions are
# rounded to the nearest integer before the error is measured.
def custom_mse(y_true, y_pred):
    """Return the mean squared error of ``y_pred`` rounded to integers against ``y_true``."""
    return mean_squared_error(y_true, np.round(y_pred).astype(int))

# Turn custom_mse into a scikit-learn scorer; greater_is_better=False tells
# scikit-learn that a lower value is better (scores are negated internally).
custom_scorer = make_scorer(custom_mse, greater_is_better=False)


# Grid search over the depth and the split/leaf size limits of the tree
param_grid = [
    {
        'max_depth': [2, 3, 4, 5, 10, 15, 20, 30],
        'min_samples_split': [2, 3, 4, 5, 10, 15, 20, 30],
        'min_samples_leaf': [1, 2, 5, 10, 15, 20, 30],
    }
]
grid_search = GridSearchCV(
    estimator=tree_reg,
    param_grid=param_grid,
    scoring=custom_scorer,
    cv=5,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)
print("Best parameters : ", grid_search.best_params_)

# Final model: best estimator of the search, evaluated on the held-out test set.
# Predictions are rounded to the nearest integer because duration is an integer.
final_model = grid_search.best_estimator_
final_predictions = np.round(final_model.predict(X_test)).astype(int)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

print("Final RMSE : ", final_rmse)

# 10-fold cross validation of the final model on the training set; the scorer
# returns negated MSE values, hence the sign flip before the square root.
scores = cross_val_score(
    estimator=final_model,
    X=X_train,
    y=y_train,
    scoring=custom_scorer,
    cv=10,
)
final_rmse_scores = np.sqrt(-scores)
print("Final RMSE scores : ", final_rmse_scores)
print("Final RMSE scores mean : ", final_rmse_scores.mean())
print("Final RMSE scores std : ", final_rmse_scores.std())

# Store the rounded predicted duration next to the true one, together with the
# absolute error of each prediction.
df_encoded["predicted_duration_with_tree"] = np.round(final_model.predict(X)).astype(int)
df_encoded["diff_with_tree"] = (
    df_encoded["duration"] - df_encoded["predicted_duration_with_tree"]
).abs()
# Save it to a csv
df_encoded.to_csv("./data/tasks_with_predictions.csv", index=False)

Best parameters :  {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 30}
Final RMSE :  1.382389236069205
Final RMSE scores :  [1.58113883 1.14345966 0.98234414 0.8093207  0.82764727 1.39014388
 1.07354553 1.11915146 1.24599358 1.38744369]
Final RMSE scores mean :  1.1560188739974637
Final RMSE scores std :  0.23742443631950816
