In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

def generate_procrastination_data(num_rows=2000, seed=None, start_time=None):
    """
    Generates a large, realistic synthetic dataset for the procrastination project.

    Each row is one simulated task: a category and name, the person's state
    (sleep, energy, mood), the task's estimated duration and perceived
    enjoyment, a deadline, and a planned vs. actual start time. The gap
    between planned and actual start is the simulated procrastination delay.

    Args:
        num_rows: Number of rows to generate. Zero (or a negative value)
            yields an empty frame that still carries the full column schema.
        seed: Optional non-negative integer. When given, every random draw
            comes from private generators seeded with this value, so the
            output is reproducible and the global `random` / `np.random`
            state is left untouched. When None, the global generators are
            used.
        start_time: Optional `datetime` used as the first task's actual
            start. Defaults to one year before the current time.

    Returns:
        A pandas DataFrame with one row per task and these columns:
        Actual_Start_Time, Task_Name, Task_Category, Estimated_Duration_Mins,
        Deadline_Date, Planned_Start_Time, Mood_Level_1_5, Energy_Level_1_5,
        Hours_of_Sleep, Perceived_Enjoyment_1_5.
    """
    print(f"Generating {num_rows} rows of synthetic data...")

    # The `random` module and a `random.Random` instance expose the same
    # choices/choice/uniform/randint API, and `np.random` and a numpy
    # Generator both expose `.normal`, so either pair can drive the loop.
    if seed is None:
        py_rng, np_rng = random, np.random
    else:
        py_rng, np_rng = random.Random(seed), np.random.default_rng(seed)

    def draw_level(mean, spread):
        """Draw a 1-5 rating: normal sample rounded to nearest, then clamped."""
        # Rounding (rather than truncating with int()) keeps the average of
        # the generated ratings close to `mean` instead of ~0.5 below it.
        return max(1, min(5, int(round(float(np_rng.normal(mean, spread))))))

    # --- 1. Define the Persona & Task Probabilities ---
    task_templates = {
        'Code': ['Work on FYP module', 'Debug API endpoint', 'Solve LeetCode problem', 'Refactor legacy code', 'Update project dependencies'],
        'Study/Research': ['Read research paper', 'Revise DBMS notes', 'Work on lab assignment', 'Prepare for quiz', 'Write documentation'],
        'Chore': ['Do laundry', 'Clean room', 'Grocery shopping', 'Cook dinner', 'Pay bills'],
        'Admin/Emails': ['Reply to emails', 'Plan the week', 'Update resume', 'Submit assignment online'],
        'Health/Fitness': ['Go for a run', 'Workout at the gym', 'Go for a walk', 'Meditate'],
        'Social': ['Call parents', 'Meet friends for dinner', 'Attend team meeting', 'Go to a movie']
    }

    # Define probabilities and characteristics for each category
    task_info = {
        'Code': {'prob': 0.30, 'avg_enjoyment': 3.5, 'avg_duration': 120},
        'Study/Research': {'prob': 0.30, 'avg_enjoyment': 2.0, 'avg_duration': 90},
        'Chore': {'prob': 0.15, 'avg_enjoyment': 1.5, 'avg_duration': 45},
        'Admin/Emails': {'prob': 0.10, 'avg_enjoyment': 2.0, 'avg_duration': 20},
        'Health/Fitness': {'prob': 0.08, 'avg_enjoyment': 4.0, 'avg_duration': 60},
        'Social': {'prob': 0.07, 'avg_enjoyment': 4.5, 'avg_duration': 75}
    }

    # Explicit schema so that an empty result still has every column.
    columns = [
        'Actual_Start_Time',
        'Task_Name',
        'Task_Category',
        'Estimated_Duration_Mins',
        'Deadline_Date',
        'Planned_Start_Time',
        'Mood_Level_1_5',
        'Energy_Level_1_5',
        'Hours_of_Sleep',
        'Perceived_Enjoyment_1_5',
    ]

    # Loop-invariant sampling inputs, built once.
    categories = list(task_info)
    weights = [info['prob'] for info in task_info.values()]

    data = []

    # Start generating data from one year ago unless the caller pins a start
    if start_time is None:
        current_time = datetime.now() - timedelta(days=365)
    else:
        current_time = start_time

    # --- 2. Main Generation Loop ---
    for _ in range(num_rows):
        # Choose a task category based on probabilities
        category = py_rng.choices(categories, weights=weights)[0]
        info = task_info[category]

        # --- 3. Simulate Personal State ---
        # Normally distributed around 7 hours. The 3-hour floor is applied
        # before energy is derived so the recorded sleep value is the same
        # one that drove the energy level.
        hours_of_sleep = max(3.0, float(np_rng.normal(7.0, 1.5)))

        # Energy is correlated with sleep
        energy_level = draw_level(2 + (hours_of_sleep / 4), 1)

        # Mood is correlated with energy but has more randomness
        mood_level = draw_level(energy_level, 1.5)

        # --- 4. Simulate Task Characteristics ---
        task_name = py_rng.choice(task_templates[category])
        est_duration = max(10, int(round(float(np_rng.normal(info['avg_duration'], 20)))))
        perceived_enjoyment = draw_level(info['avg_enjoyment'], 1)

        # The deadline is a whole number of days after the task starts, so
        # the proximity in days is exactly the drawn offset.
        deadline_proximity = py_rng.randint(1, 30)
        deadline_date = (current_time + timedelta(days=deadline_proximity)).date()

        # --- 5. The Procrastination Logic ---
        # Start with a base delay, add penalties based on context
        delay_minutes = py_rng.uniform(0, 20)  # Base random delay

        # Penalty for low enjoyment & energy
        delay_minutes += (5 - perceived_enjoyment) * py_rng.uniform(10, 20)
        delay_minutes += (5 - energy_level) * py_rng.uniform(5, 15)

        # Penalty for certain task types
        if category in ('Study/Research', 'Chore'):
            delay_minutes += py_rng.uniform(0, 30)

        # "Panic Monster": reduce delay if deadline is very close
        if deadline_proximity <= 2:
            delay_minutes *= 0.25  # Drastically reduce delay

        # --- 6. Finalize Timestamps ---
        # The planned start is back-dated from the actual start by the delay.
        actual_start_time = current_time
        planned_start_time = actual_start_time - timedelta(minutes=delay_minutes)

        # Append the generated row
        data.append({
            'Actual_Start_Time': actual_start_time,
            'Task_Name': task_name,
            'Task_Category': category,
            'Estimated_Duration_Mins': est_duration,
            'Deadline_Date': deadline_date,
            'Planned_Start_Time': planned_start_time,
            'Mood_Level_1_5': mood_level,
            'Energy_Level_1_5': energy_level,
            'Hours_of_Sleep': round(hours_of_sleep, 1),
            'Perceived_Enjoyment_1_5': perceived_enjoyment
        })

        # Increment time for the next task (simulates passage of time)
        current_time += timedelta(hours=py_rng.uniform(2, 6))

    df = pd.DataFrame(data, columns=columns)
    print("Data generation complete.")
    return df

# --- Build the dataset, persist it as CSV, then preview the first rows ---
big_df = generate_procrastination_data(2000)
big_df.to_csv(path_or_buf='procrastination_big_data.csv', index=False)

# One print call; the newline separator reproduces the three separate lines.
print(
    "\nDataset saved to 'procrastination_big_data.csv'",
    "Here's a sample of your new 'big data':",
    big_df.head(),
    sep="\n",
)

Generating 2000 rows of synthetic data...
Data generation complete.

Dataset saved to 'procrastination_big_data.csv'
Here's a sample of your new 'big data':
           Actual_Start_Time          Task_Name   Task_Category  \
0 2024-09-19 11:51:56.039245        Cook dinner           Chore   
1 2024-09-19 14:02:49.444877    Reply to emails    Admin/Emails   
2 2024-09-19 16:22:31.297570  Revise DBMS notes  Study/Research   
3 2024-09-19 22:01:12.119359   Prepare for quiz  Study/Research   
4 2024-09-20 01:01:13.292040          Pay bills           Chore   

   Estimated_Duration_Mins Deadline_Date         Planned_Start_Time  \
0                       10    2024-10-07 2024-09-19 10:01:49.541071   
1                       10    2024-10-19 2024-09-19 12:15:15.985310   
2                      103    2024-09-30 2024-09-19 14:55:28.019911   
3                      103    2024-10-02 2024-09-19 20:35:08.641878   
4                       44    2024-10-09 2024-09-19 23:12:07.358020   

   Mood_Level

In [2]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

def generate_procrastination_data_v2(num_rows=2000, seed=None, start_time=None,
                                     procrastination_tendency=1.5):
    """
    Generates a large synthetic dataset with more pronounced procrastination habits.

    Each row is one simulated task: a category and name, the person's state
    (sleep, energy, mood), the task's estimated duration and perceived
    enjoyment, a deadline, and a planned vs. actual start time. The gap
    between planned and actual start is the simulated procrastination delay.

    Args:
        num_rows: Number of rows to generate. Zero (or a negative value)
            yields an empty frame that still carries the full column schema.
        seed: Optional non-negative integer. When given, every random draw
            comes from private generators seeded with this value, so the
            output is reproducible and the global `random` / `np.random`
            state is left untouched. When None, the global generators are
            used.
        start_time: Optional `datetime` used as the first task's actual
            start. Defaults to one year before the current time.
        procrastination_tendency: Multiplier applied to the low-enjoyment
            and low-energy delay penalties. Increase it to make the persona
            more likely to procrastinate.

    Returns:
        A pandas DataFrame with one row per task and these columns:
        Actual_Start_Time, Task_Name, Task_Category, Estimated_Duration_Mins,
        Deadline_Date, Planned_Start_Time, Mood_Level_1_5, Energy_Level_1_5,
        Hours_of_Sleep, Perceived_Enjoyment_1_5.
    """
    print(f"Generating {num_rows} rows with V2 logic...")

    # The `random` module and a `random.Random` instance expose the same
    # choices/choice/uniform/randint API, and `np.random` and a numpy
    # Generator both expose `.normal`, so either pair can drive the loop.
    if seed is None:
        py_rng, np_rng = random, np.random
    else:
        py_rng, np_rng = random.Random(seed), np.random.default_rng(seed)

    def draw_level(mean, spread):
        """Draw a 1-5 rating: normal sample rounded to nearest, then clamped."""
        # Rounding (rather than truncating with int()) keeps the average of
        # the generated ratings close to `mean` instead of ~0.5 below it.
        return max(1, min(5, int(round(float(np_rng.normal(mean, spread))))))

    task_templates = {
        'Code': ['Work on FYP module', 'Debug API endpoint', 'Solve LeetCode problem', 'Refactor legacy code', 'Update project dependencies'],
        'Study/Research': ['Read research paper', 'Revise DBMS notes', 'Work on lab assignment', 'Prepare for quiz', 'Write documentation'],
        'Chore': ['Do laundry', 'Clean room', 'Grocery shopping', 'Cook dinner', 'Pay bills'],
        'Admin/Emails': ['Reply to emails', 'Plan the week', 'Update resume', 'Submit assignment online'],
        'Health/Fitness': ['Go for a run', 'Workout at the gym', 'Go for a walk', 'Meditate'],
        'Social': ['Call parents', 'Meet friends for dinner', 'Attend team meeting', 'Go to a movie']
    }

    task_info = {
        'Code': {'prob': 0.30, 'avg_enjoyment': 3.5, 'avg_duration': 120},
        'Study/Research': {'prob': 0.30, 'avg_enjoyment': 2.0, 'avg_duration': 90},
        'Chore': {'prob': 0.15, 'avg_enjoyment': 1.5, 'avg_duration': 45},
        'Admin/Emails': {'prob': 0.10, 'avg_enjoyment': 2.0, 'avg_duration': 20},
        'Health/Fitness': {'prob': 0.08, 'avg_enjoyment': 4.0, 'avg_duration': 60},
        'Social': {'prob': 0.07, 'avg_enjoyment': 4.5, 'avg_duration': 75}
    }

    # Explicit schema so that an empty result still has every column.
    columns = [
        'Actual_Start_Time',
        'Task_Name',
        'Task_Category',
        'Estimated_Duration_Mins',
        'Deadline_Date',
        'Planned_Start_Time',
        'Mood_Level_1_5',
        'Energy_Level_1_5',
        'Hours_of_Sleep',
        'Perceived_Enjoyment_1_5',
    ]

    # Loop-invariant sampling inputs, built once.
    categories = list(task_info)
    weights = [info['prob'] for info in task_info.values()]

    data = []
    if start_time is None:
        current_time = datetime.now() - timedelta(days=365)
    else:
        current_time = start_time

    for _ in range(num_rows):
        category = py_rng.choices(categories, weights=weights)[0]
        info = task_info[category]

        # The 3-hour floor is applied before energy is derived so the
        # recorded sleep value is the same one that drove the energy level.
        hours_of_sleep = max(3.0, float(np_rng.normal(7.0, 1.5)))
        energy_level = draw_level(2 + (hours_of_sleep / 4), 1)
        mood_level = draw_level(energy_level, 1.5)

        task_name = py_rng.choice(task_templates[category])
        est_duration = max(10, int(round(float(np_rng.normal(info['avg_duration'], 20)))))
        perceived_enjoyment = draw_level(info['avg_enjoyment'], 1)

        # The deadline is a whole number of days after the task starts, so
        # the proximity in days is exactly the drawn offset.
        deadline_proximity = py_rng.randint(1, 30)
        deadline_date = (current_time + timedelta(days=deadline_proximity)).date()

        # --- ADJUSTED PROCRASTINATION LOGIC ---
        delay_minutes = py_rng.uniform(0, 15)

        # INCREASED PENALTIES: Penalties are now higher and multiplied by our tendency factor
        delay_minutes += (5 - perceived_enjoyment) * py_rng.uniform(15, 30) * procrastination_tendency
        delay_minutes += (5 - energy_level) * py_rng.uniform(10, 20) * procrastination_tendency

        if category in ('Study/Research', 'Chore', 'Admin/Emails'):
            delay_minutes += py_rng.uniform(10, 40)

        # SOFTENED PANIC MONSTER: Effect is less drastic and only applies when deadline is 1 day away
        if deadline_proximity <= 1:
            delay_minutes *= 0.5

        # The planned start is back-dated from the actual start by the delay.
        actual_start_time = current_time
        planned_start_time = actual_start_time - timedelta(minutes=delay_minutes)

        data.append({
            'Actual_Start_Time': actual_start_time,
            'Task_Name': task_name,
            'Task_Category': category,
            'Estimated_Duration_Mins': est_duration,
            'Deadline_Date': deadline_date,
            'Planned_Start_Time': planned_start_time,
            'Mood_Level_1_5': mood_level,
            'Energy_Level_1_5': energy_level,
            'Hours_of_Sleep': round(hours_of_sleep, 1),
            'Perceived_Enjoyment_1_5': perceived_enjoyment
        })

        # Advance the clock by a random 2-6 hour gap before the next task.
        current_time += timedelta(hours=py_rng.uniform(2, 6))

    df = pd.DataFrame(data, columns=columns)
    print("Data generation complete.")
    return df

# --- Build the V2 dataset and persist it as CSV ---
big_df_v2 = generate_procrastination_data_v2(2000)
big_df_v2.to_csv(path_or_buf='procrastination_big_data_v2.csv', index=False)

# Confirm where the file was written.
print("\nDataset saved to 'procrastination_big_data_v2.csv'")

Generating 2000 rows with V2 logic...
Data generation complete.

Dataset saved to 'procrastination_big_data_v2.csv'
