In [2]:
import pandas as pd
import numpy as np

# Build a synthetic daily workload series (0-5 scale) covering 2021-01-01 .. 2024-12-31
# and write it to CSV.
#
# NOTE: the values depend on the *order* of the np.random calls below. Reordering
# them changes the generated dataset even though the seed stays the same.
np.random.seed(42)
date_range = pd.date_range(start='2021-01-01', end='2024-12-31', freq='D')
days = len(date_range)

# Single source of truth for the output path, so the confirmation message
# printed at the end cannot drift from the file that is actually written.
output_path = 'workload_time_series.csv'

# Baseline workload: normally distributed around 2.5 with std 0.5.
# The draw is unbounded; the 0-5 range is only enforced by the clip further down.
workload = np.random.normal(loc=2.5, scale=0.5, size=days)

# Yearly sinusoid with amplitude 1.5 and a fixed 365-day period.
# A single sine cycle crests about a quarter-year in (~day 91) and bottoms out
# around day 274, so on its own it raises spring workload and lowers autumn workload.
# NOTE(review): the period is a constant 365, so leap years (2024) are slightly out of phase.
seasonal_factor = np.sin(2 * np.pi * date_range.dayofyear.to_numpy() / 365) * 1.5
workload += seasonal_factor

# Exam months: March (mid-terms) and September (end-semester).
for month in [3, 9]:
    month_mask = (date_range.month == month)

    # Extra workload on every day of the exam month.
    workload[month_mask] += np.random.normal(2.5, 0.5, month_mask.sum())

    # After the 20th the workload is *replaced* (not reduced) by a low uniform value,
    # modelling the drop once exams are over.
    post_exam_mask = month_mask & (date_range.day > 20)
    workload[post_exam_mask] = np.random.uniform(0, 1.5, post_exam_mask.sum())

# Rest periods: the first nine days of April and October are overwritten with
# a minimal workload.
for rest_month in [4, 10]:
    rest_mask = (date_range.month == rest_month) & (date_range.day < 10)
    workload[rest_mask] = np.random.uniform(0, 1, rest_mask.sum())

# Small day-to-day noise applied to every day, including the overwritten ones.
random_fluctuation = np.random.normal(0, 0.2, size=days)
workload += random_fluctuation

# Clip the workload to the scale of 0 to 5 (0 represents no workload),
# then keep one decimal place.
workload = np.clip(workload, 0, 5)
workload = workload.round(1)

# Create DataFrame
workload_data = pd.DataFrame({
    'Date': date_range,
    'Workload': workload
})

# Save the dataset to a CSV file and report the path that was really used.
workload_data.to_csv(output_path, index=False)

save_message = f"Dataset saved as '{output_path}'"
print(save_message)


Dataset saved as 'workload_data.csv'


In [2]:
import pandas as pd
import numpy as np

# Synthetic question-paper dataset: 100 rows, seeded so every run yields the same values.
np.random.seed(42)
n_rows = 100

# Feature draws. The order of these calls fixes the generated values, so keep it.
difficulty = np.random.randint(1, 6, size=n_rows)        # integer difficulty, 1..5
total_questions = np.random.randint(5, 21, size=n_rows)  # question count, 5..20

# 0/1 flags telling whether short / long questions appear at all
short_questions_included = np.random.choice([0, 1], size=n_rows)
long_questions_included = np.random.choice([0, 1], size=n_rows)

# Target: share of long questions in [0, 1].
# As implemented it is difficulty/10 plus Gaussian noise (std 0.1), clipped to [0, 1],
# and forced to 0 wherever long questions are not included. Total_Questions does
# not enter the formula.
noise = np.random.normal(0, 0.1, n_rows)
noisy_share = np.clip(difficulty / 10 + noise, 0, 1)
has_long_questions = long_questions_included == 1
long_question_proportion = np.where(has_long_questions, noisy_share, 0)

# Keep two decimal places
long_question_proportion = long_question_proportion.round(2)

# Assemble the table (column order is the CSV column order)
columns = {
    'Question_Difficulty': difficulty,
    'Total_Questions': total_questions,
    'Short_Questions_Included': short_questions_included,
    'Long_Questions_Included': long_questions_included,
    'Long_Question_Proportion': long_question_proportion,
}
data = pd.DataFrame(columns)

# Preview the first rows
print(data.head())

# Persist to CSV without the index column
data.to_csv('dataset.csv', index=False)


   Question_Difficulty  Total_Questions  Short_Questions_Included  \
0                    4               16                         0   
1                    5               11                         0   
2                    3               13                         1   
3                    5               12                         0   
4                    5               16                         0   

   Long_Questions_Included  Long_Question_Proportion  
0                        0                      0.00  
1                        1                      0.42  
2                        0                      0.00  
3                        0                      0.00  
4                        1                      0.46  


In [5]:
import pandas as pd
import numpy as np

# Generate a synthetic assignment dataset and write it to CSV.
# Every column is drawn independently; no column is derived from another.

# Set the number of rows to generate
num_rows = 500

# Output path, kept in one place so the confirmation message always matches
# the file that is written.
output_path = 'synthetic_assignment_data.csv'

# Seed the legacy global RNG for reproducible results. The draw order inside the
# dict below determines the values, so keep the column order stable.
np.random.seed(42)

data = {
    "assignment_difficulty": np.random.randint(1, 6, num_rows),  # integer, 1..5
    "active_assignments_count": np.random.randint(0, 8, num_rows),  # integer, 0..7
    "question_type_distribution": np.round(np.random.uniform(0.4, 0.75, num_rows), 2),  # uniform in [0.4, 0.75], 2 decimals
    "workload": np.round(np.random.uniform(1.5, 5.0, num_rows), 2),  # uniform in [1.5, 5.0], 2 decimals
    "historical_avg_completion_time": np.round(np.random.uniform(3.5, 7.0, num_rows), 2)  # uniform in [3.5, 7.0], 2 decimals
}

# Create a DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv(output_path, index=False)

# Report the real row count and path instead of hard-coded literals, so the
# message stays truthful when num_rows or output_path is changed.
save_message = f"Generated {len(df)} rows of synthetic data and saved to '{output_path}'"
print(save_message)

Generated 500 rows of synthetic data and saved to 'synthetic_assignment_data.csv'
