In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
from hmmlearn import hmm

# Read the helpdesk event log into a DataFrame
data = pd.read_csv('/mnt/data/helpdesk.csv')

# Peek at the leading rows (the result is not stored anywhere)
data.head()

# Parse the timestamp column, then order events chronologically within each case
data = (
    data
    .assign(timestamp=pd.to_datetime(data['timestamp']))
    .sort_values(by=['case_id', 'timestamp'])
)

# Give every distinct activity label an integer code, numbered in the order
# the labels first appear in the sorted log
activities = list(data['activity'].unique())
activity_map = dict(zip(activities, range(len(activities))))
data['activity_code'] = data['activity'].map(activity_map)

# Split at the case level so that all events of one case land in the same
# partition: 25% of the cases are held out for testing, then 25% of the
# remaining cases are held out for validation.
case_ids = data['case_id'].unique()
train_ids, test_ids = train_test_split(case_ids, test_size=0.25, random_state=42)
train_ids, val_ids = train_test_split(train_ids, test_size=0.25, random_state=42)

# Select the event rows belonging to each partition's cases
train_data, val_data, test_data = (
    data[data['case_id'].isin(ids)] for ids in (train_ids, val_ids, test_ids)
)

# Prepare sequences for training HMM
def prepare_sequences(data):
    """Turn an event log into one (activities, gaps) pair per case.

    Args:
        data: DataFrame with 'case_id', 'activity_code' and a datetime
            'timestamp' column. Rows are consumed in the order given, so
            they should already be sorted chronologically within each case.

    Returns:
        A list with one tuple per case, in the order each case first
        appears in ``data``. Each tuple is ``(activity_seq, timestamp_seq)``:
        ``activity_seq`` is an array of the case's activity codes and
        ``timestamp_seq`` holds the seconds elapsed between consecutive
        events, so it has one element fewer than ``activity_seq`` (and is
        empty for a single-event case).
    """
    sequences = []
    # A single groupby pass replaces building a full-frame boolean mask for
    # every case. sort=False keeps cases in order of first appearance, which
    # is the order Series.unique() would give.
    for _, case_data in data.groupby('case_id', sort=False):
        activity_seq = case_data['activity_code'].values
        gaps = case_data['timestamp'].diff().fillna(pd.Timedelta(seconds=0))
        # The first diff has no predecessor, so it is dropped.
        timestamp_seq = gaps.dt.total_seconds().values[1:]
        sequences.append((activity_seq, timestamp_seq))
    return sequences

# Convert each partition's event rows into per-case (activities, gaps) sequences
train_sequences, val_sequences, test_sequences = map(
    prepare_sequences, (train_data, val_data, test_data)
)

# Train a Hidden Markov Model (HMM) on the sequences.
# Every event becomes one observation row: [activity_code, seconds since the
# previous event of the same case]. `lengths` tells the model where one case
# ends and the next begins inside the concatenated matrix.
model = hmm.GaussianHMM(n_components=10, covariance_type='diag', n_iter=100, random_state=42)
activity_lengths = [len(seq[0]) for seq in train_sequences]
activity_data = np.concatenate([seq[0] for seq in train_sequences])
# prepare_sequences leaves out the gap of each case's first event, so every
# gap array is one element shorter than its activity array. Re-insert a 0.0
# gap for that first event; without it the two feature columns differ in
# length and np.column_stack raises ValueError before training can start.
timestamp_data = np.concatenate([
    np.concatenate(([0.0], seq[1])) if len(seq[1]) == len(seq[0]) - 1 else seq[1]
    for seq in train_sequences
])

model.fit(np.column_stack([activity_data, timestamp_data]), lengths=activity_lengths)

# Predict the next activity and timestamp for each test case
def predict_next_event(model, activity_seq, timestamp_seq):
    """Predict the activity code and time gap of the event that follows a sequence.

    Args:
        model: A fitted HMM exposing ``decode``, ``transmat_`` and ``means_``,
            trained on rows of ``[activity_code, gap_seconds]``.
        activity_seq: Activity codes of the observed events.
        timestamp_seq: Gaps in seconds between the observed events. May have
            the same length as ``activity_seq`` or be one element shorter
            (no gap for the first event); in the latter case a 0.0 gap is
            inserted for the first event.

    Returns:
        ``(next_activity, next_timestamp)``: the activity code nearest to the
        predicted next hidden state's mean activity feature, and that state's
        mean gap in seconds.
    """
    activity_seq = np.asarray(activity_seq)
    timestamp_seq = np.asarray(timestamp_seq, dtype=float)
    # Align the two feature columns; stacking arrays of different length
    # raises ValueError.
    if len(timestamp_seq) == len(activity_seq) - 1:
        timestamp_seq = np.concatenate(([0.0], timestamp_seq))

    _, state_seq = model.decode(np.column_stack([activity_seq, timestamp_seq]))
    last_state = state_seq[-1]
    # The most probable successor of the final decoded state is the predicted
    # *hidden state*, not an activity code.
    next_state = int(np.argmax(model.transmat_[last_state]))
    # Translate that state back into observation space: feature 0 of its mean
    # is the expected activity code (rounded to the nearest integer code) and
    # feature 1 is the expected gap.
    next_activity = int(np.rint(model.means_[next_state, 0]))
    next_timestamp = model.means_[next_state, 1]
    return next_activity, next_timestamp

# Run the predictor over every test case; predictions[i] belongs to test_sequences[i].
# NOTE(review): the complete case is fed to the predictor, while
# calculate_metrics scores against the case's second event - confirm which
# prefix/target pairing is intended.
predictions = [
    predict_next_event(model, case_activities, case_gaps)
    for case_activities, case_gaps in test_sequences
]

# Calculate accuracy and MAE
def calculate_metrics(predictions, test_sequences):
    """Score predictions against the second event of each test case.

    Args:
        predictions: One ``(activity_code, gap_seconds)`` pair per test case,
            in the same order as ``test_sequences``.
        test_sequences: One ``(activity_seq, timestamp_seq)`` pair per case.
            The reference activity is ``activity_seq[1]`` and the reference
            gap is ``timestamp_seq[0]``.

    Returns:
        ``(activity_accuracy, timestamp_mae)``: the fraction of scored cases
        whose predicted activity equals the reference, and the mean absolute
        gap error in seconds. Cases that lack a second event are left out of
        both figures; if no case can be scored both values are NaN.
    """
    activity_hits = []
    gap_errors = []
    for pred, (activity_seq, timestamp_seq) in zip(predictions, test_sequences):
        # A single-event case has nothing to compare against; indexing it
        # would raise IndexError and abort the whole evaluation.
        if len(activity_seq) < 2 or len(timestamp_seq) < 1:
            continue
        activity_hits.append(activity_seq[1] == pred[0])
        gap_errors.append(abs(timestamp_seq[0] - pred[1]))

    if not activity_hits:
        # np.mean of an empty list also yields NaN, but emits a RuntimeWarning.
        return float('nan'), float('nan')

    activity_accuracy = np.mean(activity_hits)
    timestamp_mae = np.mean(gap_errors)
    return activity_accuracy, timestamp_mae

activity_accuracy, timestamp_mae = calculate_metrics(predictions, test_sequences)

print(f'Next Activity Prediction Accuracy: {activity_accuracy:.2f}')
print(f'Timestamp Prediction MAE: {timestamp_mae:.2f}')

# Visualize the results
# Pair every prediction with its reference values once, up front, instead of
# rebuilding these lists for every activity. Cases without a second event
# have no reference (indexing them raises IndexError) and are left out.
scored_pairs = [
    (seq[0][1], seq[1][0], pred[0], pred[1])
    for seq, pred in zip(test_sequences, predictions)
    if len(seq[0]) > 1 and len(seq[1]) > 0
]
true_codes = [pair[0] for pair in scored_pairs]
true_gaps = [pair[1] for pair in scored_pairs]
pred_codes = [pair[2] for pair in scored_pairs]
pred_gaps = [pair[3] for pair in scored_pairs]

fig, ax = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: distribution of reference gaps vs. predicted gaps
sns.histplot(true_gaps, ax=ax[0], label='True Timestamps')
sns.histplot(pred_gaps, ax=ax[0], label='Predicted Timestamps', color='red')
ax[0].set_title('Timestamp Prediction Distribution')
ax[0].legend()

# Right panel: accuracy restricted to the cases whose reference activity is
# the given one. An activity that never occurs as a reference gets NaN.
per_activity_accuracy = []
for activity in activities:
    code = activity_map[activity]
    hits = [true == pred for true, pred in zip(true_codes, pred_codes) if true == code]
    per_activity_accuracy.append(np.mean(hits) if hits else np.nan)

sns.barplot(x=activities, y=per_activity_accuracy, ax=ax[1])
ax[1].set_title('Activity Prediction Accuracy')
# Rotate the labels on the bar chart explicitly rather than on whichever
# axes happens to be current.
ax[1].tick_params(axis='x', labelrotation=90)
plt.tight_layout()
plt.show()
