## Setup

In [None]:
!pip install pandas
!pip install pymongo
!pip install ijson

## Import libraries

In [1]:
import sys
sys.path.append('../')

import pandas as pd
import os, json, re, pprint
from bson import json_util
import ijson
import time
from configs.config import data as data_path, cache as cache_path

## Assign input and output data paths

In [2]:
# Output: CSV file name and its full destination inside the cache directory.
out_file_name = "activities.csv"
out_path = cache_path + out_file_name

# Input: name of the cleaned JSON export, looked up under data_path.
in_file_name = "cleaned_activities.json"

# Prepare data

* Read the cleaned JSON data
* Read it as a stream, because the data is extremely large
* Select the relevant fields
* Save to data/prep folder

In [3]:
def read_json_to_df(data_path, in_file_name):
    """Load a JSON array file into a pandas DataFrame.

    The file is parsed incrementally with ``ijson``: every element of the
    top-level JSON array (prefix ``'item'``) becomes one row. Note that all
    parsed elements are still collected in a list before the DataFrame is
    built, so the full data set must fit in memory.

    Args:
        data_path: Directory that contains the input file.
        in_file_name: Name of the JSON file inside ``data_path``.

    Returns:
        A ``pandas.DataFrame`` built from the list of parsed array elements.
    """
    # os.path.join only inserts a separator when data_path lacks one, so a
    # directory given with or without a trailing slash resolves correctly.
    json_file_path = os.path.join(data_path, in_file_name)

    # Binary mode: ijson consumes bytes and handles the decoding itself, which
    # avoids depending on the platform's default text encoding.
    with open(json_file_path, 'rb') as json_file:
        json_data = list(ijson.items(json_file, 'item'))

    return pd.DataFrame(json_data)

## Read raw data

In [4]:
# Parse the cleaned activities file into the working DataFrame.
adf = read_json_to_df(data_path=data_path, in_file_name=in_file_name)

## Convert raw data into desired format

In [10]:
# Columns of the activities table, in output order.
acols = ["event_time", "activity", "case", "task", "processing_time"]


def _field_or_missing(value, key):
    """Look up ``key`` in ``value`` if it is a dict; fall back to 'Missing' otherwise."""
    return value.get(key, 'Missing') if isinstance(value, dict) else "Missing"


# Flatten the nested dict columns into plain columns, using 'Missing' when the
# cell is not a dict or the key is absent.
adf['case'] = adf['unique_id'].apply(_field_or_missing, args=('CaseID',))
adf['task'] = adf['activity_specifications'].apply(_field_or_missing, args=('task_name',))
adf['activity'] = adf['activity_specifications'].apply(_field_or_missing, args=('activity_alias_name',))

# Default processing_time to 1: fill gaps when the column exists, otherwise
# create it. fillna must NOT be called with inplace=True here, because that
# returns None and assigning None would erase the whole column.
if 'processing_time' in adf.columns:
    adf['processing_time'] = adf['processing_time'].fillna(1)
else:
    adf['processing_time'] = 1

# Convert 'event_time' to datetime (missing values are replaced with the
# current time) and render it as a string in the format below.
date_format = '%Y-%m-%d %H:%M:%S'  # Change this to your desired date format
adf['event_time'] = pd.to_datetime(adf['event_time'].fillna(pd.to_datetime('now'))).dt.strftime(date_format)

## Save transformed raw data to activities.csv file

In [None]:
# Make sure the cache directory exists; exist_ok=True turns this into a no-op
# when it is already there and avoids the check-then-create race.
os.makedirs(cache_path, exist_ok=True)
# Write only the selected activity columns to the CSV destination defined in
# the path-configuration cell (out_path = cache_path + out_file_name).
adf[acols].to_csv(out_path, index=False)