# In [None]:
import os
import threading
from queue import Queue

import pandas as pd
from pymongo import MongoClient

# Connection settings for the MongoDB instance used by this script.
# Each value can be overridden with an environment variable of the same name;
# the fallbacks are the values that were previously hard-coded here.
# NOTE(review): the fallback credentials are still committed in source - set
# MONGO_USERNAME / MONGO_PASSWORD in the environment outside local testing.
MONGO_HOST = os.environ.get('MONGO_HOST', 'localhost')
# The port must be an integer, so the environment string is converted.
MONGO_PORT = int(os.environ.get('MONGO_PORT', '27017'))
MONGO_DB = os.environ.get('MONGO_DB', 'admin')
MONGO_USERNAME = os.environ.get('MONGO_USERNAME', 'root')
MONGO_PASSWORD = os.environ.get('MONGO_PASSWORD', 'mongo_password')

def get_database():
    """Return a handle to the ``MONGO_DB`` database on the configured server.

    The credentials are passed to ``MongoClient`` as keyword arguments rather
    than being interpolated into a ``mongodb://`` URI, so a username or
    password containing reserved characters such as ``@``, ``:`` or ``/``
    does not need percent-escaping and cannot corrupt the connection string.
    No authentication database is specified, as before, so pymongo's default
    applies.

    Returns:
        The pymongo database object for ``MONGO_DB``. A new client is created
        on every call.
    """
    client = MongoClient(
        host=MONGO_HOST,
        port=MONGO_PORT,
        username=MONGO_USERNAME,
        password=MONGO_PASSWORD,
    )
    return client[MONGO_DB]

def _collection_to_frame(collection, columns):
    """Read the given fields of every document in *collection* into a DataFrame."""
    projection = {column: 1 for column in columns}
    projection["_id"] = 0
    documents = list(collection.find({}, projection))
    # Passing ``columns`` explicitly keeps the expected columns present even
    # when the collection is empty, so later steps that address columns by
    # name (e.g. the merge on "User_ID") do not fail with a KeyError.
    return pd.DataFrame(documents, columns=columns)


def fetch_data_from_mongodb():
    """Load the activity and user logs from MongoDB.

    Returns:
        A ``(activity_data, user_data)`` tuple of DataFrames.
        ``activity_data`` has the columns ``User_ID`` and ``Component`` read
        from the ``activity_log`` collection; ``user_data`` has the columns
        ``User_ID`` and ``Month`` read from the ``user_log`` collection.
        Fields missing from a document show up as missing values.
    """
    db = get_database()
    activity_data = _collection_to_frame(db["activity_log"], ["User_ID", "Component"])
    user_data = _collection_to_frame(db["user_log"], ["User_ID", "Month"])
    return activity_data, user_data

def threaded_merge_data(activity_data, user_data):
    """Inner-join the two frames on ``User_ID`` inside a worker thread.

    The caller blocks until the worker has finished, then receives its result.

    Args:
        activity_data: DataFrame containing a ``User_ID`` column.
        user_data: DataFrame containing a ``User_ID`` column.

    Returns:
        The merged DataFrame.

    Raises:
        Exception: any exception raised by ``pd.merge`` in the worker is
            re-raised here in the calling thread.
    """
    results = Queue()

    def merge_task(a, u, q):
        # MERGE
        print("Starting threaded data merge...")
        try:
            outcome = (True, pd.merge(a, u, on="User_ID", how="inner"))
        except Exception as exc:
            # Hand the failure to the caller. Without this the queue would
            # stay empty and the ``get()`` below would block forever.
            outcome = (False, exc)
        q.put(outcome)

    merge_thread = threading.Thread(
        target=merge_task, args=(activity_data, user_data, results)
    )
    merge_thread.start()
    merge_thread.join()

    succeeded, payload = results.get()
    if not succeeded:
        raise payload
    return payload

def clean_data(merged_data):
    """Drop unwanted components and incomplete rows.

    Rows whose ``Component`` is ``"System"`` or ``"Folder"`` are removed, then
    every row that still contains a missing value is removed.

    Args:
        merged_data: DataFrame with a ``Component`` column.

    Returns:
        A new, filtered DataFrame; the input frame is not modified.
    """
    # REMOVE
    excluded_components = ["System", "Folder"]
    keep_mask = ~merged_data["Component"].isin(excluded_components)
    # managing missing values
    # ``dropna()`` returns a new frame. The in-place form on a filtered
    # selection is what triggers pandas' SettingWithCopyWarning.
    return merged_data[keep_mask].dropna()

def count_interactions(merged_data):
    """Count the rows sharing each (User_ID, Month, Component) combination.

    Returns a flat DataFrame with the three key columns plus a ``Count``
    column holding the size of each group.
    """
    group_keys = ["User_ID", "Month", "Component"]
    group_sizes = merged_data.groupby(group_keys).size()
    return group_sizes.reset_index(name='Count')

def reshape_data(count_data):
    """Pivot the long count table into one column per ``Component``.

    Rows of the result are indexed by (User_ID, Month) and the cells come
    from the ``Count`` column, combined with ``pivot_table``'s default
    aggregation. Combinations with no matching row are filled with 0.
    """
    # RESHAPE using pivot operation
    pivot_options = {
        "index": ["User_ID", "Month"],
        "columns": "Component",
        "values": "Count",
        "fill_value": 0,
    }
    return count_data.pivot_table(**pivot_options)

def save_pivot_data_to_mongo(pivot_data):
    """Replace the contents of the ``count_data`` collection with *pivot_data*.

    The frame's index is turned back into ordinary columns and each row is
    stored as one document. The existing collection is dropped first, so the
    collection afterwards holds exactly the rows of *pivot_data*.

    Args:
        pivot_data: DataFrame to persist.
    """
    db = get_database()
    records = pivot_data.reset_index().to_dict('records')
    collection = db["count_data"]
    collection.drop()
    # ``insert_many`` rejects an empty document list, so an empty table simply
    # leaves the freshly dropped collection empty instead of raising.
    if records:
        collection.insert_many(records)
    print("Pivot data saved to 'count_data' collection in MongoDB.")


# MAIN
# Load both logs from MongoDB, inner-join them on User_ID, then drop the
# "System"/"Folder" components and any rows with missing values.
activity_data, user_data = fetch_data_from_mongodb()
merged_data = threaded_merge_data(activity_data, user_data)
merged_data = clean_data(merged_data)

# COUNT
# Count rows per (User_ID, Month, Component), pivot to one column per
# Component, and store the table in the 'count_data' collection.
count_data = count_interactions(merged_data)
pivot_data = reshape_data(count_data)
save_pivot_data_to_mongo(pivot_data)