In [1]:
from pymongo import MongoClient
import pandas as pd
import threading
from queue import Queue

MONGO_HOST = 'localhost'
MONGO_PORT = 27017
MONGO_DB = 'admin'
MONGO_USERNAME = 'root'
MONGO_PASSWORD = 'mongo_password'

def get_database():
    uri = f"mongodb://{MONGO_USERNAME}:{MONGO_PASSWORD}@{MONGO_HOST}:{MONGO_PORT}/"
    client = MongoClient(uri)
    return client[MONGO_DB]

def fetch_data_from_mongodb():
    db = get_database()
    print("Fetching activity_log data...")
    activity_cursor = db["activity_log"].find({}, {"User_ID": 1, "Component": 1, "_id": 0})
    activity_data = pd.DataFrame(list(activity_cursor))

    user_cursor = db["user_log"].find({}, {"User_ID": 1, "Month": 1, "_id": 0})
    user_data = pd.DataFrame(list(user_cursor))

    return activity_data, user_data

def threaded_merge_data(activity_data, user_data):
    q = Queue()
    def merge_task(a, u, q):
        # MERGE
        merged_data = pd.merge(a, u, on="User_ID", how="inner")
        q.put(merged_data)
    merge_thread = threading.Thread(target=merge_task, args=(activity_data, user_data, q))
    merge_thread.start()
    merge_thread.join()
    merged_data = q.get()
    return merged_data

def clean_data(merged_data):
    # REMOVE
    merged_data = merged_data[~merged_data["Component"].isin(["System", "Folder"])]
    merged_data.dropna(inplace=True)
    return merged_data

def count_interactions(merged_data):
    print("Counting interactions per User_ID, Month, and Component...")
    count_data = merged_data.groupby(["User_ID", "Month", "Component"]).size().reset_index(name='Count')
    return count_data

def reshape_data(count_data):
    print("Reshaping data into pivot format...")
    pivot_data = count_data.pivot_table(index=["User_ID", "Month"], columns="Component", values="Count", fill_value=0)
    return pivot_data


# MAIN
activity_data, user_data = fetch_data_from_mongodb()
merged_data = threaded_merge_data(activity_data, user_data)
merged_data = clean_data(merged_data)

# COUNT
count_data = count_interactions(merged_data)

# Before RESHAPE
print("\n=== Before Reshape (count_data, first 5 rows) ===")
print(count_data.head())

pivot_data = reshape_data(count_data)

# After RESHAPE
print("\n=== After Reshape (pivot_data, first 5 rows) ===")
print(pivot_data.head())

Fetching activity_log data...
Starting threaded data merge...
Cleaning merged data: removing 'System' and 'Folder' and NaNs...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data.dropna(inplace=True)


Counting interactions per User_ID, Month, and Component...

=== Before Reshape (count_data, first 5 rows) ===
   User_ID  Month   Component  Count
0        1      9  Assignment   1350
1        1      9  Attendence     10
2        1      9        Book     10
3        1      9      Course    620
4        1      9     Lecture    400
Reshaping data into pivot format...

=== After Reshape (pivot_data, first 5 rows) ===
Component      Assignment  Attendence   Book   Course  Feedback  Lecture  \
User_ID Month                                                              
1       9          1350.0        10.0   10.0    620.0       0.0    400.0   
        10        47250.0       350.0  350.0  21700.0       0.0  14000.0   
        11        42795.0       317.0  317.0  19654.0       0.0  12680.0   
        12         6885.0        51.0   51.0   3162.0       0.0   2040.0   
2       9          3458.0         0.0   28.0    924.0       0.0    518.0   

Component      Manual   Page  Project  Questionna