In [4]:
from pymongo import MongoClient
import pandas as pd
import threading
from queue import Queue

MONGO_HOST = 'localhost'
MONGO_PORT = 27017
MONGO_DB = 'admin'
MONGO_USERNAME = 'root'
MONGO_PASSWORD = 'mongo_password'

def get_database():
    uri = f"mongodb://{MONGO_USERNAME}:{MONGO_PASSWORD}@{MONGO_HOST}:{MONGO_PORT}/"
    client = MongoClient(uri)
    return client[MONGO_DB]

def fetch_data_from_mongodb():
    db = get_database()
    print("Fetching activity_log data...")
    activity_cursor = db["activity_log"].find({}, {"User_ID": 1, "Component": 1, "_id": 0})
    activity_data = pd.DataFrame(list(activity_cursor))

    user_cursor = db["user_log"].find({}, {"User_ID": 1, "Month": 1, "_id": 0})
    user_data = pd.DataFrame(list(user_cursor))

    return activity_data, user_data

def threaded_merge_data(activity_data, user_data):
    q = Queue()
    def merge_task(a, u, q):
        # MERGE
        print("Starting threaded data merge...")
        merged_data = pd.merge(a, u, on="User_ID", how="inner")
        q.put(merged_data)
    merge_thread = threading.Thread(target=merge_task, args=(activity_data, user_data, q))
    merge_thread.start()
    merge_thread.join()
    merged_data = q.get()
    return merged_data

def clean_data(merged_data):
    # REMOVE
    print("Cleaning merged data: removing unnecessary rows and NaNs...")
    merged_data = merged_data[~merged_data["Component"].isin(["System", "Folder"])]
    merged_data.dropna(inplace=True)
    return merged_data

def count_interactions(merged_data):
    print("Counting interactions per User_ID, Month, and Component...")
    count_data = merged_data.groupby(["User_ID", "Month", "Component"]).size().reset_index(name='Count')
    return count_data

def reshape_data(count_data):
    print("Reshaping data into pivot format...")
    # RESHAPE
    pivot_data = count_data.pivot_table(index=["User_ID", "Month"], columns="Component", values="Count", fill_value=0)
    return pivot_data

def save_pivot_data_to_mongo(pivot_data):
    db = get_database()
    records = pivot_data.reset_index().to_dict('records')
    db["count_data"].drop()
    db["count_data"].insert_many(records)
    print("Pivot data saved to 'count_data' collection in MongoDB.")


# MAIN
activity_data, user_data = fetch_data_from_mongodb()
merged_data = threaded_merge_data(activity_data, user_data)
merged_data = clean_data(merged_data)

# COUNT
count_data = count_interactions(merged_data)
pivot_data = reshape_data(count_data)
save_pivot_data_to_mongo(pivot_data)

Fetching activity_log data...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data.dropna(inplace=True)


Pivot data saved to 'count_data' collection in MongoDB.


In [6]:
pip install scipy

Collecting scipy
  Downloading scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl.metadata (60 kB)
Downloading scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl (30.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.3/30.3 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: scipy
Successfully installed scipy-1.13.1
Note: you may need to restart the kernel to use updated packages.


In [10]:
import tkinter as tk
from tkinter import ttk

def calculate_statistics(pivot_data, target_components):
    # a. For each month
    stats_by_month = {}
    for month, group in pivot_data.groupby(level='Month'):
        month_stats = {}
        for comp in target_components:
            if comp in group.columns:
                comp_values = group[comp].dropna()
                # mean
                comp_mean = comp_values.mean() if not comp_values.empty else None
                # median
                comp_median = comp_values.median() if not comp_values.empty else None
                # mode
                val_counts = comp_values.value_counts()
                comp_mode = val_counts.index[0] if not val_counts.empty else None

                month_stats[comp] = {
                    "mean": comp_mean,
                    "median": comp_median,
                    "mode": comp_mode
                }
            else:
                month_stats[comp] = {"mean": None, "median": None, "mode": None}
        stats_by_month[month] = month_stats

    # b. For the entire 13-week academic semester
    all_data_stats = {}
    all_data = pivot_data.reset_index(drop=False)
    for comp in target_components:
        if comp in all_data.columns:
            comp_values = all_data[comp].dropna()
            
            comp_mean = comp_values.mean() if not comp_values.empty else None
            comp_median = comp_values.median() if not comp_values.empty else None
            
            val_counts = comp_values.value_counts()
            comp_mode = val_counts.index[0] if not val_counts.empty else None
            
            all_data_stats[comp] = {
                "mean": comp_mean,
                "median": comp_median,
                "mode": comp_mode
            }
        else:
            all_data_stats[comp] = {"mean": None, "median": None, "mode": None}

    return stats_by_month, all_data_stats

# OUTPUT STATISTICS
target_components = ["Quiz", "Lecture", "Assignment", "Attendance", "Survey"]
stats_by_month, all_data_stats = calculate_statistics(pivot_data, target_components)


######### GUI #########

def show_statistics():
    selected = month_var.get()
    for i in stat_tree.get_children():
        stat_tree.delete(i)
    
    if selected == "Entire Semester":
        data_source = all_data_stats
    else:
        try:
            month = int(selected)
            data_source = stats_by_month.get(month, {})
        except ValueError:
            data_source = {}
    
    for comp in target_components:
        comp_stats = data_source.get(comp, {"mean": None, "median": None, "mode": None})
        stat_tree.insert("", "end", values=(comp, comp_stats["mean"], comp_stats["median"], comp_stats["mode"]))

    feedback_label.config(text="Statistics updated.", fg="green")

root = tk.Tk()
root.title("Output Statistics")

frame = tk.Frame(root)
frame.pack(pady=10, padx=10)

tk.Label(frame, text="Select Month:").grid(row=0, column=0, padx=5, pady=5, sticky="e")

month_options = list(stats_by_month.keys())
month_options_str = [str(m) for m in month_options]
month_options_str.append("Entire Semester")

month_var = tk.StringVar(value="Entire Semester")
month_combo = ttk.Combobox(frame, textvariable=month_var, values=month_options_str, state="readonly")
month_combo.grid(row=0, column=1, padx=5, pady=5, sticky="w")

show_btn = tk.Button(frame, text="Show Statistics", command=show_statistics)
show_btn.grid(row=0, column=2, padx=5, pady=5)

stat_columns = ("Component", "Mean", "Median", "Mode")
stat_tree = ttk.Treeview(root, columns=stat_columns, show="headings")
for col in stat_columns:
    stat_tree.heading(col, text=col)
stat_tree.pack(pady=10, padx=10, fill="x")

feedback_label = tk.Label(root, text="")
feedback_label.pack(pady=5)

root.mainloop()
