In [3]:
import os

import pandas as pd
from pymongo import MongoClient

# DB settings.
# Each value may be overridden through an environment variable of the same
# name, so real credentials do not have to be stored in the notebook. The
# fallbacks are the original hard-coded values, so behavior is unchanged when
# no such variable is set.
MONGO_HOST = os.environ.get('MONGO_HOST', 'localhost')
MONGO_PORT = int(os.environ.get('MONGO_PORT', '27017'))  # env values are strings
MONGO_DB = os.environ.get('MONGO_DB', 'admin')
MONGO_USERNAME = os.environ.get('MONGO_USERNAME', 'root')
MONGO_PASSWORD = os.environ.get('MONGO_PASSWORD', 'mongo_password')

def get_database():
    """Create a MongoDB client from the module-level settings and return a database.

    The host, port and credentials are handed to ``MongoClient`` as keyword
    arguments instead of being interpolated into a ``mongodb://`` URI. In a
    URI, a username or password containing reserved characters (for example
    ``@``, ``:``, ``/`` or ``%``) must be percent-encoded first; passing the
    values as keywords avoids that requirement.

    Returns:
        The database named ``MONGO_DB`` on a newly created ``MongoClient``.
    """
    client = MongoClient(
        host=MONGO_HOST,
        port=MONGO_PORT,
        username=MONGO_USERNAME,
        password=MONGO_PASSWORD,
    )
    return client[MONGO_DB]

def load_pivot_data_from_mongo():
    """Read the whole ``count_data`` collection into a pandas DataFrame.

    The query has an empty filter and a projection that excludes ``_id``.
    When the resulting frame has both a ``User_ID`` and a ``Month`` column,
    those two columns become the index; otherwise the frame is returned
    with its default index.

    Returns:
        pandas.DataFrame: one row per document fetched.
    """
    collection = get_database()["count_data"]
    records = list(collection.find({}, {"_id": 0}))
    frame = pd.DataFrame(records)

    index_keys = ["User_ID", "Month"]
    # Guard clause: leave the frame untouched unless both key columns exist.
    if not all(key in frame.columns for key in index_keys):
        return frame
    return frame.set_index(index_keys)

def evaluate_thresholds(df, components, thresholds=(0.1, 0.15, 0.2)):
    """Report how many rows survive symmetric quantile trimming of each column.

    For every component that is a column of ``df`` and every threshold ``t``,
    the ``t`` and ``1 - t`` quantiles of that column are computed and the
    share of rows whose value lies in the closed interval between them is
    recorded.

    Args:
        df: DataFrame holding one column per component.
        components: Iterable of column names to evaluate. Names that are not
            columns of ``df`` are skipped.
        thresholds: Iterable of lower-tail fractions ``t``. The default is an
            immutable tuple so it cannot be shared and mutated across calls.

    Returns:
        dict: ``{component: {"<low>-<high>%": "<rate>"}}`` where the label is
        built from the rounded percentages and the rate is formatted with
        ``:.2%``. For a DataFrame with zero rows the rate is ``"nan%"``.
    """
    total_rows = len(df)
    results = {}
    for comp in components:
        if comp not in df.columns:
            continue
        column = df[comp]
        per_threshold = {}
        for t in thresholds:
            # round() rather than int(): 0.29 * 100 is 28.999999999999996 in
            # floating point, which int() would truncate to 28.
            low_pct = int(round(t * 100))
            high_pct = int(round((1 - t) * 100))
            if total_rows == 0:
                # No rows: the rate is undefined, so report NaN instead of
                # dividing by zero.
                retention_rate = float("nan")
            else:
                q_low = column.quantile(t)
                q_high = column.quantile(1 - t)
                # Both bounds are inclusive; rows with NaN compare False and
                # therefore count as not retained.
                kept = ((column >= q_low) & (column <= q_high)).sum()
                retention_rate = kept / total_rows
            per_threshold[f"{low_pct}-{high_pct}%"] = f"{retention_rate:.2%}"
        results[comp] = per_threshold
    return results

# Columns to evaluate; evaluate_thresholds skips any name that is not a column
# of the loaded frame.
correlation_components = [
    "Assignment",
    "Quiz",
    "Lecture",
    "Book",
    "Project",
    "Course",
]

pivot_data = load_pivot_data_from_mongo()
thresholds_results = evaluate_thresholds(
    pivot_data,
    correlation_components,
)

# The nested dict becomes a table: one column per component, one row per
# quantile band.
print("Data Retention Rate at Different Thresholds:")
print(pd.DataFrame(thresholds_results))


Data Retention Rate at Different Thresholds:
       Assignment    Quiz Lecture    Book Project  Course
10-90%     79.86%  80.04%  79.86%  79.86%  91.17%  79.86%
15-85%     69.96%  69.96%  70.14%  69.96%  91.17%  69.96%
20-80%     60.07%  60.07%  60.07%  60.07%  91.17%  60.07%
