In [1]:
import sys
import os
import pandas as pd
# Make the sibling ../src directory importable; the project packages imported
# in the next cell (utils, analysis, visualization) are presumably located
# there. The membership check keeps the cell idempotent: re-running it does
# not pile up duplicate entries in sys.path.
src_dir = os.path.abspath(os.path.join('..', 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [2]:
from utils.preprocessing import preprocess_data, clean_for_clustering
from analysis.clustering import elbow_method, silhouette_scan, fit_kmeans, cluster_profile, save_cluster_profile
from analysis.statistical_modeling import run_anova, run_tukey, run_ols
from visualization.plots import plot_distribution, plot_crosstab_heatmap, plot_correlation

In [3]:
# Run the project's preprocessing step on the raw Excel dataset and keep the
# result as `df`, which every later cell reads.
# NOTE(review): the second path is presumably where the cleaned CSV gets
# written -- confirm in utils.preprocessing.
raw_dataset_path = "../data/raw/Dataset of AI Adoption Usage among Students in Indonesia Higher Education.xlsx"
processed_csv_path = "../data/processed/clean_dataset_pipeline.csv"
df = preprocess_data(raw_dataset_path, processed_csv_path)

In [4]:
# Build the clustering input: ask clean_for_clustering for the scaled variant
# and reduce the returned object to its underlying `.values`.
cluster_input = clean_for_clustering(df, use_scaled=True)
X = cluster_input.values

# Scan candidate cluster counts: elbow over k_min=2..k_max=8, then a
# silhouette scan called with bounds 2 and 6 (printed as a plain-text table).
elbow_df = elbow_method(
    X,
    k_min=2,
    k_max=8,
    fig_name="elbow_composites",
    reports_base="../reports",
)
silhouette_table = silhouette_scan(X, 2, 6)
print(silhouette_table)

# Fit the final model with k=2, which had the highest silhouette score in the
# recorded scan output, and profile the resulting clusters against `df`.
# NOTE(review): assumes `labels` lines up row-for-row with `df`; verify that
# clean_for_clustering does not drop rows.
km, labels = fit_kmeans(X, k=2)
profile = cluster_profile(df, labels)

# Kept as the final expression so the returned CSV path is shown as the
# cell's output.
save_cluster_profile(
    profile,
    name="cluster_profile_k2",
    reports_base="../reports",
)

   k  silhouette
0  2    0.307700
1  3    0.296609
2  4    0.278916
3  5    0.268974
4  6    0.235206


'../reports\\tables\\cluster_profile_k2.csv'

In [5]:
# All three statistical helpers are pointed at the same reports directory.
stats_reports_base = "../reports"

# ANOVA of PE_Score across the Gender_Label categories.
res_anova = run_anova(
    df,
    "PE_Score ~ C(Gender_Label)",
    name="anova_pe_gender",
    reports_base=stats_reports_base,
)

# Tukey comparison of PE_Score between Gender_Label groups.
res_tukey = run_tukey(
    df,
    dv="PE_Score",
    group="Gender_Label",
    name="tukey_pe_gender",
    reports_base=stats_reports_base,
)

# OLS regression of AUP_Score on the PE, CU and ATU composite scores.
res_ols = run_ols(
    df,
    "AUP_Score ~ PE_Score + CU_Score + ATU_Score",
    name="ols_aup",
    reports_base=stats_reports_base,
)

In [6]:
# Distribution of the AI_Label column.
plot_distribution(
    df,
    "AI_Label",
    title="AI Tools Used",
    name="ai_tools_used",
    reports_base="../reports",
)

# Cross-tabulation heatmap of Gender_Label against AI_Label.
plot_crosstab_heatmap(
    df,
    "Gender_Label",
    "AI_Label",
    name="gender_vs_ai",
    reports_base="../reports",
)

# Correlation plot over the five composite score columns. Kept as the final
# expression so the returned figure path is shown as the cell's output.
composite_columns = ["PE_Score", "CU_Score", "ATU_Score", "AUP_Score", "MIUA_Score"]
plot_correlation(
    df,
    composite_columns,
    name="corr_composites",
    reports_base="../reports",
)

'../reports\\figures\\corr_composites.png'