In [98]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [99]:
import sys
import os
# Make the sibling ``src`` directory importable (presumably where the
# ``utils`` package imported further down lives -- confirm against the repo).
# The path is resolved relative to the current working directory.
src_dir = os.path.abspath(os.path.join('..', 'src'))
# Guard the append so re-running this cell does not pile up duplicate
# entries in sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [100]:
# Load the raw survey workbook into a DataFrame. The path is relative to the
# notebook's working directory.
raw_path = "../data/raw/Dataset of AI Adoption Usage among Students in Indonesia Higher Education.xlsx"
data = pd.read_excel(raw_path)

In [101]:
# Keep only the first 535 rows of the sheet (positional slice).
# NOTE(review): 535 is hard-coded; presumably the rows after it are not
# survey responses -- confirm against the workbook.
df = data.iloc[:535]

In [102]:
# Report how many missing values each column contains.
missing_per_column = df.isna().sum()
print(missing_per_column)

Respondent            0
Gender                0
University            0
Level of Education    0
Province              0
Fields of Study       0
Type of AI            0
PE1                   0
PE2                   0
PE3                   0
PE4                   0
PE5                   0
PE6                   0
PE7                   0
PE8                   1
PE9                   1
PE10                  1
PE11                  2
PE12                  1
PE13                  1
PE14                  1
PE15                  1
PE16                  1
PE17                  2
PE18                  2
PE19                  4
PE20                  2
CU1                   1
CU2                   1
CU3                   1
CU4                   2
ATU1                  1
ATU2                  2
ATU3                  2
ATU4                  2
ATU5                  1
AUP1                  1
AUP2                  1
AUP3                  1
AUP4                  1
AUP5                  1
MIUA1           

In [103]:
# Count rows that are exact duplicates of an earlier row.
n_duplicate_rows = df.duplicated().sum()
print(n_duplicate_rows)

0


In [104]:
# Range check: print the minimum and maximum of every questionnaire item so
# out-of-range answers stand out.
# NOTE(review): in the recorded output ATU3 and ATU5 reach 5.0 while every
# other item tops out at 4.0 -- confirm whether 5 is a valid answer.
item_prefixes = ("PE", "CU", "ATU", "AUP", "MIUA")
scale_cols = [name for name in df.columns if name.startswith(item_prefixes)]
for col in scale_cols:
    answers = df[col]
    print(col, answers.min(), answers.max())

PE1 1.0 4.0
PE2 1.0 4.0
PE3 1.0 4.0
PE4 1.0 4.0
PE5 1.0 4.0
PE6 1.0 4.0
PE7 1.0 4.0
PE8 1.0 4.0
PE9 1.0 4.0
PE10 1.0 4.0
PE11 1.0 4.0
PE12 1.0 4.0
PE13 1.0 4.0
PE14 1.0 4.0
PE15 1.0 4.0
PE16 1.0 4.0
PE17 1.0 4.0
PE18 1.0 4.0
PE19 1.0 4.0
PE20 1.0 4.0
CU1 1.0 4.0
CU2 1.0 4.0
CU3 1.0 4.0
CU4 1.0 4.0
ATU1 1.0 4.0
ATU2 1.0 4.0
ATU3 1.0 5.0
ATU4 1.0 4.0
ATU5 1.0 5.0
AUP1 1.0 4.0
AUP2 1.0 4.0
AUP3 1.0 4.0
AUP4 1.0 4.0
AUP5 1.0 4.0
MIUA1 1.0 4.0
MIUA2 1.0 4.0
MIUA3 1.0 4.0
MIUA4 1.0 4.0
MIUA5 1.0 4.0


In [105]:
# Drop every row that has at least one missing value, then renumber the
# remaining rows 0..n-1. Without the renumbering the index keeps gaps where
# rows were removed, so any frame built later from a positional array (which
# gets a fresh 0..n-1 index) would no longer line up with ``df`` when joined
# on the index. reset_index also hands back a fresh frame, which avoids the
# SettingWithCopyWarning some pandas versions emit when columns are added to
# a filtered slice.
df = df.dropna().reset_index(drop=True)

In [106]:
from utils.mapping import gender_map, university_map, education_map, province_map, field_map, ai_map

# Each entry is (source column, new label column, lookup) and is applied in
# order, so the label columns are appended in the sequence listed here.
# Assuming the lookups are plain dicts, any value missing from a lookup
# becomes NaN in the label column -- verify against utils.mapping.
label_specs = [
    ("Gender", "Gender_Label", gender_map),
    ("University", "University_Label", university_map),
    ("Level of Education", "Education_Label", education_map),
    ("Province", "Province_Label", province_map),
    ("Fields of Study", "Field_Label", field_map),
    ("Type of AI", "AI_Label", ai_map),
]
for source_col, label_col, lookup in label_specs:
    df[label_col] = df[source_col].map(lookup)


In [107]:
# Build one composite score per construct as the row-wise mean of its items.
# An item column is "<prefix><digits>" (e.g. PE1, MIUA5). Matching on the bare
# prefix alone would also pick up derived columns such as "PE_Score" or
# "PE_S" when this cell is re-run after later cells, silently changing the
# mean; requiring a purely numeric suffix keeps the selection stable.
for construct in ("PE", "CU", "ATU", "AUP", "MIUA"):
    item_cols = [
        c for c in df.columns
        if c.startswith(construct) and c[len(construct):].isdigit()
    ]
    df[f"{construct}_Score"] = df[item_cols].mean(axis=1)

In [108]:
# Standardise the five composite scores and attach them to ``df`` as extra
# columns.
score_cols = ["PE_Score", "CU_Score", "ATU_Score", "AUP_Score", "MIUA_Score"]
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[score_cols])

# fit_transform hands back a positional array, so the wrapper frame must reuse
# df's index. With a default 0..n-1 index, concat(axis=1) would align on
# labels: because rows were dropped earlier, values would land on the wrong
# respondents and extra all-NaN rows would appear.
scaled_df = pd.DataFrame(
    df_scaled,
    columns=["PE_S", "CU_S", "ATU_S", "AUP_S", "MIUA_S"],
    index=df.index,
)
# Drop scaled columns left over from a previous run of this cell so a re-run
# replaces them instead of adding duplicate column names.
df = pd.concat(
    [df.drop(columns=scaled_df.columns, errors="ignore"), scaled_df],
    axis=1,
)

In [109]:
# Write the processed frame to disk; index=False keeps the row index out of
# the CSV.
clean_path = "../data/processed/clean_dataset.csv"
df.to_csv(clean_path, index=False)