In [30]:
import pandas as pd
import numpy as np

In [31]:
df = pd.read_parquet('./data/processed/features_final.parquet')
df.head()

Unnamed: 0,student_id,oulad_avg_assessment_score,oulad_n_assessments,oulad_avg_clicks,oulad_active_days,registration_duration_days,score_w,score_norm,clicks_log,clicks_norm,...,correctness_num_ratings,attribution_majority_all_evidences,attribution_majority_all_steps,logic_majority_all_steps,agreement_majority_all_steps,is_low_agreement_hard_case,is_final_rated_evidence_for_step,answer_is_fully_attributable,answer_is_logically_correct,answer_is_fully_attributable_and_correct
0,11391,82.0,5.0,4.765306,40.0,1.354167e-12,82.0,0.792982,1.751858,0.764054,...,,,,,,,,,,
1,28400,66.400002,5.0,3.337209,80.0,1.354167e-12,66.400002,0.574035,1.467231,0.533008,...,,,,,,,,,,
2,30268,76.0,7.0,3.697368,12.0,1.203704e-12,76.0,0.708772,1.547002,0.597763,...,,,,,,,,,,
3,31604,76.0,5.0,3.254902,123.0,1.354167e-12,76.0,0.708772,1.448072,0.517456,...,,,,,,,,,,
4,32885,54.400002,5.0,2.9375,70.0,1.354167e-12,54.400002,0.405614,1.370546,0.454524,...,,,,,,,,,,


In [32]:
df['cognitive_efficiency'].value_counts

<bound method IndexOpsMixin.value_counts of 0        2.000000
1        0.819753
2        5.846154
3        0.612903
4        0.766197
           ...   
37628         NaN
37629         NaN
37630         NaN
37631         NaN
37632         NaN
Name: cognitive_efficiency, Length: 37633, dtype: float32>

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37633 entries, 0 to 37632
Columns: 108 entries, student_id to answer_is_fully_attributable_and_correct
dtypes: float32(106), string(2)
memory usage: 15.8 MB


In [34]:
df.columns

Index(['student_id', 'oulad_avg_assessment_score', 'oulad_n_assessments',
       'oulad_avg_clicks', 'oulad_active_days', 'registration_duration_days',
       'score_w', 'score_norm', 'clicks_log', 'clicks_norm',
       ...
       'correctness_num_ratings', 'attribution_majority_all_evidences',
       'attribution_majority_all_steps', 'logic_majority_all_steps',
       'agreement_majority_all_steps', 'is_low_agreement_hard_case',
       'is_final_rated_evidence_for_step', 'answer_is_fully_attributable',
       'answer_is_logically_correct',
       'answer_is_fully_attributable_and_correct'],
      dtype='object', length=108)

In [35]:
df.isnull().sum()

student_id                                     55
oulad_avg_assessment_score                   7867
oulad_n_assessments                          7867
oulad_avg_clicks                             7867
oulad_active_days                            7867
                                            ...  
is_low_agreement_hard_case                  32677
is_final_rated_evidence_for_step            32677
answer_is_fully_attributable                32677
answer_is_logically_correct                 32677
answer_is_fully_attributable_and_correct    32677
Length: 108, dtype: int64

In [36]:
null_summary = pd.DataFrame({
    "column": df.columns,
    "dtype": df.dtypes.values,
    "null_count": df.isnull().sum().values,
    "non_null_count": df.notnull().sum().values,
    "null_percentage": (df.isnull().mean() * 100).values
}).sort_values(by="null_percentage", ascending=False)

null_summary

Unnamed: 0,column,dtype,null_count,non_null_count,null_percentage
54,UD n.l,float32,34722,2911,92.264768
56,Pressure .,float32,34722,2911,92.264768
64,Pressure r,float32,34722,2911,92.264768
63,Pressure Caps,float32,34722,2911,92.264768
62,Pressure Shift.1,float32,34722,2911,92.264768
...,...,...,...,...,...
13,effort_norm,float32,7867,29766,20.904525
14,cognitive_efficiency,float32,7867,29766,20.904525
1,oulad_avg_assessment_score,float32,7867,29766,20.904525
0,student_id,string[python],55,37578,0.146148


In [37]:
missing_ratio = df.isnull().mean()

low_missing = missing_ratio[(missing_ratio >= 0) & (missing_ratio <= 0.30)].index.tolist()
mid_missing = missing_ratio[(missing_ratio > 0.30) & (missing_ratio <= 0.80)].index.tolist()
high_missing = missing_ratio[(missing_ratio > 0.80) & (missing_ratio <= 0.95)].index.tolist()
extreme_missing = missing_ratio[(missing_ratio > 0.95) ].index.tolist()

print("Low missing (0-30%): ", len(low_missing))
print("Mid missing (30-80%): ", len(mid_missing))
print("High missing (80-95%): ", len(high_missing))
print("Extreme missing (>95%): ", len(extreme_missing))

Low missing (0-30%):  16
Mid missing (30-80%):  0
High missing (80-95%):  92
Extreme missing (>95%):  0


In [38]:
numeric_cols = df.select_dtypes(
    include=["number"]
).columns.tolist()

In [39]:
low_missing_numeric = [
    c for c in low_missing
    if c in numeric_cols
]

In [40]:
df[low_missing_numeric] = (
    df[low_missing_numeric]
    .astype("float64")
    .replace({pd.NA: np.nan})
)

In [41]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor

In [42]:
imputer = IterativeImputer(
    estimator = ExtraTreesRegressor(n_estimators=200,random_state=42,n_jobs=1),
    max_iter=10,
    initial_strategy='constant',
    fill_value=1,
    random_state=0
)

In [43]:
df[low_missing_numeric] = imputer.fit_transform(df[low_missing_numeric])

In [44]:
for col in mid_missing:
    df[f"{col}_is_present"] = df[col].notnull().astype(int)
    df[col] = df[col].fillna(df[col].median())

In [45]:
for col in high_missing:
    df[f"{col}_is_present"] = df[col].notnull().astype(int)
    df[col] = df[col].fillna(0)

  df[f"{col}_is_present"] = df[col].notnull().astype(int)
  df[f"{col}_is_present"] = df[col].notnull().astype(int)
  df[f"{col}_is_present"] = df[col].notnull().astype(int)
  df[f"{col}_is_present"] = df[col].notnull().astype(int)
  df[f"{col}_is_present"] = df[col].notnull().astype(int)
  df[f"{col}_is_present"] = df[col].notnull().astype(int)
  df[f"{col}_is_present"] = df[col].notnull().astype(int)
  df[f"{col}_is_present"] = df[col].notnull().astype(int)
  df[f"{col}_is_present"] = df[col].notnull().astype(int)
  df[f"{col}_is_present"] = df[col].notnull().astype(int)
  df[f"{col}_is_present"] = df[col].notnull().astype(int)
  df[f"{col}_is_present"] = df[col].notnull().astype(int)
  df[f"{col}_is_present"] = df[col].notnull().astype(int)
  df[f"{col}_is_present"] = df[col].notnull().astype(int)
  df[f"{col}_is_present"] = df[col].notnull().astype(int)
  df[f"{col}_is_present"] = df[col].notnull().astype(int)
  df[f"{col}_is_present"] = df[col].notnull().astype(int)
  df[f"{col}_i

In [46]:
df.drop(columns=extreme_missing, inplace=True)

In [47]:
from lightgbm import LGBMRegressor
from sklearn.feature_selection import mutual_info_regression, RFE
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor

In [48]:
target = 'cognitive_efficiency'
id = 'student_id'

In [49]:
X = df.drop(columns=[id, target, 'dataset'])
y = df[target]

print('Initial feature count: ', X.shape[1])

Initial feature count:  197


In [50]:
#Layer 1 : Mutual Information
mi = mutual_info_regression(X, y, random_state=42)
mi_scores = pd.Series(mi, index=X.columns)

In [51]:
threshold = np.percentile(mi_scores, 70)   
selected_l1 = mi_scores[mi_scores >= threshold].index

X_l1 = X[selected_l1]

print("Layer 1 features:", X_l1.shape[1])

Layer 1 features: 59


In [63]:
#Layer 2 : Recursive Feature Elimination
rfe = RFE(
        estimator=LGBMRegressor(
            n_estimators=200,
            learning_rate=0.05,
            random_state=42,
            n_jobs=-1), n_features_to_select=30,
            step=0.1
)

rfe.fit(X_l1, y)

selected_l2 = X_l1.columns[rfe.support_]
X_l2 = X_l1[selected_l2]
print("Layer 2 features: ", X_l2.shape[1])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006026 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3330
[LightGBM] [Info] Number of data points in the train set: 37633, number of used features: 59
[LightGBM] [Info] Start training from score 2.928649
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005354 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3318
[LightGBM] [Info] Number of data points in the train set: 37633, number of used features: 54
[LightGBM] [Info] Start training from score 2.928649
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004551 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

In [65]:
from sklearn.linear_model import LassoCV

scaler = StandardScaler()
X_l2_scaled = scaler.fit_transform(X_l2)

lasso = LassoCV(
            alphas=np.logspace(-4, 0, 50),
            cv=5,
            random_state=42,
            n_jobs=-1
)

lasso.fit(X_l2_scaled, y)

lasso_coef = pd.Series(lasso.coef_, index=X_l2.columns)
selected_l3 = lasso_coef[lasso_coef !=0].index

X_l3 = X_l2[selected_l3]
print("Layer 3 features: ", X_l3.shape[1])


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Layer 3 features:  26


  model = cd_fast.enet_coordinate_descent(


In [66]:
#Layer 4: Boruta
from sklearn.ensemble import RandomForestRegressor

boruta = BorutaPy(
    estimator = RandomForestRegressor(n_estimators=500,
                                      random_state=42,
                                      n_jobs=-1),
                                      n_estimators='auto',
                                      perc=100,
                                      random_state=42
)

boruta.fit(X_l3.values, y.values)

selected_l4 = X_l3.columns[boruta.support_]
X_final = X_l3[selected_l4]

print("Final features: ", X_final.shape[1])

Final features:  5


In [69]:
X_final.isnull().sum()

oulad_avg_assessment_score    0
oulad_active_days             0
score_w                       0
score_norm                    0
days_norm                     0
dtype: int64

In [70]:
df1 = pd.read_parquet("./data/processed/features_final.parquet")

df1.shape

(37633, 108)

In [71]:
df1 = df1[df1["cognitive_efficiency"].notna()].reset_index(drop=True)

df1.shape

(29766, 108)

In [76]:
df1.columns

Index(['oulad_avg_assessment_score', 'oulad_n_assessments', 'oulad_avg_clicks',
       'oulad_active_days', 'registration_duration_days', 'score_w',
       'score_norm', 'clicks_log', 'clicks_norm', 'days_norm',
       ...
       'correctness_num_ratings', 'attribution_majority_all_evidences',
       'attribution_majority_all_steps', 'logic_majority_all_steps',
       'agreement_majority_all_steps', 'is_low_agreement_hard_case',
       'is_final_rated_evidence_for_step', 'answer_is_fully_attributable',
       'answer_is_logically_correct',
       'answer_is_fully_attributable_and_correct'],
      dtype='object', length=106)

In [73]:
drop_cols = [
    "student_id",   
    "dataset"       
]

df1.drop(columns=drop_cols, inplace=True, errors="ignore")

In [78]:
anfis_features = [
    "oulad_avg_assessment_score",
    "oulad_active_days",
    "score_w",
    "score_norm",
    "days_norm"
]

target = "cognitive_efficiency"

In [79]:
anfis_df = df1[anfis_features + [target].copy()]
anfis_df.shape

(29766, 6)

In [81]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [82]:
iter_imputer = IterativeImputer(
    estimator=ExtraTreesRegressor(
        n_estimators=200,
        random_state=42,
        n_jobs=-1),
        initial_strategy='constant',
        fill_value=0,
        max_iter=10,
        random_state=42)

In [83]:
anfis_df[anfis_features] = iter_imputer.fit_transform(
    anfis_df[anfis_features]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anfis_df[anfis_features] = iter_imputer.fit_transform(


In [84]:
anfis_df.isnull().sum()

oulad_avg_assessment_score    0
oulad_active_days             0
score_w                       0
score_norm                    0
days_norm                     0
cognitive_efficiency          0
dtype: int64

In [85]:
for col in anfis_features:
    lower = anfis_df[col].quantile(0.01)
    upper = anfis_df[col].quantile(0.99)
    anfis_df[col] = anfis_df[col].clip(lower, upper)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anfis_df[col] = anfis_df[col].clip(lower, upper)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anfis_df[col] = anfis_df[col].clip(lower, upper)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anfis_df[col] = anfis_df[col].clip(lower, upper)
A value is trying to be set on a copy of a slice from a Da

In [86]:
mm_scaler = MinMaxScaler(feature_range=(0, 1))

anfis_df[anfis_features] = mm_scaler.fit_transform(anfis_df[anfis_features])

anfis_df[anfis_features].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anfis_df[anfis_features] = mm_scaler.fit_transform(anfis_df[anfis_features])


Unnamed: 0,oulad_avg_assessment_score,oulad_active_days,score_w,score_norm,days_norm
count,29766.0,29766.0,29766.0,29766.0,29766.0
mean,0.675702,0.27594,0.675702,0.675702,0.276297
std,0.189608,0.231731,0.189608,0.189608,0.232
min,0.0,0.0,0.0,0.0,0.0
25%,0.596491,0.095923,0.596491,0.596491,0.096052
50%,0.708772,0.218007,0.708772,0.708772,0.2183
75%,0.796491,0.396773,0.796491,0.796491,0.397306
max,1.0,1.0,1.0,1.0,1.0


In [87]:
X = anfis_df[anfis_features].values
y = anfis_df[target].values

In [88]:
X_train_pool, X_test, y_train_pool, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train_pool, y_train_pool, test_size=0.20, random_state=42)

X_train.shape, X_val.shape, X_test.shape

((19049, 5), (4763, 5), (5954, 5))

In [89]:
print("ANFIS PREPROCESSING CHECK")
print("-------------------------")
print("Min value:", X_train.min())
print("Max value:", X_train.max())
print("NaNs:", np.isnan(X_train).sum())
print("Features:", anfis_features)

ANFIS PREPROCESSING CHECK
-------------------------
Min value: 0.0
Max value: 1.0
NaNs: 0
Features: ['oulad_avg_assessment_score', 'oulad_active_days', 'score_w', 'score_norm', 'days_norm']


In [90]:
#ANFIS Architecture: Layer 1 - Fuzzification and membership function
def gaussian_mf(x, mean, sigma):
    return np.exp(-0.5 * ((x - mean)/sigma) ** 2)

In [91]:
#ANFIS Architecture: Layer 2 - Rule Firing Strength
def compute_rule_strength(x, mf_params, n_mfs):
    strength = 1.0
    tmp = x 
    for i in range(len(mf_params)):
        mf_idx = tmp % n_mfs
        tmp //= n_mfs
        params = mf_params[i][mf_idx]
        strength *= gaussian_mf(x[i], params['mean'], params['sigma'])

    return strength


In [92]:
#ANFIS Architecture: Layer 3 - Normalization
def normalize_firing_strength(W):
    return W / (np.sum(W, axis=1, keepdims=True) + 1e-8)

In [93]:
#ANFIS Architecture: Layer 4 - Consequent Layer
def build_consequent_matrix(X, W):
    X_aug = np.hstack([X, np.ones((X.shape[0], 1))])
    Phi = []

    for i in range(X.shape[0]):
        row = []
        for i in range(W.shape[1]):
            row.extend(w[i, r] * X_aug[i])
        Phi.append(row)

    return np.array(Phi)

In [94]:
#ANFIS Architecture: Layer 5 - Defuzzification
def compute_output(Phi, consequents):
    return Phi @ consequents

In [None]:
class SimpleANFIS:
    def __init__(self, n_inputs, n_mfs=3):
        self.n_inputs = n_inputs
        self.n_mfs = n_mfs
        self.rules = n_mfs ** n_inputs

        self.mf_params = {
            i: {
                j: {
                    "mean": np.random.uniform(0, 1),
                    "sigma": np.random.uniform(0.1, 0.3)
                }
                for j in range(n_mfs)
            }
            for i in range(n_inputs)
        }

        self.consequents =

In [None]:
#Layer 1 - 3
def forward(self, X):
    firing = []

    for x in X:
        rule_strengths = []
        for rule in range(self.rules):
            strength = 1.0
            tmp = rule
            for i in range(self.n_inputs):
                mf_idx = tmp % self.n_mfs
                tmp //= self.n_mfs
                params = self.mf_params[i][mf_idx]
                strength *= gaussian_mf(x[i], params["mean", params["sigma"]])
            rule_strengths.append(strength)
        firing.append(rule_strengths)
    firing = np.array(firing)
    return normalize_firing_strength(firing)


In [None]:
#TRAIN (Layer 4 learning via Least Squares)
def fit(self, X, y):
    W = self.forward(X)
    Phi = build_consequent_matrix(X, W)
    self.consequents = np.linalg.pinv(Phi) @ y

In [None]:
#LAYER 5
def predict(self, X):
    W = self.forward(X)
    Phi = build_consequent_matrix(X, W)
    return compute_output(Phi, self.consequents)

In [121]:
[m for m in dir(ANFIS_CE) if not m.startswith("__")]

[]

In [105]:
anfis = SimpleANFIS(
    n_inputs=X_train.shape[1],
    n_mfs=3
)

anfis.fit(X_train, y_train)

y_val_pred = anfis.predict(X_val)
y_test_pred = anfis.predict(X_test)

AttributeError: 'SimpleANFIS' object has no attribute 'fit'

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

print("ANFIS Validation")
print("MAE:", mean_absolute_error(y_val, y_val_pred))
print("R2 :", r2_score(y_val, y_val_pred))

print("\nANFIS Test")
print("MAE:", mean_absolute_error(y_test, y_test_pred))
print("R2 :", r2_score(y_test, y_test_pred))