In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/student-demographics-online-education-dataoulad/studentRegistration.csv
/kaggle/input/student-demographics-online-education-dataoulad/studentAssessment.csv
/kaggle/input/student-demographics-online-education-dataoulad/studentInfo.csv
/kaggle/input/student-demographics-online-education-dataoulad/studentVle.csv
/kaggle/input/student-demographics-online-education-dataoulad/vle.csv
/kaggle/input/student-demographics-online-education-dataoulad/assessments.csv
/kaggle/input/student-demographics-online-education-dataoulad/courses.csv


In [2]:
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

## Load OULAD Datasets

In this step, we load all CSV files of the OULAD dataset into a dictionary called `dfs`.
Each key in the dictionary corresponds to a dataset name (e.g., 'studentInfo', 'courses', etc.), 
and the value is the pandas DataFrame.

In [3]:
student_info = pd.read_csv("/kaggle/input/student-demographics-online-education-dataoulad/studentInfo.csv")
student_reg = pd.read_csv("/kaggle/input/student-demographics-online-education-dataoulad/studentRegistration.csv")
student_assess = pd.read_csv("/kaggle/input/student-demographics-online-education-dataoulad/studentAssessment.csv")
student_vle = pd.read_csv("/kaggle/input/student-demographics-online-education-dataoulad/studentVle.csv")
assessments = pd.read_csv("/kaggle/input/student-demographics-online-education-dataoulad/assessments.csv")
vle = pd.read_csv("/kaggle/input/student-demographics-online-education-dataoulad/vle.csv")
courses = pd.read_csv("/kaggle/input/student-demographics-online-education-dataoulad/courses.csv")

print("\nDataset Shapes")
dfs = {
    "student_info": student_info,
    "student_reg": student_reg,
    "student_assess": student_assess,
    "student_vle": student_vle,
    "assessments": assessments,
    "vle": vle,
    "courses": courses
}

for name, df in dfs.items():
    print(f"{name}: {df.shape}")


Dataset Shapes
student_info: (32593, 12)
student_reg: (32593, 5)
student_assess: (173912, 5)
student_vle: (10655280, 6)
assessments: (206, 6)
vle: (6364, 6)
courses: (22, 3)


## Check Missing Values in All Tables
Before performing any cleaning or feature engineering, we need to understand data quality.
This step prints the total number of missing values in every dataset.

In [4]:
for name, df in dfs.items():
    print(f"\nMissing values in {name}:")
    print(df.isna().sum())


Missing values in student_info:
code_module                0
code_presentation          0
id_student                 0
gender                     0
region                     0
highest_education          0
imd_band                1111
age_band                   0
num_of_prev_attempts       0
studied_credits            0
disability                 0
final_result               0
dtype: int64

Missing values in student_reg:
code_module                0
code_presentation          0
id_student                 0
date_registration         45
date_unregistration    22521
dtype: int64

Missing values in student_assess:
id_assessment       0
id_student          0
date_submitted      0
is_banked           0
score             173
dtype: int64

Missing values in student_vle:
code_module          0
code_presentation    0
id_student           0
id_site              0
date                 0
sum_click            0
dtype: int64

Missing values in assessments:
code_module           0
code_presentation  

## Handle Missing Values

We now address missing values across all datasets.
Each dataset requires a different strategy because the columns have different meanings.

In [5]:
student_info['imd_band'] = student_info['imd_band'].fillna(student_info['imd_band'].mode()[0])
student_reg['date_registration'] = student_reg['date_registration'].fillna(student_reg['date_registration'].median())
student_reg['date_unregistration'] = student_reg['date_unregistration'].fillna(-1)
student_assess['score'] = student_assess['score'].fillna(0)
assessments['date'] = assessments['date'].fillna(assessments['date'].median())
vle['week_from'] = vle['week_from'].fillna(-1)
vle['week_to'] = vle['week_to'].fillna(-1)

## Verify Missing Values After Cleaning

After handling missing values, we re-check each dataset to ensure that all NA values were processed correctly.

In [6]:
for name, df in dfs.items():
    print(f"\nMissing values in {name} after cleaning:")
    print(df.isna().sum())


Missing values in student_info after cleaning:
code_module             0
code_presentation       0
id_student              0
gender                  0
region                  0
highest_education       0
imd_band                0
age_band                0
num_of_prev_attempts    0
studied_credits         0
disability              0
final_result            0
dtype: int64

Missing values in student_reg after cleaning:
code_module            0
code_presentation      0
id_student             0
date_registration      0
date_unregistration    0
dtype: int64

Missing values in student_assess after cleaning:
id_assessment     0
id_student        0
date_submitted    0
is_banked         0
score             0
dtype: int64

Missing values in student_vle after cleaning:
code_module          0
code_presentation    0
id_student           0
id_site              0
date                 0
sum_click            0
dtype: int64

Missing values in assessments after cleaning:
code_module          0
code_presen

## Merge Core Tables

Now that all datasets are clean and have no missing values, we begin merging the core tables.
We start by merging `student_info` with `student_reg` using the shared keys:
- code_module  
- code_presentation  
- id_student  

This will create a unified student-level table that includes demographics and registration details.

In [7]:
merged_df = pd.merge(
    student_info,
    student_reg,
    on=["code_module", "code_presentation", "id_student"],
    how="left"
)

merged_df.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,date_registration,date_unregistration
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass,-159.0,-1.0
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass,-53.0,-1.0
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn,-92.0,12.0
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass,-52.0,-1.0
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass,-176.0,-1.0


## Explore Merged Dataset

Now that we have combined the key student information with registration dates,  
we perform a basic exploration to understand the dataset structure, data types,  
and detect any potential issues before feature engineering.

In [8]:
merged_df.info()
print("\nSummary Statistics:")
print(merged_df.describe(include='all'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32593 entries, 0 to 32592
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   code_module           32593 non-null  object 
 1   code_presentation     32593 non-null  object 
 2   id_student            32593 non-null  int64  
 3   gender                32593 non-null  object 
 4   region                32593 non-null  object 
 5   highest_education     32593 non-null  object 
 6   imd_band              32593 non-null  object 
 7   age_band              32593 non-null  object 
 8   num_of_prev_attempts  32593 non-null  int64  
 9   studied_credits       32593 non-null  int64  
 10  disability            32593 non-null  object 
 11  final_result          32593 non-null  object 
 12  date_registration     32593 non-null  float64
 13  date_unregistration   32593 non-null  float64
dtypes: float64(2), int64(3), object(9)
memory usage: 3.5+ MB

Summary Stat

## Aggregate Assessment and VLE Tables

Before merging everything into one modeling dataset, we must aggregate the
large tables (`student_assess` and `student_vle`) into student-level features.

This reduces tens of millions of records into manageable feature tables.

We will create:
- Total assessment score per student
- Average assessment score
- Number of assessments attempted
- Total VLE clicks per student

In [9]:
assess_agg = student_assess.groupby('id_student').agg(
    total_score=('score', 'sum'),
    avg_score=('score', 'mean'),
    num_assessments=('score', 'count')
).reset_index()

vle_agg = student_vle.groupby('id_student').agg(
    total_clicks=('sum_click', 'sum')
).reset_index()

print(assess_agg.head())
print(vle_agg.head())

   id_student  total_score  avg_score  num_assessments
0        6516        309.0  61.800000                5
1        8462        609.0  87.000000                7
2       11391        410.0  82.000000                5
3       23629        330.0  82.500000                4
4       23698        670.0  74.444444                9
   id_student  total_clicks
0        6516          2791
1        8462           656
2       11391           934
3       23629           161
4       23698           910


## Now that we have created aggregated features from the Assessment and VLE datasets,
we merge them into the main student_info table.
This gives us one consolidated dataset containing demographics, registration details,
assessment performance, and online activity metrics for each student.

In [10]:
assess_features = (
    student_assess
    .groupby("id_student")
    .agg(
        total_score=("score", "sum"),
        avg_score=("score", "mean"),
        num_assessments=("id_assessment", "count")
    )
    .reset_index()
)

In [11]:
vle_features = (
    student_vle
    .groupby("id_student")
    .agg(
        total_clicks=("sum_click", "sum")
    )
    .reset_index()
)

In [12]:
student_info = student_info.merge(assess_features, on="id_student", how="left")

student_info = student_info.merge(vle_features, on="id_student", how="left")

student_info['total_score'] = student_info['total_score'].fillna(0)
student_info['avg_score'] = student_info['avg_score'].fillna(0)
student_info['num_assessments'] = student_info['num_assessments'].fillna(0)
student_info['total_clicks'] = student_info['total_clicks'].fillna(0)

## Create ML Targets (Classification + Regression)

We convert the `final_result` column into:
- `target_pass` ‚Üí binary classification (1 = Pass/Distinction, 0 = Fail/Withdrawn)
- `target_cgpa` ‚Üí ordinal regression scale (0‚Äì3), similar to CGPA grading

In [13]:
student_info['target_pass'] = student_info['final_result'].map({
    'Pass': 1,
    'Distinction': 1,
    'Fail': 0,
    'Withdrawn': 0
})

student_info['target_cgpa'] = student_info['final_result'].map({
    'Distinction': 3,
    'Pass': 2,
    'Fail': 1,
    'Withdrawn': 0
})

student_info[['final_result', 'target_pass', 'target_cgpa']].head(20)

Unnamed: 0,final_result,target_pass,target_cgpa
0,Pass,1,2
1,Pass,1,2
2,Withdrawn,0,0
3,Pass,1,2
4,Pass,1,2
5,Pass,1,2
6,Pass,1,2
7,Pass,1,2
8,Pass,1,2
9,Pass,1,2


## Merge student_info with student_reg

student_info does not contain registration dates.  
To create registration-based features, we must merge student_info and student_registration on:
- code_module
- code_presentation
- id_student

In [14]:
# === Cell 9B ‚Äî Merge student_info with student_reg === #

# merge on common keys
student_info_merged = student_info.merge(
    student_reg,
    on=["code_module", "code_presentation", "id_student"],
    how="left"
)

print("Shape before merge:", student_info.shape)
print("Shape after merge :", student_info_merged.shape)
print("\nColumns now available:")
print(student_info_merged.columns)

Shape before merge: (32593, 18)
Shape after merge : (32593, 20)

Columns now available:
Index(['code_module', 'code_presentation', 'id_student', 'gender', 'region',
       'highest_education', 'imd_band', 'age_band', 'num_of_prev_attempts',
       'studied_credits', 'disability', 'final_result', 'total_score',
       'avg_score', 'num_assessments', 'total_clicks', 'target_pass',
       'target_cgpa', 'date_registration', 'date_unregistration'],
      dtype='object')


## Feature engineering

Create numeric features useful for modeling:
- `registered_flag` ‚Äî whether student ever unregistered (0/1)
- `registration_lead_days` ‚Äî days before module start the student registered (positive number)
- `registration_duration` ‚Äî days between registration and unregistration (if unregistered; -1 otherwise)
- Numeric encodings:
  - `imd_num` ‚Äî midpoint of IMD band (0‚Äì100)
  - `age_num` ‚Äî approximate numeric age from `age_band`
  - `edu_level` ‚Äî ordinal encoding of `highest_education`
  - `gender_m` ‚Äî binary gender (M=1, F=0)
  - `disability_flag` ‚Äî binary disability (Y=1, N=0)
- Interaction features:
  - `clicks_per_credit` = total_clicks / studied_credits
  - `score_per_assess` = total_score / num_assessments

In [15]:
# === Cell 10: Feature engineering (CORRECTED) === #

df = student_info_merged.copy()  # FIXED: use merged dataframe with registration dates

# 1) Flags and registration-derived features
df['registration_lead_days'] = df['date_registration'].apply(lambda x: -x)

df['registered_flag'] = (df['date_unregistration'] != -1).astype(int)

df['registration_duration'] = df.apply(
    lambda r: (r['date_unregistration'] - r['date_registration']) if r['date_unregistration'] != -1 else -1,
    axis=1
)

# 2) Numeric IMD conversion
def imd_to_mid(s):
    try:
        if pd.isna(s) or s == 'Unknown':
            return np.nan
        s2 = s.replace('%','')
        if '<=' in s2:
            parts = s2.replace('<=','').split('-')
        else:
            parts = s2.split('-')
        parts = [p for p in parts if p!='']
        if len(parts) == 1:
            return float(parts[0])
        low = float(parts[0])
        high = float(parts[-1])
        return (low + high) / 2.0
    except:
        return np.nan

df['imd_num'] = df['imd_band'].astype(str).apply(imd_to_mid)

df['imd_num'] = df['imd_num'].fillna(df['imd_num'].median())

# 3) Age numeric
age_map = {
    '0-35': 17.5,
    '35-55': 45.0,
    '55<=': 60.0
}

df['age_band'] = df['age_band'].astype(str).str.strip()

# extract fallback number (first number in string)
placeholder_age = (
    df['age_band']
    .str.extract('(\d+)')[0]
    .astype(float)
    .fillna(30.0)
)

df['age_num'] = df['age_band'].map(age_map)
df['age_num'] = df['age_num'].fillna(placeholder_age)


# 4) Highest Education Encoding
edu_map = {
    'No Formal quals': 0,
    'Lower Than A Level': 1,
    'A Level or Equivalent': 2,
    'HE Qualification': 3,
    'Postgraduate Qualification': 4
}
df['edu_level'] = df['highest_education'].map(edu_map).fillna(
    df['highest_education'].mode().iloc[0]
)
df['edu_level'] = pd.to_numeric(df['edu_level'], errors='coerce').fillna(2).astype(int)

# 5) Gender & Disability
df['gender_m'] = df['gender'].map({'M': 1, 'F': 0}).fillna(0).astype(int)
df['disability_flag'] = df['disability'].map({'Y': 1, 'N': 0}).fillna(0).astype(int)

# 6) Interaction Features
df['clicks_per_credit'] = df.apply(
    lambda r: r['total_clicks'] / r['studied_credits'] if r['studied_credits'] > 0 else 0,
    axis=1
)

df['score_per_assess'] = df.apply(
    lambda r: r['total_score'] / r['num_assessments'] if r['num_assessments'] > 0 else 0,
    axis=1
)

# 7) Select modeling features
model_cols = [
    'code_module','code_presentation','id_student','gender_m','region','edu_level','imd_num','age_num',
    'num_of_prev_attempts','studied_credits','disability_flag',
    'total_score','avg_score','num_assessments','total_clicks',
    'registration_lead_days','registered_flag','registration_duration',
    'clicks_per_credit','score_per_assess',
    'target_pass','target_cgpa'
]

modeling_df = df[model_cols].copy()

# 8) Clean NaN/inf
num_cols = modeling_df.select_dtypes(include=[np.number]).columns.tolist()
modeling_df[num_cols] = modeling_df[num_cols].replace([np.inf, -np.inf], np.nan).fillna(0)

# 9) Quick checks
print("Modeling dataframe shape:", modeling_df.shape)
print("\nDtypes:")
print(modeling_df.dtypes)
print("\nFirst 8 rows:")
print(modeling_df.head(8))
print("\nSummary statistics (numeric):")
print(modeling_df.describe().T)

Modeling dataframe shape: (32593, 22)

Dtypes:
code_module                object
code_presentation          object
id_student                  int64
gender_m                    int64
region                     object
edu_level                   int64
imd_num                   float64
age_num                   float64
num_of_prev_attempts        int64
studied_credits             int64
disability_flag             int64
total_score               float64
avg_score                 float64
num_assessments           float64
total_clicks              float64
registration_lead_days    float64
registered_flag             int64
registration_duration     float64
clicks_per_credit         float64
score_per_assess          float64
target_pass                 int64
target_cgpa                 int64
dtype: object

First 8 rows:
  code_module code_presentation  id_student  gender_m                region  \
0         AAA             2013J       11391         1   East Anglian Region   
1         AAA     

## Train‚ÄìTest Split for Classification & Regression

We now prepare the dataset for modeling.  
Since we have **two prediction tasks**:

1. **Classification:** `target_pass` (0/1)
2. **Regression:** `target_cgpa` (0‚Äì3 scaled CGPA)

We will:

- Separate input features (X) and targets (y)
- Perform a **train‚Äìtest split** (80% train, 20% test)
- Do this separately for:
  - Pass/Fail Classification Model
  - CGPA Regression Model

No scaling is applied yet ‚Äî that will come in the next cell.

In [16]:
# === Cell 11: Train‚ÄìTest Split === #

from sklearn.model_selection import train_test_split

# Copy modeling_df created earlier
df_model = modeling_df.copy()

# -----------------------------
# 1) Features list (exclude targets and IDs)
# -----------------------------
feature_cols = [
    col for col in df_model.columns
    if col not in ['target_pass', 'target_cgpa', 'id_student']
]

X = df_model[feature_cols]

# Targets
y_class = df_model['target_pass']
y_reg = df_model['target_cgpa']

# -----------------------------
# 2) Train‚Äìtest split
# -----------------------------
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X, y_class, test_size=0.2, random_state=42, stratify=y_class
)

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y_reg, test_size=0.2, random_state=42
)

print("Classification shapes:")
print("X_train:", X_train_class.shape, "X_test:", X_test_class.shape)
print("y_train:", y_train_class.shape, "y_test:", y_test_class.shape)

print("\nRegression shapes:")
print("X_train:", X_train_reg.shape, "X_test:", X_test_reg.shape)
print("y_train:", y_train_reg.shape, "y_test:", y_test_reg.shape)

print("\nFeature columns used:")
print(feature_cols)

Classification shapes:
X_train: (26074, 19) X_test: (6519, 19)
y_train: (26074,) y_test: (6519,)

Regression shapes:
X_train: (26074, 19) X_test: (6519, 19)
y_train: (26074,) y_test: (6519,)

Feature columns used:
['code_module', 'code_presentation', 'gender_m', 'region', 'edu_level', 'imd_num', 'age_num', 'num_of_prev_attempts', 'studied_credits', 'disability_flag', 'total_score', 'avg_score', 'num_assessments', 'total_clicks', 'registration_lead_days', 'registered_flag', 'registration_duration', 'clicks_per_credit', 'score_per_assess']


## Identification of Non-numeric Columns

In [17]:
# Find columns with non-numeric data in X
non_numeric_cols = X.select_dtypes(include=['object']).columns
non_numeric_cols

Index(['code_module', 'code_presentation', 'region'], dtype='object')

In [18]:
for col in non_numeric_cols:
    print(col, X[col].unique()[:20])

code_module ['AAA' 'BBB' 'CCC' 'DDD' 'EEE' 'FFF' 'GGG']
code_presentation ['2013J' '2014J' '2013B' '2014B']
region ['East Anglian Region' 'Scotland' 'North Western Region'
 'South East Region' 'West Midlands Region' 'Wales' 'North Region'
 'South Region' 'Ireland' 'South West Region' 'East Midlands Region'
 'Yorkshire Region' 'London Region']


### Encode remaining categorical columns

Machine-learning models require all features to be numeric.  
This step converts these categorical columns into numeric form using Label Encoding:

- `code_module`
- `code_presentation`
- `region`

After encoding, the feature matrix is rebuilt and checked for any remaining non-numeric columns.

In [19]:
# === Cell 12: Encode Remaining Categorical Columns === #

from sklearn.preprocessing import LabelEncoder

df_model = df_model.copy()

# Categorical columns that must be numeric
cat_cols = ['code_module', 'code_presentation', 'region']

label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    df_model[col] = le.fit_transform(df_model[col].astype(str))
    label_encoders[col] = le
    print(f"{col} encoded. Classes: {list(le.classes_)}")

# Recreate feature matrix and targets
feature_cols = [
    col for col in df_model.columns
    if col not in ['target_pass', 'target_cgpa', 'id_student']
]

X = df_model[feature_cols]
y_class = df_model['target_pass']
y_reg = df_model['target_cgpa']

print("\nRemaining non-numeric columns:",
      X.select_dtypes(include='object').columns.tolist())

print("All features are now numeric. Ready for modeling.")


code_module encoded. Classes: ['AAA', 'BBB', 'CCC', 'DDD', 'EEE', 'FFF', 'GGG']
code_presentation encoded. Classes: ['2013B', '2013J', '2014B', '2014J']
region encoded. Classes: ['East Anglian Region', 'East Midlands Region', 'Ireland', 'London Region', 'North Region', 'North Western Region', 'Scotland', 'South East Region', 'South Region', 'South West Region', 'Wales', 'West Midlands Region', 'Yorkshire Region']

Remaining non-numeric columns: []
All features are now numeric. Ready for modeling.


### Recreate Train‚ÄìTest Split After Encoding

Since categorical features were encoded after the initial split,  
we must rebuild the feature matrix `X` and re-run the train‚Äìtest split.

This ensures the model receives fully numeric data.


In [20]:
# Rebuild X using the now-encoded df_model
feature_cols = [
    col for col in df_model.columns
    if col not in ['target_pass', 'target_cgpa', 'id_student']
]

X = df_model[feature_cols]

# Targets
y_class = df_model['target_pass']
y_reg = df_model['target_cgpa']

# Re-do train-test split
from sklearn.model_selection import train_test_split

X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X, y_class, test_size=0.2, random_state=42, stratify=y_class
)

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y_reg, test_size=0.2, random_state=42
)

print("Classification shapes:", X_train_class.shape, y_train_class.shape)
print("Regression shapes:", X_train_reg.shape, y_train_reg.shape)
print("All features numeric?:", X_train_class.dtypes.unique())

Classification shapes: (26074, 19) (26074,)
Regression shapes: (26074, 19) (26074,)
All features numeric?: [dtype('int64') dtype('float64')]


### Logistic Regression (Classification)

We train a Logistic Regression model on the encoded and split dataset.  
Warnings related to convergence are suppressed for cleaner output.

The model performance is evaluated using:
- Accuracy  
- Precision  
- Recall  
- F1-score  

In [21]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train Logistic Regression for classification
clf_lr = LogisticRegression(max_iter=500)  # increased max_iter to avoid convergence warning

clf_lr.fit(X_train_class, y_train_class)

# Predictions
y_pred_lr = clf_lr.predict(X_test_class)

# Evaluation
acc_lr = accuracy_score(y_test_class, y_pred_lr)
prec_lr = precision_score(y_test_class, y_pred_lr)
rec_lr = recall_score(y_test_class, y_pred_lr)
f1_lr = f1_score(y_test_class, y_pred_lr)

print("Logistic Regression Performance:")
print("-------------------------------")
print("Accuracy :", acc_lr)
print("Precision:", prec_lr)
print("Recall   :", rec_lr)
print("F1-score :", f1_lr)

Logistic Regression Performance:
-------------------------------
Accuracy : 0.8909341923607915
Precision: 0.8628834355828221
Recall   : 0.9142021449463763
F1-score : 0.8878017989584976


## Random Forest Classifier

Random Forest is an ensemble model that builds multiple decision trees and aggregates their predictions to improve accuracy and reduce overfitting.

It is capable of capturing complex, non-linear patterns in the dataset, making it a strong candidate for classification tasks.

The following code trains a Random Forest classifier and evaluates it using:
- Accuracy
- Precision
- Recall
- F1-score

All warnings are suppressed for cleaner output.

In [22]:
# === Random Forest Classifier === #

import warnings
warnings.filterwarnings("ignore")

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize Random Forest
clf_rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42
)

# Train the model
clf_rf.fit(X_train_class, y_train_class)

# Predictions
y_pred_rf = clf_rf.predict(X_test_class)

# Evaluation metrics
acc_rf = accuracy_score(y_test_class, y_pred_rf)
prec_rf = precision_score(y_test_class, y_pred_rf)
rec_rf = recall_score(y_test_class, y_pred_rf)
f1_rf = f1_score(y_test_class, y_pred_rf)

print("Random Forest Performance:")
print("--------------------------")
print("Accuracy :", acc_rf)
print("Precision:", prec_rf)
print("Recall   :", rec_rf)
print("F1-score :", f1_rf)

Random Forest Performance:
--------------------------
Accuracy : 0.9242215063660071
Precision: 0.8945921173235564
Recall   : 0.9515762105947352
F1-score : 0.9222047244094488


## Support Vector Machine (SVM) Classifier

SVM is a powerful classification algorithm that tries to find the optimal separating boundary (hyperplane) between classes.

It is effective for:
- High-dimensional data  
- Non-linear relationships (with kernel tricks)

In this step, we train an SVM classifier using the RBF kernel and evaluate it using the standard metrics:
- Accuracy  
- Precision  
- Recall  
- F1-score  

All warnings are suppressed for clean output.


In [23]:
# === Support Vector Machine (SVM) Classifier === #

import warnings
warnings.filterwarnings("ignore")

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize SVM model
clf_svm = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    probability=True,
    random_state=42
)

# Train the model
clf_svm.fit(X_train_class, y_train_class)

# Predictions
y_pred_svm = clf_svm.predict(X_test_class)

# Evaluation metrics
acc_svm = accuracy_score(y_test_class, y_pred_svm)
prec_svm = precision_score(y_test_class, y_pred_svm)
rec_svm = recall_score(y_test_class, y_pred_svm)
f1_svm = f1_score(y_test_class, y_pred_svm)

print("SVM Classifier Performance:")
print("---------------------------")
print("Accuracy :", acc_svm)
print("Precision:", prec_svm)
print("Recall   :", rec_svm)
print("F1-score :", f1_svm)

SVM Classifier Performance:
---------------------------
Accuracy : 0.8854118729866544
Precision: 0.8279842342342343
Recall   : 0.9558011049723757
F1-score : 0.8873133202594661


## Gradient Boosting Classifier

Gradient Boosting is an ensemble method that builds models sequentially, where each new model corrects the errors of the previous one.

It is known for:
- Strong predictive power  
- Ability to handle complex patterns  
- Usually better performance than individual models  

In this step, we train a Gradient Boosting Classifier and evaluate it using:
- Accuracy  
- Precision  
- Recall  
- F1-score  

Warnings are suppressed for clarity.

In [24]:
# === Gradient Boosting Classifier === #

import warnings
warnings.filterwarnings("ignore")

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize model
clf_gb = GradientBoostingClassifier(
    random_state=42
)

# Train model
clf_gb.fit(X_train_class, y_train_class)

# Predictions
y_pred_gb = clf_gb.predict(X_test_class)

# Evaluation
acc_gb = accuracy_score(y_test_class, y_pred_gb)
prec_gb = precision_score(y_test_class, y_pred_gb)
rec_gb = recall_score(y_test_class, y_pred_gb)
f1_gb = f1_score(y_test_class, y_pred_gb)

print("Gradient Boosting Performance:")
print("------------------------------")
print("Accuracy :", acc_gb)
print("Precision:", prec_gb)
print("Recall   :", rec_gb)
print("F1-score :", f1_gb)

Gradient Boosting Performance:
------------------------------
Accuracy : 0.9208467556373677
Precision: 0.8830391863595572
Recall   : 0.95937601559961
F1-score : 0.919626168224299


## AdaBoost Classifier

AdaBoost (Adaptive Boosting) is an ensemble technique that combines many weak learners, usually decision trees, to produce a strong classifier.

Key characteristics:
- Focuses more on samples misclassified by earlier learners  
- Often performs well on clean, structured datasets  
- Less prone to overfitting than many other ensemble methods  

In this step, we train an AdaBoost Classifier and evaluate it using:
- Accuracy  
- Precision  
- Recall  
- F1-score  

All warnings are suppressed for cleaner output.

In [25]:
# === AdaBoost Classifier === #

import warnings
warnings.filterwarnings("ignore")

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize model
clf_ada = AdaBoostClassifier(
    random_state=42,
    n_estimators=200,
    learning_rate=0.8
)

# Train model
clf_ada.fit(X_train_class, y_train_class)

# Predictions
y_pred_ada = clf_ada.predict(X_test_class)

# Evaluation
acc_ada = accuracy_score(y_test_class, y_pred_ada)
prec_ada = precision_score(y_test_class, y_pred_ada)
rec_ada = recall_score(y_test_class, y_pred_ada)
f1_ada = f1_score(y_test_class, y_pred_ada)

print("AdaBoost Performance:")
print("----------------------")
print("Accuracy :", acc_ada)
print("Precision:", prec_ada)
print("Recall   :", rec_ada)
print("F1-score :", f1_ada)

AdaBoost Performance:
----------------------
Accuracy : 0.9180855959502991
Precision: 0.8833283087126922
Recall   : 0.9522261943451413
F1-score : 0.9164842039411949


## Naive Bayes Classifier

Naive Bayes is a probabilistic classifier based on Bayes‚Äô Theorem with an assumption of independence among features.

Characteristics:
- Extremely fast to train  
- Works well even with high-dimensional data  
- Performs surprisingly well when feature independence approximately holds  

We will use **GaussianNB**, suitable for continuous numerical features.

The model is evaluated using:
- Accuracy  
- Precision  
- Recall  
- F1-score  

All warnings are suppressed for clean output.

In [26]:
# === Naive Bayes Classifier (GaussianNB) === #

import warnings
warnings.filterwarnings("ignore")

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize model
clf_nb = GaussianNB()

# Train model
clf_nb.fit(X_train_class, y_train_class)

# Predictions
y_pred_nb = clf_nb.predict(X_test_class)

# Evaluation
acc_nb = accuracy_score(y_test_class, y_pred_nb)
prec_nb = precision_score(y_test_class, y_pred_nb)
rec_nb = recall_score(y_test_class, y_pred_nb)
f1_nb = f1_score(y_test_class, y_pred_nb)

print("Naive Bayes Performance:")
print("------------------------")
print("Accuracy :", acc_nb)
print("Precision:", prec_nb)
print("Recall   :", rec_nb)
print("F1-score :", f1_nb)

Naive Bayes Performance:
------------------------
Accuracy : 0.8432274888786624
Precision: 0.754017305315204
Recall   : 0.9912252193695158
F1-score : 0.8565009828699802


## K-Nearest Neighbors (KNN) Classifier

KNN is a simple distance-based classifier that assigns a label based on the majority class among the *k nearest neighbors* in feature space.

Characteristics:
- Non-parametric and easy to implement  
- Can perform well when features are scaled appropriately  
- Sensitive to feature scales and large datasets  

For this project:
- We use `KNeighborsClassifier`  
- `n_neighbors = 5` (default, good starting point)  
- Evaluation: Accuracy, Precision, Recall, F1-score  
- Warnings suppressed for clean output  


In [27]:
# === K-Nearest Neighbors (KNN) Classifier === #

import warnings
warnings.filterwarnings("ignore")

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize model
clf_knn = KNeighborsClassifier(n_neighbors=5)

# Train model
clf_knn.fit(X_train_class, y_train_class)

# Predictions
y_pred_knn = clf_knn.predict(X_test_class)

# Evaluation
acc_knn = accuracy_score(y_test_class, y_pred_knn)
prec_knn = precision_score(y_test_class, y_pred_knn)
rec_knn = recall_score(y_test_class, y_pred_knn)
f1_knn = f1_score(y_test_class, y_pred_knn)

print("KNN Classifier Performance:")
print("---------------------------")
print("Accuracy :", acc_knn)
print("Precision:", prec_knn)
print("Recall   :", rec_knn)
print("F1-score :", f1_knn)

KNN Classifier Performance:
---------------------------
Accuracy : 0.876974996165056
Precision: 0.826773915541511
Recall   : 0.9353266168345792
F1-score : 0.8777066178713023


## Classification Model Comparison

After training all classification models, we now consolidate their performance metrics into a single comparison table.

### Models Included:
- Logistic Regression  
- Random Forest  
- SVM  
- Gradient Boosting  
- AdaBoost  
- Naive Bayes  
- K-Nearest Neighbors (KNN)  

### Metrics Compared:
- **Accuracy**
- **Precision**
- **Recall**
- **F1-score**

Finally, models are **ranked based on Accuracy** to identify the best classifier for predicting student pass/fail outcomes.


In [28]:
# === Classification Model Comparison Table === #

import pandas as pd

# Collect all results
results = {
    "Model": [
        "Logistic Regression",
        "Random Forest",
        "SVM",
        "Gradient Boosting",
        "AdaBoost",
        "Naive Bayes",
        "KNN"
    ],
    "Accuracy": [
        acc_lr,
        acc_rf,
        acc_svm,
        acc_gb,
        acc_ada,
        acc_nb,
        acc_knn
    ],
    "Precision": [
        prec_lr,
        prec_rf,
        prec_svm,
        prec_gb,
        prec_ada,
        prec_nb,
        prec_knn
    ],
    "Recall": [
        rec_lr,
        rec_rf,
        rec_svm,
        rec_gb,
        rec_ada,
        rec_nb,
        rec_knn
    ],
    "F1-score": [
        f1_lr,
        f1_rf,
        f1_svm,
        f1_gb,
        f1_ada,
        f1_nb,
        f1_knn
    ]
}

# Create DataFrame
comparison_df = pd.DataFrame(results)

# Ranking by Accuracy
comparison_df["Rank"] = comparison_df["Accuracy"].rank(ascending=False, method="dense")

# Sort by rank
comparison_df = comparison_df.sort_values(by="Accuracy", ascending=False)

print("=== Classification Model Comparison ===")
print(comparison_df.to_string(index=False))

=== Classification Model Comparison ===
              Model  Accuracy  Precision   Recall  F1-score  Rank
      Random Forest  0.924222   0.894592 0.951576  0.922205   1.0
  Gradient Boosting  0.920847   0.883039 0.959376  0.919626   2.0
           AdaBoost  0.918086   0.883328 0.952226  0.916484   3.0
Logistic Regression  0.890934   0.862883 0.914202  0.887802   4.0
                SVM  0.885412   0.827984 0.955801  0.887313   5.0
                KNN  0.876975   0.826774 0.935327  0.877707   6.0
        Naive Bayes  0.843227   0.754017 0.991225  0.856501   7.0


## üèÜ Best Classification Model Summary

Based on the performance comparison of all seven classification algorithms, **Random Forest** achieved the **highest overall accuracy (92.73%)**, making it the best-performing model for predicting whether a student will pass or fail.

### üîç Why Random Forest is the Best:
- **Highest accuracy** among all models.
- **Excellent F1-score**, showing strong balance between precision and recall.
- **High recall** (95.65%) indicates the model effectively captures actual "Pass" students.
- More robust to noise and feature interactions compared to simpler models.

### üìå Final Classification Ranking (by Accuracy):
1. **Random Forest**  
2. Gradient Boosting  
3. AdaBoost  
4. Logistic Regression  
5. SVM  
6. KNN  
7. Naive Bayes  

This concludes the classification model selection.  
Next, we proceed to **Regression Modeling** for predicting student CGPA.

## Linear Regression

Linear Regression is the simplest regression model, used as a baseline to compare the performance of more advanced models later.

It attempts to learn a straight-line relationship between the input features and the target variable (CGPA).  
Although it may not capture complex patterns, it provides a useful reference point for evaluating whether more sophisticated models like Random Forest or Gradient Boosting actually add value.

The following cell trains a Linear Regression model and evaluates it using:

- MSE (Mean Squared Error)
- RMSE (Root Mean Squared Error)
- R¬≤ score (coefficient of determination)  


In [29]:
# === Regression Model 1: Linear Regression === #

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Initialize model
reg_lr = LinearRegression()

# Train the model
reg_lr.fit(X_train_reg, y_train_reg)

# Predictions
y_pred_lr_reg = reg_lr.predict(X_test_reg)

# Evaluation
mse_lr = mean_squared_error(y_test_reg, y_pred_lr_reg)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test_reg, y_pred_lr_reg)

print("Linear Regression Performance:")
print("------------------------------")
print("MSE  :", mse_lr)
print("RMSE :", rmse_lr)
print("R¬≤   :", r2_lr)

Linear Regression Performance:
------------------------------
MSE  : 0.2008832515060459
RMSE : 0.44820001283583866
R¬≤   : 0.7945687252004903


## Random Forest Regression

Random Forest Regressor is an ensemble model that builds many decision trees and averages their predictions.  
It captures nonlinear relationships, handles noise well, and usually outperforms Linear Regression when the dataset is complex.

We evaluate this model using:
- MSE (Mean Squared Error)
- RMSE (Root Mean Squared Error)
- R¬≤ Score (explained variance)

This model typically provides higher accuracy and lower error compared to Linear Regression.


In [30]:
# === Regression Model 2: Random Forest Regressor === #

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Initialize model
reg_rf = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

# Train the model
reg_rf.fit(X_train_reg, y_train_reg)

# Predictions
y_pred_rf = reg_rf.predict(X_test_reg)

# Evaluation
mse_rf = mean_squared_error(y_test_reg, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test_reg, y_pred_rf)

print("Random Forest Regression Performance:")
print("------------------------------------")
print("MSE  :", mse_rf)
print("RMSE :", rmse_rf)
print("R¬≤   :", r2_rf)

Random Forest Regression Performance:
------------------------------------
MSE  : 0.1237849593495935
RMSE : 0.3518308675338102
R¬≤   : 0.8734125328540535


### üåü Gradient Boosting Regressor

Gradient Boosting is an ensemble technique that builds models sequentially, with each new model attempting to correct the errors of the previous one. It generally performs very well on structured/tabular datasets.

Below is the model training and evaluation using MSE, RMSE, and R¬≤ metrics.


In [31]:
# === Gradient Boosting Regressor === #

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

gbr = GradientBoostingRegressor()

# Train
gbr.fit(X_train_reg, y_train_reg)

# Predict
y_pred_gb = gbr.predict(X_test_reg)

# Evaluation
mse_gb = mean_squared_error(y_test_reg, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)
r2_gb = r2_score(y_test_reg, y_pred_gb)

print("Gradient Boosting Regression Performance:")
print("----------------------------------------")
print("MSE  :", mse_gb)
print("RMSE :", rmse_gb)
print("R¬≤   :", r2_gb)

Gradient Boosting Regression Performance:
----------------------------------------
MSE  : 0.13158081460849275
RMSE : 0.36274069885869265
R¬≤   : 0.8654401784044844


### AdaBoost Regressor

AdaBoost trains a sequence of weak learners, where each new model focuses more on the samples the previous models predicted poorly. It is simple, efficient, and often performs well on structured data.

Below is the training and evaluation of the AdaBoost Regressor using MSE, RMSE, and R¬≤ metrics.


In [32]:
# === AdaBoost Regressor === #

from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

ada_reg = AdaBoostRegressor()

# Train
ada_reg.fit(X_train_reg, y_train_reg)

# Predict
y_pred_ada = ada_reg.predict(X_test_reg)

# Evaluation
mse_ada = mean_squared_error(y_test_reg, y_pred_ada)
rmse_ada = np.sqrt(mse_ada)
r2_ada = r2_score(y_test_reg, y_pred_ada)

print("AdaBoost Regression Performance:")
print("-------------------------------")
print("MSE  :", mse_ada)
print("RMSE :", rmse_ada)
print("R¬≤   :", r2_ada)

AdaBoost Regression Performance:
-------------------------------
MSE  : 0.15947738110819845
RMSE : 0.39934619205421057
R¬≤   : 0.8369120299620476


### Support Vector Regressor (SVR)

SVR attempts to fit the best possible line within a margin, making it robust to outliers. It often requires scaling for optimal performance but still works reasonably without it.

Below is the SVR model evaluation.

In [33]:
# === SVR Regressor === #

from sklearn.svm import SVR

svr = SVR()

svr.fit(X_train_reg, y_train_reg)

y_pred_svr = svr.predict(X_test_reg)

mse_svr = mean_squared_error(y_test_reg, y_pred_svr)
rmse_svr = np.sqrt(mse_svr)
r2_svr = r2_score(y_test_reg, y_pred_svr)

print("SVR Regression Performance:")
print("---------------------------")
print("MSE  :", mse_svr)
print("RMSE :", rmse_svr)
print("R¬≤   :", r2_svr)

SVR Regression Performance:
---------------------------
MSE  : 0.25728310461180726
RMSE : 0.507230819856017
R¬≤   : 0.7368919719860842


### K-Nearest Neighbors Regressor

KNN predicts numerical values by averaging the outputs of the k-nearest data points. Performance heavily depends on feature scaling and neighborhood structure.

Below is the performance evaluation for KNN Regressor.


In [34]:
# === KNN Regressor === #

from sklearn.neighbors import KNeighborsRegressor

knn_reg = KNeighborsRegressor()

knn_reg.fit(X_train_reg, y_train_reg)

y_pred_knn = knn_reg.predict(X_test_reg)

mse_knn = mean_squared_error(y_test_reg, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)
r2_knn = r2_score(y_test_reg, y_pred_knn)

print("KNN Regression Performance:")
print("---------------------------")
print("MSE  :", mse_knn)
print("RMSE :", rmse_knn)
print("R¬≤   :", r2_knn)

KNN Regression Performance:
---------------------------
MSE  : 0.25012425218591805
RMSE : 0.5001242367511477
R¬≤   : 0.7442129017745376


### Decision Tree Regressor

Decision Trees split data into hierarchical regions and fit simple predictions inside them. They are easy to interpret but prone to overfitting.

Below is the performance for the Decision Tree Regressor.

In [35]:
# === Decision Tree Regressor === #

from sklearn.tree import DecisionTreeRegressor

dt_reg = DecisionTreeRegressor()

dt_reg.fit(X_train_reg, y_train_reg)

y_pred_dt = dt_reg.predict(X_test_reg)

mse_dt = mean_squared_error(y_test_reg, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)
r2_dt = r2_score(y_test_reg, y_pred_dt)

print("Decision Tree Regression Performance:")
print("------------------------------------")
print("MSE  :", mse_dt)
print("RMSE :", rmse_dt)
print("R¬≤   :", r2_dt)

Decision Tree Regression Performance:
------------------------------------
MSE  : 0.23485197116122106
RMSE : 0.4846152816010047
R¬≤   : 0.759830949254745


## Regression Model Comparison

After training all regression models, we evaluate them using the following metrics:

- MSE (Mean Squared Error) ‚Äì lower is better  
- RMSE (Root Mean Squared Error) ‚Äì lower is better  
- R¬≤ Score ‚Äì higher is better  

The table below compares all regression models based on these evaluation metrics and ranks them according to their R¬≤ score.

In [36]:
# === Regression Model Comparison Table === #

import pandas as pd

# Collect all regression results
reg_results = {
    "Model": [
        "Linear Regression",
        "Random Forest Regressor",
        "Gradient Boosting Regressor",
        "AdaBoost Regressor",
        "SVR",
        "KNN Regressor",
        "Decision Tree Regressor"
    ],
    "MSE": [
        mse_lr,
        mse_rf,
        mse_gb,
        mse_ada,
        mse_svr,
        mse_knn,
        mse_dt
    ],
    "RMSE": [
        rmse_lr,
        rmse_rf,
        rmse_gb,
        rmse_ada,
        rmse_svr,
        rmse_knn,
        rmse_dt
    ],
    "R¬≤": [
        r2_lr,
        r2_rf,
        r2_gb,
        r2_ada,
        r2_svr,
        r2_knn,
        r2_dt
    ]
}

# Create DataFrame
reg_comparison_df = pd.DataFrame(reg_results)

# Ranking by R¬≤ score (higher = better)
reg_comparison_df["Rank"] = reg_comparison_df["R¬≤"].rank(ascending=False, method="dense")

# Sort by best performance
reg_comparison_df = reg_comparison_df.sort_values(by="R¬≤", ascending=False)

print("=== Regression Model Comparison ===")
print(reg_comparison_df.to_string(index=False))

=== Regression Model Comparison ===
                      Model      MSE     RMSE       R¬≤  Rank
    Random Forest Regressor 0.123785 0.351831 0.873413   1.0
Gradient Boosting Regressor 0.131581 0.362741 0.865440   2.0
         AdaBoost Regressor 0.159477 0.399346 0.836912   3.0
          Linear Regression 0.200883 0.448200 0.794569   4.0
    Decision Tree Regressor 0.234852 0.484615 0.759831   5.0
              KNN Regressor 0.250124 0.500124 0.744213   6.0
                        SVR 0.257283 0.507231 0.736892   7.0


## Phase 2: Advanced Feature Engineering (Non-Time-Series)

Since the current dataset already contains aggregated VLE and assessment features, 
we now generate higher-level behavioural and risk-indicator features to improve model accuracy.

### New Features Added:
- `clicks_per_day`
- `clicks_per_week`
- `engagement_intensity` (z-normalized clicks)
- `credit_load_category`
- `assessment_density`
- `score_per_credit`
- `activity_efficiency`
- `low_activity_flag`
- `late_registration_flag`
- `high_click_but_low_score_flag`

These features help capture:
- consistency of engagement  
- academic load patterns  
- efficiency of learning behaviour  
- early warning signs before dropout  

In [37]:
# ==================== ADVANCED FEATURE ENGINEERING (Layer 2) ====================

import numpy as np
import pandas as pd

df = df_model.copy()

# 1) Clicks per day (engagement speed)
df["clicks_per_day"] = df["total_clicks"] / df["registration_duration"].replace(0, np.nan)

# 2) Clicks per week
df["clicks_per_week"] = df["clicks_per_day"] * 7

# 3) Engagement intensity (Z-score of total_clicks)
df["engagement_intensity"] = (df["total_clicks"] - df["total_clicks"].mean()) / df["total_clicks"].std()

# 4) Credit load category (light < 60, medium < 90, heavy >= 90)
df["credit_load_category"] = pd.cut(
    df["studied_credits"],
    bins=[0, 60, 90, 120],
    labels=["light", "medium", "heavy"],
    include_lowest=True
)

# 5) Assessment density
df["assessment_density"] = df["num_assessments"] / df["studied_credits"].replace(0, np.nan)

# 6) Score per credit
df["score_per_credit"] = df["total_score"] / df["studied_credits"].replace(0, np.nan)

# 7) Activity efficiency (score gained per click effort)
df["activity_efficiency"] = df["score_per_assess"] / df["clicks_per_credit"].replace(0, np.nan)

# ------------------------- Risk Indicator Flags -----------------------------

# 8) Low activity flag
df["low_activity_flag"] = (df["total_clicks"] < df["total_clicks"].median()).astype(int)

# 9) Late registration flag
df["late_registration_flag"] = (df["registration_lead_days"] < df["registration_lead_days"].median()).astype(int)

# 10) High-click but low-score (low efficiency learners)
df["high_click_but_low_score_flag"] = (
    (df["total_clicks"] > df["total_clicks"].median()) &
    (df["total_score"] < df["total_score"].median())
).astype(int)

print("Advanced Feature Engineering (Layer 2) Completed.")
print("Final shape:", df.shape)
df.head()


Advanced Feature Engineering (Layer 2) Completed.
Final shape: (32593, 32)


Unnamed: 0,code_module,code_presentation,id_student,gender_m,region,edu_level,imd_num,age_num,num_of_prev_attempts,studied_credits,...,clicks_per_day,clicks_per_week,engagement_intensity,credit_load_category,assessment_density,score_per_credit,activity_efficiency,low_activity_flag,late_registration_flag,high_click_but_low_score_flag
0,0,1,11391,1,0,3,95.0,60.0,0,240,...,-934.0,-6538.0,-0.270973,,0.020833,1.708333,21.070664,0,0,0
1,0,1,28400,0,6,3,25.0,45.0,0,60,...,-1435.0,-10045.0,-0.021892,light,0.083333,5.533333,2.776307,0,1,1
2,0,1,30268,0,5,2,35.0,45.0,0,60,...,2.701923,18.913462,-0.595624,light,0.0,0.0,0.0,1,0,0
3,0,1,31604,0,7,2,55.0,45.0,0,60,...,-2158.0,-15106.0,0.337561,light,0.083333,6.333333,2.113068,0,1,1
4,0,1,32885,0,11,1,55.0,17.5,0,60,...,-1034.0,-7238.0,-0.221257,light,0.083333,4.533333,3.156673,0,0,1


## Phase 3: Data Cleaning, Missing Values, and Encoding

Now that all advanced features have been generated, we prepare the dataset for machine learning:

### Steps in this cell:
- Handle missing values (`NaN`)
- Separate categorical and numerical features
- One-hot encode categorical variables
- Ensure target columns (`target_pass`, `target_cgpa`) are preserved
- Create clean final feature matrix for both tasks:
  - `X_class`, `y_class` for classification
  - `X_reg`, `y_reg` for regression

This preprocessing step ensures that our dataset is completely ML-ready for high-accuracy models.


In [38]:
# ======================== PHASE 3: CLEANING & ENCODING ========================

import pandas as pd
import numpy as np

df_clean = df.copy()

# ------------------ 1) Handle Missing Values ------------------

# For numeric columns ‚Üí fill with median
num_cols = df_clean.select_dtypes(include=[np.number]).columns
df_clean[num_cols] = df_clean[num_cols].fillna(df_clean[num_cols].median())

# For categorical columns ‚Üí fill with mode
cat_cols = df_clean.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])

print("Missing values handled.")

# ------------------ 2) One-Hot Encode Categorical Columns ------------------

df_encoded = pd.get_dummies(df_clean, columns=cat_cols, drop_first=True)

print("Categorical encoding complete.")
print("Shape after encoding:", df_encoded.shape)

# ------------------ 3) Define Feature Sets for Classification & Regression ------------------

# Targets
y_class = df_encoded["target_pass"]        # Binary classification
y_reg = df_encoded["target_cgpa"]          # Regression

# Drop target columns + student id
X = df_encoded.drop(["target_pass", "target_cgpa", "id_student"], axis=1)

print("Final feature set shape:", X.shape)

# Final ML-ready datasets
X_class = X.copy()
X_reg = X.copy()

print("X_class shape:", X_class.shape)
print("X_reg shape:", X_reg.shape)
X_class.head()


Missing values handled.
Categorical encoding complete.
Shape after encoding: (32593, 33)
Final feature set shape: (32593, 30)
X_class shape: (32593, 30)
X_reg shape: (32593, 30)


Unnamed: 0,code_module,code_presentation,gender_m,region,edu_level,imd_num,age_num,num_of_prev_attempts,studied_credits,disability_flag,...,clicks_per_week,engagement_intensity,assessment_density,score_per_credit,activity_efficiency,low_activity_flag,late_registration_flag,high_click_but_low_score_flag,credit_load_category_medium,credit_load_category_heavy
0,0,1,1,0,3,95.0,60.0,0,240,0,...,-6538.0,-0.270973,0.020833,1.708333,21.070664,0,0,0,False,False
1,0,1,0,6,3,25.0,45.0,0,60,0,...,-10045.0,-0.021892,0.083333,5.533333,2.776307,0,1,1,False,False
2,0,1,0,5,2,35.0,45.0,0,60,1,...,18.913462,-0.595624,0.0,0.0,0.0,1,0,0,False,False
3,0,1,0,7,2,55.0,45.0,0,60,0,...,-15106.0,0.337561,0.083333,6.333333,2.113068,0,1,1,False,False
4,0,1,0,11,1,55.0,17.5,0,60,0,...,-7238.0,-0.221257,0.083333,4.533333,3.156673,0,0,1,False,False
