In [None]:
pip install tensorflow pandas numpy matplotlib scikit-learn imbalanced-learn

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow.keras import Input, Model
from tensorflow.keras.layers import (
    LSTM, Dense, Dropout, MultiHeadAttention, LayerNormalization, Flatten
)
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix


In [None]:
# Read the four small OULAD tables in one pass; the tuple order below fixes
# which file lands in which variable.
assessments, student_assessment, student_info, vle = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/OULAD-anonymisedData/assessments.csv',
        '/content/drive/MyDrive/OULAD-anonymisedData/studentAssessment.csv',
        '/content/drive/MyDrive/OULAD-anonymisedData/studentInfo.csv',
        '/content/drive/MyDrive/OULAD-anonymisedData/vle.csv',
    )
)



In [None]:
# Running total of malformed CSV rows that were skipped
bad_line_count = 0


def bad_line_handler(line, line_number=None):
    """Count and report one malformed CSV row, then tell pandas to skip it.

    pandas calls an ``on_bad_lines`` callable with a single argument (the
    offending row, already split into a list of strings). ``line_number`` is
    therefore optional: requiring it would raise ``TypeError`` on the first
    bad row instead of skipping it.

    Parameters
    ----------
    line : list[str]
        Fields of the malformed row as handed over by the parser.
    line_number : int, optional
        Row number to include in the message when the caller knows it.

    Returns
    -------
    None
        Returning ``None`` makes pandas drop the row.
    """
    global bad_line_count
    bad_line_count += 1
    location = f" {line_number}" if line_number is not None else ""
    print(f"Skipping line{location}: {line}")
    return None

# Load the student/VLE interaction log with the Python parser engine;
# rows the parser rejects are passed to bad_line_handler.
student_vle_path = '/content/drive/MyDrive/OULAD-anonymisedData/studentVle.csv'
student_vle = pd.read_csv(student_vle_path, on_bad_lines=bad_line_handler, engine='python')

# Report how many rows the handler counted
print(f'Number of bad lines skipped: {bad_line_count}')

Number of bad lines skipped: 0


Merge Datasets

In [None]:
# Print the column index of every loaded table, one line per table
loaded_tables = {
    'student_assessment': student_assessment,
    'assessments': assessments,
    'student_info': student_info,
    'vle': vle,
    'student_vle': student_vle,
}
for table_name, table in loaded_tables.items():
    print(f"Columns in {table_name}: {table.columns}")

Columns in student_assessment: Index(['id_assessment', 'id_student', 'date_submitted', 'is_banked', 'score'], dtype='object')
Columns in assessments: Index(['code_module', 'code_presentation', 'id_assessment', 'assessment_type',
       'date', 'weight'],
      dtype='object')
Columns in student_info: Index(['code_module', 'code_presentation', 'id_student', 'gender', 'region',
       'highest_education', 'imd_band', 'age_band', 'num_of_prev_attempts',
       'studied_credits', 'disability', 'final_result'],
      dtype='object')
Columns in vle: Index(['id_site', 'code_module', 'code_presentation', 'activity_type',
       'week_from', 'week_to'],
      dtype='object')
Columns in student_vle: Index(['code_module', 'code_presentation', 'id_student', 'id_site', 'date',
       'sum_click'],
      dtype='object')


In [None]:
# Merge student info with assessment scores, then attach assessment metadata.
# After the second merge, code_module/code_presentation exist twice:
#   *_x comes from student_info, *_y comes from assessments.
student_data = pd.merge(student_assessment, student_info, on='id_student', how='left')
student_data = pd.merge(student_data, assessments, on='id_assessment', how='left')

# student_info carries code_module/code_presentation next to id_student, so a
# student enrolled in more than one presentation matches several rows in the
# id_student-only join above. Each assessment would then also be paired with
# the demographics and final_result of an unrelated enrolment. Keep only the
# pairing whose enrolment matches the assessment's own module/presentation.
same_enrolment = (
    (student_data['code_module_x'] == student_data['code_module_y'])
    & (student_data['code_presentation_x'] == student_data['code_presentation_y'])
)
# Rows with no match on either side are kept, as the left joins intended.
unmatched = student_data['code_module_x'].isna() | student_data['code_module_y'].isna()
student_data = student_data[same_enrolment | unmatched].reset_index(drop=True)

In [None]:
# Express each submission day as a fraction of an assumed 270-day course length
student_data['normalized_date'] = student_data['date_submitted'].div(270)

In [None]:
# Bucket the normalized submission date into five equal course segments.
# pd.cut bins are right-closed, so values outside (0, 1] get no bucket and
# end up with all five indicator columns unset.
progress_edges = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
progress_labels = ['20%', '40%', '60%', '80%', '100%']
progress_bucket = pd.cut(
    student_data['normalized_date'],
    bins=progress_edges,
    labels=progress_labels,
)

# Replace the categorical bucket with one indicator column per segment
student_data = pd.get_dummies(
    student_data.assign(course_progress=progress_bucket),
    columns=['course_progress'],
)

In [None]:
# Order rows by student, then chronologically by submission day, so later
# per-student sequences come out in time order
sort_keys = ['id_student', 'date_submitted']
student_data = student_data.sort_values(sort_keys)

In [None]:
# Aggregate clickstream data into weekly interactions
student_vle['week'] = student_vle['date'] // 7

# Total clicks per student per week
student_vle_weekly = student_vle.groupby(['id_student', 'week'])['sum_click'].sum().reset_index()

# One row per student, one column per week; weeks with no activity count as 0 clicks
student_vle_pivot = student_vle_weekly.pivot(index='id_student', columns='week', values='sum_click').fillna(0)

# Scale every week column to [0, 1]
scaler = MinMaxScaler()
student_vle_pivot = pd.DataFrame(
    scaler.fit_transform(student_vle_pivot),
    columns=student_vle_pivot.columns,
    index=student_vle_pivot.index,
)

# Attach the weekly features to the assessment rows
student_data = student_data.merge(student_vle_pivot, on='id_student', how='left')

# Students without any clickstream rows get 0 for the week columns ONLY.
# A blanket student_data.fillna(0) would also zero missing scores, dates and
# demographics, silently defeating the later mean-imputation of `score`.
week_columns = list(student_vle_pivot.columns)
student_data[week_columns] = student_data[week_columns].fillna(0)

In [None]:
# Discard the module/presentation copies that came from the assessments table
redundant_columns = ['code_module_y', 'code_presentation_y']
student_data = student_data.drop(redundant_columns, axis=1)

In [None]:
# Strip the merge suffix from the surviving module/presentation columns
suffix_free_names = {
    'code_module_x': 'code_module',
    'code_presentation_x': 'code_presentation',
}
student_data = student_data.rename(columns=suffix_free_names)

In [None]:
# Show the merged frame's column index
merged_columns = student_data.columns
print(f"Columns in student_data: {merged_columns}")

Columns in student_data: Index([       'id_assessment',           'id_student',       'date_submitted',
                  'is_banked',                'score',          'code_module',
          'code_presentation',               'gender',               'region',
          'highest_education',             'imd_band',             'age_band',
       'num_of_prev_attempts',      'studied_credits',           'disability',
               'final_result',      'assessment_type',                 'date',
                     'weight',      'normalized_date',  'course_progress_20%',
        'course_progress_40%',  'course_progress_60%',  'course_progress_80%',
       'course_progress_100%',                     -4,                     -3,
                           -2,                     -1,                      0,
                            1,                      2,                      3,
                            4,                      5,                      6,
                           

Convert Student Performance into Categories



In [None]:
# Keep only rows with a known, strictly positive submission day
submitted_on = student_data['date_submitted']
has_valid_date = submitted_on.notna() & (submitted_on > 0)
student_data = student_data[has_valid_date]

In [None]:
# Row count after filtering
print(f"Length in student_data: {student_data.shape[0]}")

Length in student_data: 203544


In [None]:
# Total clicks per student enrolment (student + module + presentation)
enrolment_keys = ['id_student', 'code_module', 'code_presentation']
click_totals = student_vle.groupby(enrolment_keys)['sum_click'].sum()
student_vle_agg = click_totals.reset_index()

# Scale the totals to [0, 1] with a fresh scaler
scaler = MinMaxScaler()
student_vle_agg['normalized_clicks'] = scaler.fit_transform(student_vle_agg[['sum_click']])

# Attach the totals to the assessment rows; enrolments with no clickstream
# rows get normalized_clicks = 0
student_data = student_data.merge(student_vle_agg, how='left', on=enrolment_keys)
student_data['normalized_clicks'] = student_data['normalized_clicks'].fillna(0)

In [None]:
# Convert 'final_result' into numerical categories.
# Series.replace on an object column relies on deprecated silent downcasting
# (pandas emits a FutureWarning), so map explicitly and convert to a numeric
# dtype. Values that are not keys of the mapping (e.g. codes that are already
# numeric when the cell is re-run) pass through unchanged; any leftover
# non-numeric label makes pd.to_numeric raise ValueError.
result_codes = {'Distinction': 3, 'Pass': 2, 'Fail': 1, 'Withdrawn': 0}
student_data['final_result'] = pd.to_numeric(
    student_data['final_result'].map(lambda label: result_codes.get(label, label))
)

# Encode categorical features as integer codes, keeping one fitted encoder
# per column so the codes can be inverted later
categorical_cols = ['gender', 'region', 'highest_education', 'imd_band', 'age_band', 'disability']
label_encoders = {col: LabelEncoder() for col in categorical_cols}

for col in categorical_cols:
    # Cast to str first so missing values become one ordinary category
    student_data[col] = student_data[col].astype(str)
    student_data[col] = label_encoders[col].fit_transform(student_data[col])


  student_data['final_result'] = student_data['final_result'].replace({


Normalize Scores

In [None]:
# Impute missing scores with the column mean
mean_score = student_data['score'].mean()
student_data['score'] = student_data['score'].fillna(mean_score)

# Rescale scores; this refits the shared `scaler` object on the score column
scaled_scores = scaler.fit_transform(student_data[['score']])
student_data['score'] = scaled_scores


Select final features for model training

In [None]:
# Tabular feature set: per-assessment and demographic columns followed by the
# weekly click columns (the course_progress indicators are not included)
static_features = [
    'score', 'normalized_date', 'normalized_clicks',
    'gender', 'region', 'highest_education', 'imd_band', 'age_band', 'disability',
]
features = static_features + list(student_vle_pivot.columns)

y = student_data['final_result']
X = student_data[features]

# Convert categorical course_progress into one-hot encoding
# X = pd.get_dummies(X, columns=['course_progress'])

Create Time-Series Sequences
**Extract Time-Series Features for Each Student**

In [None]:
# Number of most recent assessments that make up one model input
sequence_length = 5

# NOTE(review): groups are formed per id_student only, so a student with
# several module enrolments yields a single merged sequence — confirm intended.
grouped = student_data.groupby('id_student')

X, y = [], []
for _, group in grouped:
    ordered = group.sort_values('date_submitted')

    # Students with fewer than `sequence_length` assessments are skipped
    if len(ordered) < sequence_length:
        continue

    # Features: the last `sequence_length` scores; label: final_result of the latest row
    X.append(ordered['score'].values[-sequence_length:])
    y.append(ordered['final_result'].iloc[-1])

# Shape (samples, timesteps, 1): one feature per timestep for the LSTM
X = np.array(X).reshape(-1, sequence_length, 1)
y = np.array(y)

print(f"Dataset shape: X={X.shape}, y={y.shape}")


Handle Imbalanced Data with SMOTE


In [None]:
# SMOTE works on 2-D input, so collapse (timesteps, 1) into one feature axis,
# oversample, then restore the 3-D sequence shape
smote = SMOTE(random_state=42)
sample_count = X.shape[0]
X_flat = X.reshape(sample_count, -1)

X_resampled, y_resampled = smote.fit_resample(X_flat, y)
X_resampled = X_resampled.reshape(-1, sequence_length, 1)

print(f"After SMOTE: X={X_resampled.shape}, y={y_resampled.shape}")

 Train/Test Split

In [None]:
# Split the REAL sequences first. Splitting an already-oversampled set puts
# synthetic points (interpolated from samples that end up in training) into
# the test set, and vice versa, which inflates every reported metric.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample minority classes in the training portion only; the test set
# keeps the true class distribution.
X_train, y_train = SMOTE(random_state=42).fit_resample(
    X_train.reshape(len(X_train), -1), y_train
)
X_train = X_train.reshape(-1, sequence_length, 1)

# Resampling appends the synthetic rows after the original ones; shuffle so a
# tail slice (e.g. Keras `validation_split`) is not made of synthetic rows only.
shuffle_order = np.random.default_rng(42).permutation(len(X_train))
X_train, y_train = X_train[shuffle_order], y_train[shuffle_order]


In [None]:
# NOTE(review): this whole cell is commented out, so running it does nothing.
# The recorded "TypeError: argument of type 'int' is not iterable" likely comes
# from `'course_progress_' in col` below: the weekly click columns have integer
# names, and `in` cannot test a substring against an int. Wrapping the name as
# str(col) would avoid that if this experiment is revived.
# # Define function to train a model at each course progress stage
# def train_early_risk_model(stage):
#     print(f"\n Training Model for {stage} Course Progress...\n")

#     # Select only students at this stage
#     X_stage = student_data[student_data[f'course_progress_{stage}'] == 1].drop(columns=[col for col in student_data.columns if 'course_progress_' in col])
#     y_stage = X_stage['final_result']
#     X_stage = X_stage.drop(columns=['final_result'])

#     # Train/Test Split
#     X_train, X_test, y_train, y_test = train_test_split(X_stage, y_stage, test_size=0.2, random_state=42, stratify=y_stage)

#     # Build LSTM Model
#     input_layer = Input(shape=(X_train.shape[1],))  # No sequence now, since we flatten for each stage

#     x = Dense(64, activation='relu')(input_layer)
#     x = Dropout(0.2)(x)
#     x = Dense(32, activation='relu')(x)
#     output_layer = Dense(4, activation='softmax')(x)  # Classification

#     model = Model(inputs=input_layer, outputs=output_layer)
#     model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

#     # Train Model
#     model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.1)

#     # Evaluate Model
#     test_loss, test_acc = model.evaluate(X_test, y_test)
#     print(f" {stage} Progress Model Accuracy: {test_acc:.4f}\n")

#     return model

# # Train separate models for each course stage
# models_by_stage = {}
# for stage in ['20%', '40%', '60%', '80%', '100%']:
#     models_by_stage[stage] = train_early_risk_model(stage)


 Training Model for 20% Course Progress...



TypeError: argument of type 'int' is not iterable

In [None]:
# Text preview of the leading rows of every per-student group
group_preview = grouped.head()
print(group_preview)

        id_assessment  id_student  date_submitted  is_banked  score  \
0                1758        6516              17          0   0.60   
1                1759        6516              51          0   0.48   
2                1760        6516             116          0   0.63   
3                1761        6516             164          0   0.61   
4                1762        6516             210          0   0.77   
...               ...         ...             ...        ...    ...   
203539          15020     2698588              18          0   1.00   
203540          15021     2698588              53          0   0.85   
203541          15022     2698588             109          0   0.87   
203542          15023     2698588             152          0   0.95   
203543          15024     2698588             202          0   0.95   

       code_module code_presentation  gender  region  highest_education  ...  \
0              AAA             2014J       1       9               

Build the Deep Learning Model

In [None]:
# Input: `sequence_length` timesteps with a single feature each
input_layer = Input(shape=(sequence_length, 1))

# 1) Three stacked LSTMs that all return the full sequence (needed by the
#    attention layer); dropout follows the first two only
x = input_layer
for add_dropout in (True, True, False):
    x = LSTM(64, activation='relu', return_sequences=True)(x)
    if add_dropout:
        x = Dropout(0.2)(x)

# 2) Multi-head self-attention over the LSTM outputs with a residual
#    connection, layer-normalized and flattened for the dense head
self_attention = MultiHeadAttention(num_heads=4, key_dim=32)
attn_output = self_attention(x, x, x)
x = LayerNormalization()(attn_output + x)
x = Flatten()(x)

# 3) Dense head ending in a 4-way softmax
for units in (64, 32):
    x = Dense(units, activation='relu')(x)
output_layer = Dense(4, activation='softmax')(x)

# Assemble and compile; integer class labels -> sparse categorical cross-entropy
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()


Train the Model

In [None]:
# Train the network. Hold out 10% of the training data for validation:
# a split of 1e-5 leaves only a handful of samples, so val_loss/val_accuracy
# jump around in coarse steps and say nothing about generalization.
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)


Epoch 1/100
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 19ms/step - accuracy: 0.5917 - loss: 0.9538 - val_accuracy: 0.2500 - val_loss: 1.3482
Epoch 2/100
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - accuracy: 0.5944 - loss: 0.9422 - val_accuracy: 0.2500 - val_loss: 0.9335
Epoch 3/100
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - accuracy: 0.5977 - loss: 0.9402 - val_accuracy: 0.5000 - val_loss: 0.8447
Epoch 4/100
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 19ms/step - accuracy: 0.5989 - loss: 0.9317 - val_accuracy: 0.5000 - val_loss: 0.9509
Epoch 5/100
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 19ms/step - accuracy: 0.5997 - loss: 0.9303 - val_accuracy: 0.7500 - val_loss: 0.6738
Epoch 6/100
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 19ms/step - accuracy: 0.6000 - loss: 0.9298 - val_accuracy: 1.0000 - val_loss: 0.6723
Epoch 7/10

Evaluate the Model

In [None]:
# Class ids in encoding order (0..3) and their display names
class_ids = [0, 1, 2, 3]
class_names = ['Withdrawn', 'Fail', 'Pass', 'Distinction']

# Evaluate on test data
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

# Predict on test data: most probable class per sample
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Classification report. `labels` is passed explicitly so the four
# target_names always line up with the class ids, even when a class is
# missing from both y_test and y_pred.
print(
    "\nClassification Report:\n",
    classification_report(
        y_test, y_pred, labels=class_ids, target_names=class_names, zero_division=0
    ),
)

# Confusion matrix, forced to 4x4 so the tick labels match its rows/columns
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


Early Risk Prediction

In [None]:
# Report on the first fifth of the test samples.
# NOTE(review): this slices test STUDENTS, not course time — each sequence is
# still the full-length input, so this is not a prediction from early-course
# data. Truncate the sequences themselves if that is the goal.
X_early = X_test[:len(X_test)//5]
y_early_true = y_test[:len(y_test)//5]
y_early_pred_probs = model.predict(X_early)
y_early_pred = np.argmax(y_early_pred_probs, axis=1)

# Pass `labels` explicitly: a small slice can easily lack a class, and then
# the four target_names would no longer match the labels found in the data.
print(
    "\nEarly Prediction Report:\n",
    classification_report(
        y_early_true,
        y_early_pred,
        labels=[0, 1, 2, 3],
        target_names=['Withdrawn', 'Fail', 'Pass', 'Distinction'],
        zero_division=0,
    ),
)


In [None]:
# Persist the trained network to an HDF5 file in the working directory
model_path = "student_performance_prediction.h5"
model.save(model_path)
print(f"Model saved successfully as '{model_path}'")


In [None]:
# Human-readable names for the encoded classes
class_labels = {0: "Withdrawn", 1: "Fail", 2: "Pass", 3: "Distinction"}

# Draw one test sample at random
sample_index = np.random.randint(0, len(X_test))

# Add a leading batch axis so the single sequence matches the model input shape
sample_sequence = X_test[sample_index][np.newaxis, ...]

# Most probable class for that sample
predicted_probs = model.predict(sample_sequence)
predicted_class = np.argmax(predicted_probs, axis=1)[0]

# Compare the true label with the prediction
actual_class = y_test[sample_index]
print(f"Actual Performance: {class_labels[actual_class]}")
print(f"Predicted Performance: {class_labels[predicted_class]}")
