In [1]:
%pip install transformers imbalanced-learn nltk owlready2 pandas scikit-learn torch


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv("personalized_learning_dataset.csv")

# Identify numerical and categorical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Fill missing numerical values with mean
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

# Fill missing categorical values with mode
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Encode categorical columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le




In [22]:
# Ensure all Course_Name entries are strings and drop NaNs
cleaned_texts = df['Course_Name'].fillna("").astype(str)
text_embeddings = get_bert_embeddings(cleaned_texts)


Embedding Batches: 100%|██████████| 625/625 [00:36<00:00, 17.20it/s]


In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel
from tqdm import tqdm

# Load DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
bert_model.eval()

def get_bert_embeddings(texts):
    embeddings_list = []
    batch_size = 16  # small batches to avoid memory errors
    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding Batches"):
        batch_texts = texts[i:i+batch_size].tolist()
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=64)
        with torch.no_grad():
            outputs = bert_model(**inputs)
        # Use the first token ([CLS]-like) representation
        embeddings = outputs.last_hidden_state[:, 0, :]
        embeddings_list.append(embeddings.cpu().numpy())
    
    return np.vstack(embeddings_list)

# Example: Assume df is already loaded with your dataset
# and 'Course_Name' column exists
cleaned_texts = df['Course_Name'].fillna("").astype(str)

text_embeddings = get_bert_embeddings(cleaned_texts)
# Create DataFrame for BERT features
text_features = pd.DataFrame(
    text_embeddings, columns=[f'bert_{i}' for i in range(text_embeddings.shape[1])]
)

# Add BERT features to your main dataframe
df = pd.concat([df.reset_index(drop=True), text_features.reset_index(drop=True)], axis=1)



Embedding Batches: 100%|██████████| 625/625 [00:36<00:00, 17.05it/s]


In [24]:
df.head()


Unnamed: 0,Student_ID,Age,Gender,Education_Level,Course_Name,Time_Spent_on_Videos,Quiz_Attempts,Quiz_Scores,Forum_Participation,Assignment_Completion_Rate,...,bert_758,bert_759,bert_760,bert_761,bert_762,bert_763,bert_764,bert_765,bert_766,bert_767
0,0,15,0,0,2,171,4,67,2,89,...,0.160273,0.010927,0.022738,-0.004101,0.231194,-0.110267,-0.071948,-0.058186,0.146181,0.268397
1,1,49,1,2,3,156,4,64,0,94,...,0.177312,0.077665,0.065067,-0.131997,0.138796,-0.189738,-0.02281,0.008576,0.151267,0.284651
2,2,20,0,2,3,217,2,55,2,67,...,0.177312,0.077665,0.065067,-0.131997,0.138796,-0.189738,-0.02281,0.008576,0.151267,0.284651
3,3,37,0,2,1,489,1,65,43,60,...,0.25854,0.056169,0.042675,-0.08527,0.178497,-0.155818,0.014036,-0.065531,0.168249,0.386918
4,4,34,0,1,3,496,3,59,34,88,...,0.177312,0.077665,0.065067,-0.131997,0.138796,-0.189738,-0.02281,0.008576,0.151267,0.284651


In [None]:
from imblearn.over_sampling import SMOTE

# Define X and y
X = df.drop(columns=['Dropout_Likelihood'])  # Exclude target column
y = df['Dropout_Likelihood']  # Target column

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


In [27]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Define and train the MLP model
mlp = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=300, random_state=42)
mlp.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = mlp.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.54      0.92      0.68      1606
           1       0.73      0.21      0.33      1612

    accuracy                           0.57      3218
   macro avg       0.63      0.57      0.51      3218
weighted avg       0.63      0.57      0.50      3218



In [25]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

X = df.drop(columns=['Learning_Style'])  # or Dropout_Likelihood
y = df['Learning_Style']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.25      0.23      0.24       517
           1       0.22      0.23      0.23       459
           2       0.26      0.28      0.27       524
           3       0.25      0.24      0.25       500

    accuracy                           0.25      2000
   macro avg       0.25      0.25      0.25      2000
weighted avg       0.25      0.25      0.25      2000



* Owlready2 * Running HermiT...
    java -Xmx2000M -cp C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\owlready2\hermit;C:\Users\HP\AppData\Roaming\Python\Python311\site-packages\owlready2\hermit\HermiT.jar org.semanticweb.HermiT.cli.CommandLine -c -O -D -I file:///C:/Users/HP/AppData/Local/Temp/tmp_ozoyw_c


Recommend VideoLecture for student_1


* Owlready2 * HermiT took 1.5231435298919678 seconds
* Owlready * (NB: only changes on entities loaded in Python are shown, other changes are done but not listed)


In [49]:
from owlready2 import *

# Load the ontology
onto = get_ontology("learning_ontology.owl").load()

# Create ContentType class and its individuals if not present
if "ContentType" not in onto.classes():
    with onto:
        class ContentType(Thing):
            pass

ContentType = onto.ContentType

# Create or get individuals
VideoLecture = onto.search_one(iri="*VideoLecture") or ContentType("VideoLecture")
AudioLecture = onto.search_one(iri="*AudioLecture") or ContentType("AudioLecture")
TextLecture = onto.search_one(iri="*TextLecture") or ContentType("TextLecture")

# Create LearningStyle class if not present
if "LearningStyle" not in onto.classes():
    with onto:
        class LearningStyle(Thing):
            pass

LearningStyle = onto.LearningStyle

# Create or get individuals
Visual = onto.search_one(iri="*Visual") or LearningStyle("Visual")
Auditory = onto.search_one(iri="*Auditory") or LearningStyle("Auditory")

# Define prefersContentType property if not already defined
if "prefersContentType" not in onto.properties():
    with onto:
        class prefersContentType(ObjectProperty):
            domain = [LearningStyle]
            range = [ContentType]

# Assign preferred content types (only if not already present to avoid duplicates)
if VideoLecture not in Visual.prefersContentType:
    Visual.prefersContentType.append(VideoLecture)

if AudioLecture not in Auditory.prefersContentType:
    Auditory.prefersContentType.append(AudioLecture)

# Save the updated ontology
onto.save(file="learning_ontology_updated.owl", format="rdfxml")
print("Ontology updated and saved.")


Ontology updated and saved.
