In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_excel('./Grad_responses.xlsx', header=0)

# Data Augmentation

In [None]:
df.columns = df.columns.str.strip()

current_rows = df.shape[0]
target_rows = 2000
rows_to_add = target_rows - current_rows

In [None]:

synthetic_data = df.sample(n=rows_to_add, replace=True).reset_index(drop=True)

variation_columns = [
    "How do you prefer to learn new information? (select all that apply)",
    "How often do you benefit from visual aids (e.g., diagrams, animations)?",
    "Which of the following tools helps you understand topics better?",
    "Do you prefer learning at your own pace or following a structured schedule?",
    "What challenges do you face in traditional classroom learning? (Select all that apply)",
    "Do you feel that traditional learning methods address your individual needs?",
    "How important is feedback to you while learning new material?",
    "Do you think that gamified elements (e.g., points, levels, badges) make learning more enjoyable?",
    "What motivates you to stay engaged with learning?",
    "How comfortable are you using technology (e.g., apps, online platforms) for learning?",
    "Do you trust AI-based systems to provide personalized learning recommendations?",
    "Would you like explanations for why a system recommends certain lessons or feedback?",
    "How likely are you to use an AI tutoring system if it provides personalized content, feedback, and gamified quizzes?",
]

In [None]:
for col in variation_columns:
    unique_values = df[col].dropna().unique()
    synthetic_data[col] = np.random.choice(unique_values, size=rows_to_add)

start_date = pd.to_datetime("2025-01-01")
end_date = pd.to_datetime("2025-02-01")
synthetic_data["Timestamp"] = pd.to_datetime(
    np.random.uniform(start_date.timestamp(), end_date.timestamp(), size=rows_to_add), unit='s'
)

augmented_df = pd.concat([df, synthetic_data], ignore_index=True)

augmented_df.to_excel("Augmented_Grad_responses.xlsx", index=False)

print("Augmentation complete. The dataset is saved as 'Augmented_Grad_responses.xlsx'.")


Augmentation complete. The dataset is saved as 'Augmented_Grad_responses.xlsx'.


# Cleaning

In [None]:
df = pd.read_excel("Augmented_Grad_responses.xlsx")

In [None]:
df.columns = df.columns.str.strip()

if "Timestamp" in df.columns:
    df = df.drop(columns=["Timestamp"])

likert_mapping = {
    "Always": 3, "Often": 2, "Sometimes": 1, "Rarely": 0, "Never": 0
}
df["How often do you benefit from visual aids (e.g., diagrams, animations)?"] = df["How often do you benefit from visual aids (e.g., diagrams, animations)?"].map(likert_mapping)

binary_mapping = {"Yes": 1, "No": 0, "Somewhat": 0.5}
for col in [
    "Do you trust AI-based systems to provide personalized learning recommendations?",
    "Do you think that gamified elements (e.g., points, levels, badges) make learning more enjoyable?",
    "Would you like explanations for why a system recommends certain lessons or feedback?",
    "Have you used educational tools or platforms with gamified elements before?"
]:
    df[col] = df[col].map(binary_mapping)

feedback_mapping = {"Very important": 3, "Important": 2, "Somewhat important": 1, "Not important": 0}
df["How important is feedback to you while learning new material?"] = df["How important is feedback to you while learning new material?"].map(feedback_mapping)


# Feature engineering

In [None]:
import pandas as pd

# Load the dataset
file_path = "Augmented_Grad_responses.xlsx"  
df = pd.read_excel(file_path)

df.columns = df.columns.str.strip()

def contains_preference(row, keyword):
    return 1 if keyword in str(row) else 0

learning_pref_col = "How do you prefer to learn new information? (select all that apply)"

if learning_pref_col not in df.columns:
    raise ValueError(f"Missing column: {learning_pref_col}. Ensure augmentation is correct.")

df["Visual_Learner"] = df[learning_pref_col].apply(lambda x: contains_preference(x, "Watching videos or animations"))
df["Auditory_Learner"] = df[learning_pref_col].apply(lambda x: contains_preference(x, "Listening to explanations"))
df["Kinesthetic_Learner"] = df[learning_pref_col].apply(lambda x: contains_preference(x, "Hands-on practice"))

df = df.drop(columns=[learning_pref_col])

df.to_excel("Processed_Grad_responses.xlsx", index=False)


Feature engineering completed successfully! The dataset is saved as 'Processed_Grad_responses.xlsx'.


# KNN

In [21]:
import pandas as pd

df = pd.read_excel("Processed_Grad_responses.xlsx")

In [None]:
df = df.select_dtypes(exclude=["datetime64"])

df.fillna(df.mode().iloc[0], inplace=True)

learning_style_columns = ["Visual_Learner", "Auditory_Learner", "Kinesthetic_Learner"]
X = pd.get_dummies(df.drop(columns=learning_style_columns))  
Y = df[learning_style_columns]

# Print dataset shape
print("Dataset Loaded Successfully!")
print(f"Feature Shape: {X.shape}, Label Shape: {Y.shape}")


Dataset Loaded Successfully!
Feature Shape: (2000, 121), Label Shape: (2000, 3)


In [23]:
from sklearn.model_selection import train_test_split

# Split data into 80% training and 20% testing
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print("Data Split Completed!")
print(f"Training Set Size: {X_train.shape[0]} samples")
print(f"Testing Set Size: {X_test.shape[0]} samples")


Data Split Completed!
Training Set Size: 1600 samples
Testing Set Size: 400 samples


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Define the hyperparameter grid
param_grid = {
    "n_neighbors": [5, 10, 15, 20, 25, 30],  
    "weights": ["uniform", "distance"],  
    "metric": ["euclidean", "manhattan", "minkowski"]  
}

# Initialize KNN model
knn = KNeighborsClassifier()

grid_search = GridSearchCV(knn, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, Y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)


Best Hyperparameters: {'metric': 'euclidean', 'n_neighbors': 25, 'weights': 'uniform'}


In [None]:
best_knn = KNeighborsClassifier(
    n_neighbors=best_params["n_neighbors"],
    weights=best_params["weights"],
    metric=best_params["metric"]
)
best_knn.fit(X_train, Y_train)

Y_pred_final = best_knn.predict(X_test)

print("Optimized KNN Model Trained Successfully!")

Optimized KNN Model Trained Successfully!


In [None]:
from sklearn.metrics import classification_report

print("Optimized Model Evaluation:\n", classification_report(Y_test, Y_pred_final.round()))

Optimized Model Evaluation:
               precision    recall  f1-score   support

           0       0.53      0.65      0.59       222
           1       0.54      0.55      0.54       225
           2       0.54      0.69      0.60       211

   micro avg       0.54      0.63      0.58       658
   macro avg       0.54      0.63      0.58       658
weighted avg       0.54      0.63      0.58       658
 samples avg       0.53      0.60      0.52       658



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
import joblib

# Save the optimized model
joblib.dump(best_knn, "knn_learning_style_model.pkl")
print("Optimized model saved successfully as: optimized_knn_learning_style_model.pkl")


Optimized model saved successfully as: optimized_knn_learning_style_model.pkl
