In [4]:
# -*- coding: utf-8 -*-
"""Bloom's Taxonomy Level Classification using Word2Vec and RandomForest."""

import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Load dataset
df = pd.read_csv("/content/combined_dataset.csv")

# Display basic dataset info
print("Dataset shape:", df.shape)
print("First few rows:\n", df.head())

# Preprocess text data
df['CLEAN_QUESTION'] = df['QUESTION'].apply(lambda x: gensim.utils.simple_preprocess(x))

# Map Bloom's Taxonomy levels to numerical values
bt_level_mapping = {
    'Remembering': 0,
    'Understanding': 1,
    'Applying': 2,
    'Analyzing': 3,
    'Evaluating': 4,
    'Creating': 5
}
df['BT LEVEL'] = df['BT LEVEL'].map(bt_level_mapping)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['CLEAN_QUESTION'], df['BT LEVEL'], test_size=0.2, random_state=42)

# Train Word2Vec model
w2v_model = gensim.models.Word2Vec(
    sentences=X_train,
    vector_size=100,
    window=1,
    min_count=2,
    workers=2
)

# Function to compute average word vectors for a sentence
def average_word_vectors(sentence, model, vector_size):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)

# Convert sentences to vectors
X_train_vect = np.array([average_word_vectors(sentence, w2v_model, 100) for sentence in X_train])
X_test_vect = np.array([average_word_vectors(sentence, w2v_model, 100) for sentence in X_test])

# Train RandomForest model
rf = RandomForestClassifier(random_state=42)
rf_model = rf.fit(X_train_vect, y_train)

# Predict on test set
y_pred = rf_model.predict(X_test_vect)

# Evaluate model performance
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
accuracy = accuracy_score(y_test, y_pred)

print('Precision: {:.3f} / Recall: {:.3f} / Accuracy: {:.3f}'.format(precision, recall, accuracy))

# Example: Using Logistic Regression
from sklearn.linear_model import LogisticRegression

# Train the model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_vect, y_train)

# Predict on test set
y_pred = model.predict(X_test_vect)

# Evaluate model performance
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
accuracy = accuracy_score(y_test, y_pred)

print('Precision: {:.3f} / Recall: {:.3f} / Accuracy: {:.3f}'.format(precision, recall, accuracy))

from sklearn.svm import SVC

# Initialize and train the model
model = SVC(kernel='linear', random_state=42)
model.fit(X_train_vect, y_train)


# Predict on test set
y_pred = model.predict(X_test_vect)

# Evaluate model performance
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
accuracy = accuracy_score(y_test, y_pred)

print('Precision: {:.3f} / Recall: {:.3f} / Accuracy: {:.3f}'.format(precision, recall, accuracy))

from xgboost import XGBClassifier

# Initialize and train the model
model = XGBClassifier(random_state=42)
model.fit(X_train_vect, y_train)

# Predict and evaluate
y_pred = model.predict(X_test_vect)

# Evaluate model performance
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
accuracy = accuracy_score(y_test, y_pred)

print('Precision: {:.3f} / Recall: {:.3f} / Accuracy: {:.3f}'.format(precision, recall, accuracy))

from sklearn.neighbors import KNeighborsClassifier

# Initialize and train the model
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train_vect, y_train)

# Predict and evaluate
y_pred = model.predict(X_test_vect)

# Evaluate model performance
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
accuracy = accuracy_score(y_test, y_pred)

print('Precision: {:.3f} / Recall: {:.3f} / Accuracy: {:.3f}'.format(precision, recall, accuracy))

from sklearn.neural_network import MLPClassifier

# Initialize and train the model
model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
model.fit(X_train_vect, y_train)

# Predict and evaluate
y_pred = model.predict(X_test_vect)

# Evaluate model performance
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
accuracy = accuracy_score(y_test, y_pred)

print('Precision: {:.3f} / Recall: {:.3f} / Accuracy: {:.3f}'.format(precision, recall, accuracy))


Dataset shape: (2522, 2)
First few rows:
                                             QUESTION       BT LEVEL
0  Suppose prices of two goods are constant, expl...  Understanding
1  Explain the concept of price leadership observ...  Understanding
2  Define profit. Briefly explain how accounting ...  Understanding
3  Describe the assumptions of monopolistic compe...  Understanding
4  Explain the meaning of the law of diminishing ...  Understanding
RandomForest -> Precision: 0.793 / Recall: 0.388 / Accuracy: 0.513
Precision: 0.605 / Recall: 0.354 / Accuracy: 0.469
Precision: 0.224 / Recall: 0.174 / Accuracy: 0.347


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision: 0.057 / Recall: 0.167 / Accuracy: 0.341
Precision: 0.510 / Recall: 0.419 / Accuracy: 0.495
Precision: 0.289 / Recall: 0.263 / Accuracy: 0.345
Precision: 0.057 / Recall: 0.167 / Accuracy: 0.341


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# -*- coding: utf-8 -*-
"""Bloom's Taxonomy Level Classification using Word2Vec and Various Classifiers."""

import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Load dataset
df = pd.read_csv("/content/combined_dataset.csv")

# Display basic dataset info
print("Dataset shape:", df.shape)
print("First few rows:\n", df.head())

# Preprocess text data
df['CLEAN_QUESTION'] = df['QUESTION'].apply(lambda x: gensim.utils.simple_preprocess(x))

# Map Bloom's Taxonomy levels to numerical values
bt_level_mapping = {
    'Remembering': 0,
    'Understanding': 1,
    'Applying': 2,
    'Analyzing': 3,
    'Evaluating': 4,
    'Creating': 5
}
df['BT LEVEL'] = df['BT LEVEL'].map(bt_level_mapping)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['CLEAN_QUESTION'], df['BT LEVEL'], test_size=0.2, random_state=42)

# Train Word2Vec model on the entire dataset
w2v_model = gensim.models.Word2Vec(
    sentences=df['CLEAN_QUESTION'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=2
)

# Function to compute average word vectors for a sentence
def average_word_vectors(sentence, model, vector_size):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)

# Convert sentences to vectors
X_train_vect = np.array([average_word_vectors(sentence, w2v_model, 100) for sentence in X_train])
X_test_vect = np.array([average_word_vectors(sentence, w2v_model, 100) for sentence in X_test])

# Apply PCA for dimensionality reduction
pca = PCA(n_components=50)  # Reduce vector size
X_train_pca = pca.fit_transform(X_train_vect)
X_test_pca = pca.transform(X_test_vect)

# Compute class weights for imbalance handling
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, class_weights))

# Train RandomForest with class weights
rf = RandomForestClassifier(random_state=42, class_weight=class_weight_dict)
rf_model = rf.fit(X_train_pca, y_train)

# Predict and evaluate RandomForest
y_pred = rf_model.predict(X_test_pca)
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
accuracy = accuracy_score(y_test, y_pred)
print('RandomForest -> Precision: {:.3f} / Recall: {:.3f} / Accuracy: {:.3f}'.format(precision, recall, accuracy))


# Train XGBoost with class weights
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train_pca, y_train)

# Predict and evaluate XGBoost
y_pred = xgb.predict(X_test_pca)
print('XGBoost -> Precision: {:.3f} / Recall: {:.3f} / Accuracy: {:.3f}'.format(
    precision_score(y_test, y_pred, average="macro"),
    recall_score(y_test, y_pred, average="macro"),
    accuracy_score(y_test, y_pred)
))


Dataset shape: (2522, 2)
First few rows:
                                             QUESTION       BT LEVEL
0  Suppose prices of two goods are constant, expl...  Understanding
1  Explain the concept of price leadership observ...  Understanding
2  Define profit. Briefly explain how accounting ...  Understanding
3  Describe the assumptions of monopolistic compe...  Understanding
4  Explain the meaning of the law of diminishing ...  Understanding
RandomForest -> Precision: 0.874 / Recall: 0.421 / Accuracy: 0.543
XGBoost -> Precision: 0.652 / Recall: 0.543 / Accuracy: 0.606


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Define parameter grids for hyperparameter tuning
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],  # Regularization strength
    'solver': ['liblinear', 'saga'],  # Solvers for logistic regression
    'max_iter': [1000, 2000, 3000]
}

param_grid_svc = {
    'C': [0.1, 1, 10],  # Regularization strength
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

param_grid_knn = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree']
}

param_grid_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (150,)],  # Vary number of neurons
    'max_iter': [1000, 2000],
    'solver': ['adam', 'sgd'],  # Change optimizer
    'learning_rate': ['constant', 'adaptive']
}

# GridSearch for Logistic Regression
lr_grid_search = GridSearchCV(LogisticRegression(class_weight='balanced', random_state=42), param_grid_lr, cv=3, n_jobs=-1, scoring='accuracy')
lr_grid_search.fit(X_train_pca, y_train)
print("Best parameters for Logistic Regression:", lr_grid_search.best_params_)

# Train the best Logistic Regression model
lr_best = lr_grid_search.best_estimator_
lr_best.fit(X_train_pca, y_train)
y_pred_lr = lr_best.predict(X_test_pca)
print('Logistic Regression -> Precision: {:.3f} / Recall: {:.3f} / Accuracy: {:.3f}'.format(
    precision_score(y_test, y_pred_lr, average="macro"),
    recall_score(y_test, y_pred_lr, average="macro"),
    accuracy_score(y_test, y_pred_lr)
))

# GridSearch for SVC
svc_grid_search = GridSearchCV(SVC(class_weight='balanced', random_state=42), param_grid_svc, cv=3, n_jobs=-1, scoring='accuracy')
svc_grid_search.fit(X_train_pca, y_train)
print("Best parameters for SVC:", svc_grid_search.best_params_)

# Train the best SVC model
svc_best = svc_grid_search.best_estimator_
svc_best.fit(X_train_pca, y_train)
y_pred_svc = svc_best.predict(X_test_pca)
print('SVC -> Precision: {:.3f} / Recall: {:.3f} / Accuracy: {:.3f}'.format(
    precision_score(y_test, y_pred_svc, average="macro"),
    recall_score(y_test, y_pred_svc, average="macro"),
    accuracy_score(y_test, y_pred_svc)
))

# GridSearch for KNN
knn_grid_search = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=3, n_jobs=-1, scoring='accuracy')
knn_grid_search.fit(X_train_pca, y_train)
print("Best parameters for KNN:", knn_grid_search.best_params_)

# Train the best KNN model
knn_best = knn_grid_search.best_estimator_
knn_best.fit(X_train_pca, y_train)
y_pred_knn = knn_best.predict(X_test_pca)
print('KNN -> Precision: {:.3f} / Recall: {:.3f} / Accuracy: {:.3f}'.format(
    precision_score(y_test, y_pred_knn, average="macro"),
    recall_score(y_test, y_pred_knn, average="macro"),
    accuracy_score(y_test, y_pred_knn)
))

# GridSearch for MLP (without class_weight)
mlp_grid_search = GridSearchCV(MLPClassifier(random_state=42), param_grid_mlp, cv=3, n_jobs=-1, scoring='accuracy')
mlp_grid_search.fit(X_train_pca, y_train)
print("Best parameters for MLP:", mlp_grid_search.best_params_)

# Train the best MLP model
mlp_best = mlp_grid_search.best_estimator_
mlp_best.fit(X_train_pca, y_train)
y_pred_mlp = mlp_best.predict(X_test_pca)
print('MLP -> Precision: {:.3f} / Recall: {:.3f} / Accuracy: {:.3f}'.format(
    precision_score(y_test, y_pred_mlp, average="macro"),
    recall_score(y_test, y_pred_mlp, average="macro"),
    accuracy_score(y_test, y_pred_mlp)
))


Best parameters for Logistic Regression: {'C': 0.01, 'max_iter': 1000, 'solver': 'liblinear'}
Logistic Regression -> Precision: 0.115 / Recall: 0.197 / Accuracy: 0.352


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best parameters for SVC: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
SVC -> Precision: 0.299 / Recall: 0.250 / Accuracy: 0.311
Best parameters for KNN: {'algorithm': 'auto', 'n_neighbors': 7, 'weights': 'distance'}
KNN -> Precision: 0.392 / Recall: 0.341 / Accuracy: 0.410
