### Importing the necessary libraries

In [None]:
import numpy as np
import pandas as pd
from tensorflow import keras
from keras.layers import Input, Embedding, LSTM, Dense, Bidirectional, SimpleRNN
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Loading the dataset

In [None]:
# Read the CSV and keep only the question text and its 'Blooms Taxonomy' label.
dataset_path = "/kaggle/input/questionanalyserdataset/QuestionAnalyserDataset.csv"
df = pd.read_csv(dataset_path)[['Questions', 'Blooms Taxonomy']]
df

## Tokenizing the words in the Dataframe

In [None]:
# Map every distinct 'Blooms Taxonomy' value to an integer class id.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['Blooms Taxonomy'])

# Learn the vocabulary from the questions and convert each question
# into a list of word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Questions'])
sequences = tokenizer.texts_to_sequences(df['Questions'])

# word_index starts at 1, so the embedding needs one extra row for index 0.
vocab_size = len(tokenizer.word_index) + 1

# Pad at the end so every sequence is as long as the longest question.
max_len = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

## Splitting the data into training and testing sets

In [None]:
# Hold out 20% of the samples; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts
print(type(X_train))

## Defining the model architecture
Note: a Bidirectional LSTM model is used here *(question classification is similar to sentiment analysis)*.

In [None]:
# One output unit per distinct label known to the encoder, so every argmax
# index can be mapped back to a label with label_encoder.inverse_transform.
num_classes = len(label_encoder.classes_)

inputs = Input(shape=(max_len,))
x = Embedding(vocab_size, 128, input_length=max_len)(inputs)
x = Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2))(x)
# softmax (not sigmoid): sparse_categorical_crossentropy expects the outputs
# to form a probability distribution over mutually exclusive classes.
outputs = Dense(num_classes, activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
# Fit for 50 epochs; loss and accuracy on the held-out split are reported
# after every epoch via validation_data.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=64,
)

In [None]:
# evaluate() yields the loss followed by the compiled metric (accuracy).
evaluation = model.evaluate(X_test, y_test, verbose=2)
score, acc = evaluation
print("Validation accuracy:", acc)

## Make predictions on new Data

In [None]:
new_questions = ['What types of programming languages are vulnerable to buffer overflows?', 'Construct the Binary Search Tree using following data. Show each steps. 32, 45, 12, 11, 13, 92, 78, 66, 17, 70,98, 108. Show its Preorder, Inorder and Postorder traversing sequences.']

# Encode the new questions with the same tokenizer and padding used for training.
new_sequences = tokenizer.texts_to_sequences(new_questions)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_len, padding='post')
predictions = model.predict(new_padded_sequences)

# Only the first len(classes_) output columns correspond to real labels.
# Restricting argmax to them prevents inverse_transform from raising
# ValueError when the model has more output units than there are classes.
num_classes = len(label_encoder.classes_)
predicted_ids = np.argmax(predictions[:, :num_classes], axis=1)
predicted_labels = label_encoder.inverse_transform(predicted_ids)
print(predicted_labels)


# Implementation using GloVe

In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
from tensorflow import keras
from keras.layers import Input, Embedding, LSTM, Dense, Bidirectional
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
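# The Bloom's Taxonomy dataset is not reloaded here: the cells below reuse the
# `df` loaded earlier in the notebook. Uncomment to load a different file.
#df = pd.read_csv("QuestionAnalyserDatasetUpdated.csv")
#df = df[['Questions','Blooms Taxonomy']]  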

In [None]:
# Parse the pre-trained GloVe text file: on each line the first
# whitespace-separated field is the token, the rest are its vector components.
word_embeddings = {}
with open('/kaggle/input/glove6b/glove.6B.300d.txt', encoding='utf-8') as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        token, components = fields[0], fields[1:]
        word_embeddings[token] = np.asarray(components, dtype='float32')

In [None]:
# Build the vocabulary from the questions and map each question to word indices.
tokenizer = keras.preprocessing.text.Tokenizer()
questions = df['Questions']
tokenizer.fit_on_texts(questions)
sequences = tokenizer.texts_to_sequences(questions)

# Pad at the end so all sequences share the length of the longest one.
max_len = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

In [None]:
# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Words that GloVe does not contain keep their all-zero row.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 300))
for word, i in tokenizer.word_index.items():
    vector = word_embeddings.get(word)
    if vector is None:
        continue
    embedding_matrix[i] = vector

In [None]:
# Learn the set of distinct 'Blooms Taxonomy' values, then replace each
# value with its integer class id.
label_encoder = LabelEncoder()
label_encoder.fit(df['Blooms Taxonomy'])
labels = label_encoder.transform(df['Blooms Taxonomy'])

In [None]:
# 80/20 train/test partition with a fixed seed so reruns give the same split.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Define the model architecture: frozen GloVe embeddings -> BiLSTM -> softmax.

# Size the output layer from the data instead of a hard-coded constant, so
# every predicted index maps to a label the encoder actually knows.
num_classes = len(label_encoder.classes_)

inputs = Input(shape=(max_len,))
# The embedding is initialised from the GloVe matrix and kept frozen
# (trainable=False), so only the recurrent and dense layers are trained.
x = Embedding(len(tokenizer.word_index) + 1, 300, weights=[embedding_matrix], input_length=max_len, trainable=False)(inputs)
x = Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2))(x)
outputs = Dense(num_classes, activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)
optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()


In [None]:
# Fit for 50 epochs; the held-out split is scored after each epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=64,
)

In [None]:
# evaluate() yields the loss followed by the compiled metric (accuracy).
evaluation = model.evaluate(X_test, y_test, verbose=2)
score, acc = evaluation
print("Validation accuracy:", acc)

In [None]:
# Make predictions on new data
new_questions = ['What can be the maximum number of nodes in binary tree with height 4?', 'Write an algorithm to insert a node at beginning in circular linked list.']

# Encode the new questions with the same tokenizer and padding used for training.
new_sequences = tokenizer.texts_to_sequences(new_questions)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_len, padding='post')
predictions = model.predict(new_padded_sequences)

# Only the first len(classes_) output columns correspond to real labels.
# Restricting argmax to them prevents inverse_transform from raising
# ValueError when the model has more output units than there are classes.
num_classes = len(label_encoder.classes_)
predicted_ids = np.argmax(predictions[:, :num_classes], axis=1)
predicted_labels = label_encoder.inverse_transform(predicted_ids)
print(predicted_labels)

In [None]:
import numpy as np
import pandas as pd
from tensorflow import keras
# SimpleRNN is used below, so it is imported here; otherwise this cell only
# works when an earlier cell happened to import it.
from keras.layers import Input, Embedding, LSTM, Dense, Bidirectional, SimpleRNN
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the data
df = pd.read_csv("/kaggle/input/questionanalyserdataset/QuestionAnalyserDataset.csv")
df = df[['Questions','Blooms Taxonomy']]

# Tokenize the text and pad every question to the length of the longest one
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Questions'])
sequences = tokenizer.texts_to_sequences(df['Questions'])
max_len = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
vocab_size = len(tokenizer.word_index) + 1

# Encode the labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['Blooms Taxonomy'])
# One output unit per distinct label instead of a hard-coded constant.
num_classes = len(label_encoder.classes_)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Define the model architecture: two stacked bidirectional SimpleRNN layers.
inputs = Input(shape=(max_len,))
x = Embedding(vocab_size, 128, input_length=max_len)(inputs)
# return_sequences=True so the second recurrent layer receives the full sequence.
x = Bidirectional(SimpleRNN(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))(x)
x = Bidirectional(SimpleRNN(64, dropout=0.2, recurrent_dropout=0.2))(x)
x = Dense(64, activation='relu')(x)
outputs = Dense(num_classes, activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=32)

# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)

In [2]:
import numpy as np
import pandas as pd
from tensorflow import keras
from keras.layers import Input, Embedding, LSTM, Dense, Bidirectional
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold

In [3]:
# Read the CSV and keep only the question text and its 'Blooms Taxonomy' label.
dataset_path = "/kaggle/input/questionanalyserdataset/QuestionAnalyserDataset.csv"
df = pd.read_csv(dataset_path)[['Questions', 'Blooms Taxonomy']]

In [5]:
# Learn the vocabulary from the questions and convert each question
# into a list of word indices.
tokenizer = Tokenizer()
questions = df['Questions']
tokenizer.fit_on_texts(questions)
sequences = tokenizer.texts_to_sequences(questions)

# word_index starts at 1, so the embedding needs one extra row for index 0.
vocab_size = len(tokenizer.word_index) + 1

# Pad at the end so every sequence is as long as the longest question.
max_len = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

In [6]:
# Learn the set of distinct 'Blooms Taxonomy' values, then replace each
# value with its integer class id.
label_encoder = LabelEncoder()
label_encoder.fit(df['Blooms Taxonomy'])
labels = label_encoder.transform(df['Blooms Taxonomy'])

In [7]:
# Define the model architecture: two stacked bidirectional LSTM layers.
# The output layer gets one unit per distinct label instead of a hard-coded
# constant, so every predicted index maps to a label the encoder knows.
num_classes = len(label_encoder.classes_)

inputs = Input(shape=(max_len,))
x = Embedding(vocab_size, 128, input_length=max_len)(inputs)
# return_sequences=True so the second LSTM receives the full sequence.
x = Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))(x)
x = Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2))(x)
x = Dense(64, activation='relu')(x)
outputs = Dense(num_classes, activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)

# Define k-fold cross validation.
# A fixed random_state makes the shuffled folds reproducible across runs.
k = 5
kfold = KFold(n_splits=k, shuffle=True, random_state=42)

# Per-fold results, filled in by the cross-validation loop
test_losses = []
test_accuracies = []

In [9]:
for train_idx, test_idx in kfold.split(padded_sequences):
    X_train, X_test = padded_sequences[train_idx], padded_sequences[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    # Start every fold from freshly initialised weights. Re-compiling the same
    # model does NOT reset its weights, so reusing `model` would evaluate later
    # folds on samples it had already been trained on in earlier folds.
    # clone_model creates new layers (and thus new weights) with the same
    # architecture as the `model` template.
    fold_model = keras.models.clone_model(model)

    # Compile the model
    fold_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train the model
    fold_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64, verbose=0)

    # Evaluate the model on the held-out fold
    test_loss, test_acc = fold_model.evaluate(X_test, y_test, verbose=0)
    test_losses.append(test_loss)
    test_accuracies.append(test_acc)


KeyboardInterrupt: 

In [None]:
# Summarise cross-validation: mean loss and mean accuracy over the folds
# collected in test_losses / test_accuracies.
mean_test_loss = np.mean(test_losses)
mean_test_accuracy = np.mean(test_accuracies)
print('Average test loss:', mean_test_loss)
print('Average test accuracy:', mean_test_accuracy)

## Bidirectional RNN using Cross Validation (k=5)

In [11]:
import numpy as np
import pandas as pd
from tensorflow import keras
from keras.layers import Input, Embedding, LSTM, Dense, Bidirectional, SimpleRNN
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold

# Load the data
df = pd.read_csv("/kaggle/input/questionanalyserdataset/QuestionAnalyserDataset.csv")
df = df[['Questions','Blooms Taxonomy']]

# Tokenize the text and pad every question to the length of the longest one
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Questions'])
sequences = tokenizer.texts_to_sequences(df['Questions'])
max_len = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
vocab_size = len(tokenizer.word_index) + 1

# Encode the labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['Blooms Taxonomy'])
# One output unit per distinct label instead of a hard-coded constant.
num_classes = len(label_encoder.classes_)


def build_model():
    """Return a new, compiled stacked bidirectional SimpleRNN classifier.

    Each call creates fresh layers, so the returned model starts from newly
    initialised weights and shares nothing with previously built models.
    """
    inputs = Input(shape=(max_len,))
    x = Embedding(vocab_size, 128, input_length=max_len)(inputs)
    # The original `LS(...)` was an undefined name (NameError); this section
    # is the bidirectional RNN variant, so SimpleRNN is used for both layers.
    # return_sequences=True so the second layer receives the full sequence.
    x = Bidirectional(SimpleRNN(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))(x)
    x = Bidirectional(SimpleRNN(64, dropout=0.2, recurrent_dropout=0.2))(x)
    x = Dense(64, activation='relu')(x)
    outputs = Dense(num_classes, activation='softmax')(x)
    net = Model(inputs=inputs, outputs=outputs)
    net.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net


# Define k-fold cross validation.
# A fixed random_state makes the shuffled folds reproducible across runs.
k = 5
kfold = KFold(n_splits=k, shuffle=True, random_state=42)

# Perform k-fold cross validation
test_losses = []
test_accuracies = []
for i, (train_idx, test_idx) in enumerate(kfold.split(padded_sequences)):
    X_train, X_test = padded_sequences[train_idx], padded_sequences[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    # Build a fresh model per fold. Reusing one model would carry trained
    # weights into the next fold and evaluate it on data it has already seen.
    model = build_model()

    # Train the model
    print('Fold:', i+1)
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=64, verbose=1)

    # Evaluate the model on the held-out fold
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
    test_losses.append(test_loss)
    test_accuracies.append(test_acc)

# Print the average test loss and accuracy across all folds
print('Average test loss:', np.mean(test_losses))
print('Average test accuracy:', np.mean(test_accuracies))


Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 203)]             0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 203, 128)          409088    
_________________________________________________________________
bidirectional_6 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_6 (Dense)              (None, 50)                12850     
Total params: 685,106
Trainable params: 685,106
Non-trainable params: 0
_________________________________________________________________
Fold: 1
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50

KeyboardInterrupt: 

## Bidirectional LSTM using Cross validation (k = 5)

In [None]:
import numpy as np
import pandas as pd
from tensorflow import keras
from keras.layers import Input, Embedding, LSTM, Dense, Bidirectional, SimpleRNN
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold

# Load the data
df = pd.read_csv("/kaggle/input/questionanalyserdataset/QuestionAnalyserDataset.csv")
df = df[['Questions','Blooms Taxonomy']]

# Tokenize the text and pad every question to the length of the longest one
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Questions'])
sequences = tokenizer.texts_to_sequences(df['Questions'])
max_len = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
vocab_size = len(tokenizer.word_index) + 1

# Encode the labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['Blooms Taxonomy'])
# One output unit per distinct label instead of a hard-coded constant.
num_classes = len(label_encoder.classes_)


def build_model():
    """Return a new, compiled single-layer bidirectional LSTM classifier.

    Each call creates fresh layers, so the returned model starts from newly
    initialised weights and shares nothing with previously built models.
    """
    inputs = Input(shape=(max_len,))
    x = Embedding(vocab_size, 128, input_length=max_len)(inputs)
    x = Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2))(x)
    # softmax (not sigmoid): sparse_categorical_crossentropy expects the
    # outputs to form a probability distribution over exclusive classes.
    outputs = Dense(num_classes, activation='softmax')(x)
    net = Model(inputs=inputs, outputs=outputs)
    net.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net


# Show the architecture once; every fold trains its own copy of it.
build_model().summary()

# Define k-fold cross validation.
# A fixed random_state makes the shuffled folds reproducible across runs.
k = 5
kfold = KFold(n_splits=k, shuffle=True, random_state=42)

# Perform k-fold cross validation
test_losses = []
test_accuracies = []
for i, (train_idx, test_idx) in enumerate(kfold.split(padded_sequences)):
    X_train, X_test = padded_sequences[train_idx], padded_sequences[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    # Build a fresh model per fold. Reusing one model would carry trained
    # weights into the next fold and evaluate it on data it has already seen.
    model = build_model()

    # Train the model
    print('Fold:', i+1)
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=64, verbose=1)

    # Evaluate the model on the held-out fold
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
    test_losses.append(test_loss)
    test_accuracies.append(test_acc)

# Print the average test loss and accuracy across all folds
print('Average test loss:', np.mean(test_losses))
print('Average test accuracy:', np.mean(test_accuracies))