## Importing the necessary libraries

In [1]:
import numpy as np
import pandas as pd
from tensorflow import keras
from keras.layers import Input, Embedding, LSTM, Dense, Bidirectional, SimpleRNN
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Loading the dataset

In [2]:
df = pd.read_csv("/kaggle/input/questionanalyserdataset/QuestionAnalyserDataset.csv")
df = df[['Questions','BloomsTaxClass']]
df

Unnamed: 0,Questions,BloomsTaxClass
0,Given is an array after the first partition of...,remember
1,How many steps are required to solve Tower of ...,remember
2,How many comparisons are required to find elem...,remember
3,"Given an array A[-3:4, 6:10], Find the address...",remember
4,"Consider the following list of 10 numbers: 35,...","apply , remember"
...,...,...
748,Find out the names of all American actors abov...,evaluate
749,Retrieve the name of each actor together with ...,apply
750,Retrieve details of all films that were releas...,apply
751,Find out the names of all actors that have pla...,evaluate


## Tokenizing the words in the Dataframe

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Questions'])
sequences = tokenizer.texts_to_sequences(df['Questions'])
max_len = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['BloomsTaxClass'])
vocab_size = len(tokenizer.word_index) + 1

## Spliting the data into training and testing sets

In [4]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)
print(type(X_train))

<class 'numpy.ndarray'>


## Defining the model architecture
Note that Here I have used the Bidirectional LSTM model *(As this topic is similar to Sentiment analysis)* 

In [5]:
inputs = Input(shape=(max_len,))
x = Embedding(vocab_size, 128, input_length=max_len)(inputs)
x = Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2))(x)
outputs = Dense(50, activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 203)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 203, 128)          306432    
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               263168    
_________________________________________________________________
dense (Dense)                (None, 50)                12850     
Total params: 582,450
Trainable params: 582,450
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f43181a9d10>

In [7]:
score, acc = model.evaluate(X_test, y_test, verbose=2)
print("Validation accuracy:", acc)

5/5 - 0s - loss: 3.2163 - accuracy: 0.3775
Validation accuracy: 0.37748345732688904


## Make predictions on new Data

In [8]:
new_questions = ['What types of programming languages are vulnerable to buffer overflows?', 'Construct the Binary Search Tree using following data. Show each steps. 32, 45, 12, 11, 13, 92, 78, 66, 17, 70,98, 108. Show its Preorder, Inorder and Postorder traversing sequences.']
new_sequences = tokenizer.texts_to_sequences(new_questions)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_len, padding='post')
predictions = model.predict(new_padded_sequences)
predicted_labels = label_encoder.inverse_transform(np.argmax(predictions, axis=1))
print(predicted_labels)


['Remember' 'create']


# Implementation using Glove

In [9]:
# Import the necessary libraries
import numpy as np
import pandas as pd
from tensorflow import keras
from keras.layers import Input, Embedding, LSTM, Dense, Bidirectional
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [10]:
# Load the Bloom's Taxonomy dataset
#df = pd.read_csv("QuestionAnalyserDatasetUpdated.csv")
#df = df[['Questions','Blooms Taxonomy']]  

In [11]:
# Load the pre-trained GloVe embeddings
word_embeddings = {}
with open('/kaggle/input/glove6b100dtxt/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [12]:
# Tokenize the text and convert it to sequences
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['Questions'])
sequences = tokenizer.texts_to_sequences(df['Questions'])
max_len = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

In [13]:
# Create an embedding matrix for the pre-trained GloVe embeddings
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = word_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [14]:
# Convert the labels to numerical values
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['BloomsTaxClass'])

In [15]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

In [37]:
# Define the model architecture
inputs = Input(shape=(max_len,))
x = Embedding(len(tokenizer.word_index) + 1, 100, weights=[embedding_matrix], input_length=max_len, trainable=False)(inputs)
x = Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2))(x)
outputs = Dense(50, activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)
optimizer = keras.optimizers.Adam(learning_rate=0.005)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()


Model: "model_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         [(None, 203)]             0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 203, 100)          239400    
_________________________________________________________________
bidirectional_8 (Bidirection (None, 256)               234496    
_________________________________________________________________
dense_8 (Dense)              (None, 50)                12850     
Total params: 486,746
Trainable params: 247,346
Non-trainable params: 239,400
_________________________________________________________________


In [None]:
# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50

In [36]:
score, acc = model.evaluate(X_test, y_test, verbose=2)
print("Validation accuracy:", acc)

5/5 - 0s - loss: 2.9013 - accuracy: 0.3974
Validation accuracy: 0.3973509967327118


In [19]:
# Make predictions on new data
new_questions = ['What can be the maximum number of nodes in binary tree with height 4?', 'Write an algorithm to insert a node at beginning in circular linked list.']
new_sequences = tokenizer.texts_to_sequences(new_questions)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_len, padding='post')
predictions = model.predict(new_padded_sequences)
predicted_labels = label_encoder.inverse_transform(np.argmax(predictions, axis=1))
print(predicted_labels)

['remember' 'create']


## FastText Implementation Trial.

In [None]:
import numpy as np
import pandas as pd
import fasttext
from tensorflow import keras
from keras.layers import Input, LSTM, Dense, Bidirectional, SimpleRNN
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Load the data
df = pd.read_csv("/kaggle/input/questionanalyserdataset/QuestionAnalyserDataset.csv")
df = df[['Questions','BloomsTaxClass']]

In [None]:
# Tokenize the questions
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Questions'])
sequences = tokenizer.texts_to_sequences(df['Questions'])
max_len = max([len(seq) for seq in sequences])

In [None]:
# Pad the sequences to a fixed length
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Encode the labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['BloomsTaxClass'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

In [None]:
# Load the FastText pre-trained word embeddings
ft_model = fasttext.load_model('cc.en.300.bin')

# Generate word embeddings for each question in the dataset
embedding_dim = ft_model.get_dimension()
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_matrix[i] = ft_model[word]

In [None]:
# Define the model architecture
inputs = Input(shape=(max_len,))
x = keras.layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(inputs)
x = Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2))(x)
outputs = Dense(50, activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model..summary()

In [None]:
# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=64)

In [None]:
# Evaluate the model
score, acc = model.evaluate(X_test, y_test, verbose=2)
print("Validation accuracy:", acc)

In [None]:
# Use the model to make predictions on new questions
new_questions = ['What types of programming languages are vulnerable to buffer overflows?', 'Construct the Binary Search Tree using following data. Show each steps. 32, 45, 12, 11, 13, 92, 78, 66, 17, 70,98, 108. Show its Preorder, Inorder and Postorder traversing sequences.']
new_sequences = tokenizer.texts_to_sequences(new_questions)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_len, padding='post')
predictions = model.predict(new_padded_sequences)
predicted_labels = label_encoder.inverse_transform(np.argmax(predictions, axis=1))
print(predicted_labels)

In [1]:
import numpy as np
import pandas as pd
from tensorflow import keras
from keras.layers import Input, Embedding, LSTM, Dense, Bidirectional, SimpleRNN
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold

# Load the data
df = pd.read_csv("QuestionAnalyserDataset.csv")
df = df[['Questions','Blooms Taxonomy']]

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Questions'])
sequences = tokenizer.texts_to_sequences(df['Questions'])
max_len = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
vocab_size = len(tokenizer.word_index) + 1

# Encode the labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['Blooms Taxonomy'])

inputs = Input(shape=(max_len,))
x = Embedding(vocab_size, 128, input_length=max_len)(inputs)
x = Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2))(x)
outputs = Dense(50, activation='sigmoid')(x)
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

# Define k-fold cross validation
k = 5
kfold = KFold(n_splits=k, shuffle=True)

# Perform k-fold cross validation
test_losses = []
test_accuracies = []
for i, (train_idx, test_idx) in enumerate(kfold.split(padded_sequences)):
    X_train, X_test = padded_sequences[train_idx], padded_sequences[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    # Compile the model
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train the model
    print('Fold:', i+1)
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=64, verbose=1)

    # Evaluate the model on the test set
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
    test_losses.append(test_loss)
    test_accuracies.append(test_acc)

# Print the average test loss and accuracy across all folds
print('Average test loss:', np.mean(test_losses))
print('Average test accuracy:', np.mean(test_accuracies))