In [1]:
import pandas as pd

# Read the collected question dataset from disk and report its (rows, columns).
dataset_path = '/content/collected_dataset.csv'
df = pd.read_csv(dataset_path)
print(df.shape)

# Preview the first rows.
df.head()

(1200, 2)


Unnamed: 0,QUESTION,BT LEVEL
0,"Suppose prices of two goods are constant, expl...",COMPREHENSION
1,Explain the concept of price leadership observ...,COMPREHENSION
2,Define profit. Briefly explain how accounting ...,COMPREHENSION
3,Describe the assumptions of monopolistic compe...,COMPREHENSION
4,Explain the meaning of the law of diminishing ...,COMPREHENSION


In [2]:
# Count how many questions carry each "BT LEVEL" label.
df['BT LEVEL'].value_counts()

Unnamed: 0_level_0,count
BT LEVEL,Unnamed: 1_level_1
COMPREHENSION,669
KNOWLEDGE,149
EVALUATION,107
APPLICATION,100
ANALYSIS,99
SYNTHESIS,76


In [3]:
# Down-sample every "BT LEVEL" class to the size of the rarest class so that
# all classes end up with the same number of questions.
# The size is derived from the data instead of being hard-coded, so the cell
# keeps working if the dataset changes.
min_samples = int(df["BT LEVEL"].value_counts().min())


def _sample_level(level):
    """Return `min_samples` randomly drawn rows of `df` whose "BT LEVEL" equals `level`.

    A fixed `random_state` makes the draw reproducible across runs.
    """
    return df[df["BT LEVEL"] == level].sample(min_samples, random_state=2022)


df_comprehension = _sample_level("COMPREHENSION")
df_knowledge = _sample_level("KNOWLEDGE")
df_evaluation = _sample_level("EVALUATION")
df_application = _sample_level("APPLICATION")
df_analysis = _sample_level("ANALYSIS")
df_synthesis = _sample_level("SYNTHESIS")

In [4]:
# Stack the per-class samples row-wise into a single frame.
per_class_frames = [
    df_comprehension,
    df_knowledge,
    df_evaluation,
    df_application,
    df_analysis,
    df_synthesis,
]
df_balanced = pd.concat(per_class_frames, axis=0)

# Check the class counts of the combined frame.
df_balanced["BT LEVEL"].value_counts()

Unnamed: 0_level_0,count
BT LEVEL,Unnamed: 1_level_1
COMPREHENSION,76
KNOWLEDGE,76
EVALUATION,76
APPLICATION,76
ANALYSIS,76
SYNTHESIS,76


In [5]:
# Preview the first rows of the balanced frame.
df_balanced.head()

Unnamed: 0,QUESTION,BT LEVEL
780,Elaborate on the design and production process...,COMPREHENSION
947,Enzymes are applied in the medical field based...,COMPREHENSION
94,Define and explain the THREE (3) steps of brin...,COMPREHENSION
908,Draw a table indicating purification fold and ...,COMPREHENSION
136,Banks act as an important institution in the f...,COMPREHENSION


In [6]:
# Integer class id assigned to each "BT LEVEL" category.
bt_level_to_id = {
    "COMPREHENSION": 0,
    "KNOWLEDGE": 1,
    "EVALUATION": 2,
    "APPLICATION": 3,
    "ANALYSIS": 4,
    "SYNTHESIS": 5,
}
df_balanced['label_num'] = df_balanced['BT LEVEL'].map(bt_level_to_id)

# Inspect the new column.
df_balanced.head(5)

Unnamed: 0,QUESTION,BT LEVEL,label_num
780,Elaborate on the design and production process...,COMPREHENSION,0
947,Enzymes are applied in the medical field based...,COMPREHENSION,0
94,Define and explain the THREE (3) steps of brin...,COMPREHENSION,0
908,Draw a table indicating purification fold and ...,COMPREHENSION,0
136,Banks act as an important institution in the f...,COMPREHENSION,0


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced questions for testing. The split is stratified
# on the numeric label and seeded for reproducibility.
raw_questions = df_balanced['QUESTION']
label_ids = df_balanced['label_num']
X_train, X_test, y_train, y_test = train_test_split(
    raw_questions,
    label_ids,
    test_size=0.2,
    random_state=2022,
    stratify=label_ids,
)

In [8]:
# Report the size of each split.
for split_caption, split_data in (
    ("Shape of X_train: ", X_train),
    ("Shape of X_test: ", X_test),
):
    print(split_caption, split_data.shape)

Shape of X_train:  (364,)
Shape of X_test:  (92,)


In [9]:
# Preview the first training questions.
X_train.head()

Unnamed: 0,QUESTION
638,State the THREE (3) approaches to calculate na...
115,Illustrate the effect of an expansionary monet...
1107,State what does the symbol “+” represents.
1198,Compare and contrast continuous and discontinu...
1004,Determine which Angus bull is the heaviest. Ju...


In [10]:
# Count the training labels per class to check the stratified split.
y_train.value_counts()

Unnamed: 0_level_0,count
label_num,Unnamed: 1_level_1
1,61
2,61
0,61
3,61
4,60
5,60


In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

# Unigram bag-of-words counts fed into a multinomial Naive Bayes classifier.
unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
clf = Pipeline(steps=[
    ('vectorizer_bow', unigram_vectorizer),
    ('Multi NB', MultinomialNB()),
])

# Train on the training split, then predict labels for the test split.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.47      0.56        15
           1       0.90      0.60      0.72        15
           2       0.74      0.93      0.82        15
           3       0.67      0.80      0.73        15
           4       0.62      0.62      0.62        16
           5       0.58      0.69      0.63        16

    accuracy                           0.68        92
   macro avg       0.70      0.69      0.68        92
weighted avg       0.70      0.68      0.68        92



In [12]:
# Same classifier, with count features over unigrams and bigrams.
pipeline_steps = [
    ('vectorizer_1_2_gram', CountVectorizer(ngram_range=(1, 2))),
    ('Multi NB', MultinomialNB()),
]
clf = Pipeline(pipeline_steps)

# Fit on the training split.
clf.fit(X_train, y_train)

# Predict labels for the test split.
y_pred = clf.predict(X_test)

# Report per-class precision, recall and F1.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.53      0.57        15
           1       0.90      0.60      0.72        15
           2       0.79      1.00      0.88        15
           3       0.71      0.80      0.75        15
           4       0.62      0.62      0.62        16
           5       0.71      0.75      0.73        16

    accuracy                           0.72        92
   macro avg       0.72      0.72      0.71        92
weighted avg       0.72      0.72      0.71        92



In [13]:
# Same classifier, with count features over unigrams, bigrams and trigrams.
trigram_range = (1, 3)
clf = Pipeline([
    ('vectorizer_1_3_grams', CountVectorizer(ngram_range=trigram_range)),
    ('Multi NB', MultinomialNB()),
])

# Fit on the training split.
clf.fit(X_train, y_train)

# Store the test-split predictions in y_pred.
y_pred = clf.predict(X_test)

# Print the classification report.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.64      0.47      0.54        15
           1       0.90      0.60      0.72        15
           2       0.75      1.00      0.86        15
           3       0.60      0.80      0.69        15
           4       0.71      0.62      0.67        16
           5       0.65      0.69      0.67        16

    accuracy                           0.70        92
   macro avg       0.71      0.70      0.69        92
weighted avg       0.71      0.70      0.69        92



In [14]:
### Utility function for pre-processing the text
import spacy

# Load the "en_core_web_sm" English model and create the `nlp` object that
# preprocess() calls to parse each text.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Drop stop words and punctuation from `text` and lemmatize the rest.

    `text` is parsed with the module-level `nlp` object. The lemmas of the
    tokens that are neither stop words nor punctuation are returned as one
    string, separated by single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [15]:
# Run preprocess() over every question and keep the result in a new column.
original_questions = df_balanced['QUESTION']
df_balanced['preprocessed_question'] = original_questions.apply(preprocess)

In [16]:
# Preview the frame, now including the preprocessed_question column.
df_balanced.head()

Unnamed: 0,QUESTION,BT LEVEL,label_num,preprocessed_question
780,Elaborate on the design and production process...,COMPREHENSION,0,elaborate design production process include ra...
947,Enzymes are applied in the medical field based...,COMPREHENSION,0,enzyme apply medical field base specific funct...
94,Define and explain the THREE (3) steps of brin...,COMPREHENSION,0,define explain 3 step bring new security issue...
908,Draw a table indicating purification fold and ...,COMPREHENSION,0,draw table indicate purification fold yield step
136,Banks act as an important institution in the f...,COMPREHENSION,0,bank act important institution financial syste...


In [17]:
# Show the original question stored under index label 780.
df_balanced.loc[780, 'QUESTION']

'Elaborate on the design and production process including the rational and economic use of resources that influence the development of a building. '

In [18]:
# Show the preprocessed text stored under index label 780.
df_balanced.loc[780, 'preprocessed_question']

'elaborate design production process include rational economic use resource influence development building'

In [19]:
# Re-create the train/test split, this time on the preprocessed text.
# Stratified on the numeric label and seeded for reproducibility.
clean_questions = df_balanced.preprocessed_question
class_ids = df_balanced.label_num
X_train, X_test, y_train, y_test = train_test_split(
    clean_questions,
    class_ids,
    test_size=0.2,  # 20% of the samples go to the test set
    random_state=2022,
    stratify=class_ids,
)

In [20]:
# Unigram bag-of-words counts fed into a multinomial Naive Bayes classifier.
bow_step = ('vectorizer_bow', CountVectorizer(ngram_range=(1, 1)))
nb_step = ('Multi NB', MultinomialNB())
clf = Pipeline([bow_step, nb_step])

# Fit on the training split.
clf.fit(X_train, y_train)

# Store the test-split predictions in y_pred.
y_pred = clf.predict(X_test)

# Print the classification report.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.60      0.69        15
           1       0.81      0.87      0.84        15
           2       0.86      0.80      0.83        15
           3       0.65      0.73      0.69        15
           4       0.56      0.56      0.56        16
           5       0.67      0.75      0.71        16

    accuracy                           0.72        92
   macro avg       0.73      0.72      0.72        92
weighted avg       0.72      0.72      0.72        92



In [21]:
# 1. Create a pipeline object: unigram + bigram counts fed into a multinomial
#    Naive Bayes classifier.
clf = Pipeline([
    # The step name now reflects the (1, 2) n-gram range. It was previously
    # 'vectorizer_bow', the name the unigram pipelines use, which mislabelled
    # this step.
    ('vectorizer_1_2_gram', CountVectorizer(ngram_range=(1, 2))),
    ('Multi NB', MultinomialNB()),
])

# 2. Fit with X_train and y_train.
clf.fit(X_train, y_train)

# 3. Get the predictions for X_test and store them in y_pred.
y_pred = clf.predict(X_test)

# 4. Print the classification report.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.67      0.74        15
           1       0.92      0.73      0.81        15
           2       0.72      0.87      0.79        15
           3       0.67      0.67      0.67        15
           4       0.53      0.56      0.55        16
           5       0.56      0.62      0.59        16

    accuracy                           0.68        92
   macro avg       0.70      0.69      0.69        92
weighted avg       0.70      0.68      0.69        92



In [22]:
# Count features over unigrams, bigrams and trigrams, classified with
# multinomial Naive Bayes.
up_to_trigram_vectorizer = CountVectorizer(ngram_range=(1, 3))
clf = Pipeline(steps=[
    ('vectorizer_1_3_grams', up_to_trigram_vectorizer),
    ('Multi NB', MultinomialNB()),
])

# Train on the training split, then predict labels for the test split.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Print the classification report.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.67      0.74        15
           1       0.92      0.73      0.81        15
           2       0.72      0.87      0.79        15
           3       0.67      0.67      0.67        15
           4       0.53      0.56      0.55        16
           5       0.56      0.62      0.59        16

    accuracy                           0.68        92
   macro avg       0.70      0.69      0.69        92
weighted avg       0.70      0.68      0.69        92

