In [None]:
# Data preprocessing

In [1]:
import pandas as pd
import csv

In [2]:
# Load the dataset.
# Fields inside a record are separated by '|++|', not by commas. csv.reader
# still splits every record on commas, so a comma inside a field spreads the
# record over several cells; taking only the first cell would truncate it.
# Re-joining the cells with ',' restores the full record before it is split on
# the real separator. Blank lines (which csv.reader yields as empty lists) are
# skipped instead of raising IndexError.
rows = []
with open('Dataset/Data.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile)
    for row in spamreader:
        if not row:
            continue
        rows.append(','.join(row).split('|++|'))
# NOTE: the first two columns share the label 'id'; later cells select
# columns by position, so the duplicate label is left unchanged here.
data = pd.DataFrame(rows, columns=['id', 'id', 'subjectName', 'topicName', 'chapterName', 'levelDescription', 'Question', 'Answer'])

In [3]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')

In [4]:
# Keep only the subject (position 2) and question (position 6) columns.
# Selection is positional because two columns share the label 'id'.
data = data.iloc[:, [2, 6]]

In [5]:
# Shuffle the rows and keep at most 600 of them.
# DataFrame.sample draws without replacement and raises ValueError when n is
# larger than the number of rows, so n is capped at the frame's length.
# A fixed random_state makes the subsample reproducible across kernel restarts.
data = data.sample(n=min(600, len(data)), random_state=42)

In [None]:
#---------------Classifier--------------

In [None]:
import spacy
from spacy.training import Example
from spacy.tokens import DocBin

In [None]:
# Create a blank English spaCy pipeline (no pretrained components are loaded;
# the text categorizer is added and trained in the cells below).
nlp = spacy.blank("en")

In [None]:
# The label set is every distinct subject name present in the data.
categories = data.loc[:, 'subjectName'].unique()

In [None]:
# Add a "textcat" component to the pipeline and keep a handle to it so that
# labels can be registered on it below.
textcat = nlp.add_pipe("textcat")

In [None]:
# Register every subject name as a label of the text categorizer.
for subject in categories:
    textcat.add_label(subject)

In [6]:
# Train/test split: hold out 20% of the rows for evaluation.
# random_state is fixed so the same rows are held out on every run, which
# keeps the reported accuracy comparable between runs.
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Convert the training split to spaCy's binary format.
# Only rows from `train` are used: iterating over the full `data` frame would
# also feed the held-out `test` rows to the model and inflate the accuracy
# measured on them later.
doc_bin = DocBin()
for i, row in train.iterrows():
    text = row['Question']
    # One boolean per known category; only the row's own subject is True.
    labels = {category: False for category in categories}
    labels[row['subjectName']] = True
    example = Example.from_dict(nlp.make_doc(text), {"cats": labels})
    # The reference doc carries the gold category annotations.
    doc_bin.add(example.reference)

In [None]:
# Prepare the training examples for the text categorizer.
from spacy.util import minibatch, compounding

# Read the docs back out of the DocBin, then wrap each one in an Example
# whose gold annotation is the category dict stored on the doc itself.
train_data = list(doc_bin.get_docs(nlp.vocab))
train_examples = [
    Example.from_dict(stored_doc, {"cats": stored_doc.cats})
    for stored_doc in train_data
]

In [None]:
import random

# Training hyper-parameters.
n_iter = 10        # number of passes over the training examples
batch_size = 4     # starting batch size for the compounding schedule
learn_rate = 0.001  # NOTE(review): never passed to the optimizer in this cell — confirm intent
dropout = 0.2      # dropout rate handed to nlp.update

# Initialize the pipeline from the training examples; returns the optimizer.
optimizer = nlp.initialize(lambda: train_examples)
for i in range(n_iter):
    # Reshuffle every epoch so the batches are not composed of the same
    # examples in the same order on each pass.
    random.shuffle(train_examples)
    losses = {}
    # Batch size starts at batch_size and grows by a factor of 1.001 per
    # batch, capped at 32.
    batches = minibatch(train_examples, size=compounding(batch_size, 32, 1.001))
    for batch in batches:
        nlp.update(batch, sgd=optimizer, drop=dropout, losses=losses)
    print("Iteration:", i, "Loss:", losses)

In [None]:
# Testing the text categorizer: pull the held-out questions and their true
# subjects out of the test split as two parallel lists.
test_data = test["Question"].tolist()
test_labels = test['subjectName'].tolist()

In [None]:
# For every test question, run the pipeline and keep the category with the
# highest score.
predicted = [
    max(scores, key=scores.get)
    for scores in (nlp(question).cats for question in test_data)
]

In [None]:
# Evaluation: accuracy = share of test questions whose predicted subject
# matches the true one.
score = sum(
    1 for actual, predict in zip(test_labels, predicted) if actual == predict
)
print(score / len(test_labels))