In [1]:
import spacy
import pandas as pd
from spacy.training import Example
from spacy.tokens import DocBin
#import random

In [2]:
# Load the Spacy English language model
nlp = spacy.blank("en")

In [3]:
# Load the dataset
data = pd.read_csv('Dataset2.csv', sep='\t')
data.head()

Unnamed: 0,Topic,Question
0,Interest,Find the SI for a sum of Rs.1200 at 5% pa for ...
1,Interest,"If Rs.4000 becomes Rs.4560 in 2 years time, wh..."
2,Interest,What principal would amount to Rs.8880 in 4 yr...
3,Interest,What sum would amount to Rs.26620 in 3 years a...
4,Interest,If a certain sum doubles itself in 3 yrs under...


In [4]:
# Define the categories
categories = data['Topic'].unique()

In [5]:
# Create the text categorizer pipe for the Spacy model
textcat = nlp.add_pipe("textcat")

In [6]:
# Add the categories to the text categorizer
for category in categories:
    textcat.add_label(category)

In [7]:
#Train-test splitting Data
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2)

In [8]:
# Convert the training data to binary format
doc_bin = DocBin()
for i, row in data.iterrows():
    text = row['Question']
    labels = {category: False for category in categories}
    labels[row['Topic']] = True
    example = Example.from_dict(nlp.make_doc(text), {"cats": labels})
    doc_bin.add(example.reference)

In [9]:
# Train the text categorizer
from spacy.util import minibatch, compounding
train_data = list(doc_bin.get_docs(nlp.vocab))
train_examples = []
for doc in train_data:
    labels = doc.cats
    example = Example.from_dict(doc, {"cats": labels})
    train_examples.append(example)

In [10]:
n_iter = 10
batch_size = 4
learn_rate = 0.001
dropout = 0.2
optimizer = nlp.initialize(lambda: train_examples)
for i in range(n_iter):
    # random.shuffle(train_examples)
    losses = {}
    batches = minibatch(train_examples, size =compounding(batch_size, 32, 1.001))
    for batch in batches:
        nlp.update(batch, sgd=optimizer, drop=dropout, losses=losses)
    print("Iteration:", i, "Loss:", losses)

Iteration: 0 Loss: {'textcat': 24.101316671945824}
Iteration: 1 Loss: {'textcat': 20.869784751361294}
Iteration: 2 Loss: {'textcat': 16.08329890866264}
Iteration: 3 Loss: {'textcat': 11.779037682756563}
Iteration: 4 Loss: {'textcat': 7.708939827541291}
Iteration: 5 Loss: {'textcat': 6.1071826527754745}
Iteration: 6 Loss: {'textcat': 4.097899364912303}
Iteration: 7 Loss: {'textcat': 3.309834903419296}
Iteration: 8 Loss: {'textcat': 2.666326805570109}
Iteration: 9 Loss: {'textcat': 2.001630595595283}


In [18]:
#Testing the text-categorizer
test_data, test_labels = [], []
for i, row in test.iterrows():
    test_data.append(row["Question"])
    test_labels.append(row['Topic'])

In [19]:
predicted = []
for text in test_data:
    doc = nlp(text)
    predicted.append(max(doc.cats, key=doc.cats.get))

In [22]:
#Evaluation: Accuracy
score = 0
for actual, predict in zip(test_labels, predicted):
    #print(actual, predict)
    if actual == predict:
        score += 1
print(score / len(test_labels))

0.9551569506726457


In [10]:
# Test the text categorizer
test_data = ["A man rides his bicycle for 45 min to cover a distance of 21 km. Find the speed at which he drives his bicycle ?", "What is the highest mountain in the world?"]
for text in test_data:
    doc = nlp(text)
    print(text, doc.cats)

A man rides his bicycle for 45 min to cover a distance of 21 km. Find the speed at which he drives his bicycle ? {'Interest': 1.351735257060227e-08, 'Time and Distance': 0.9999959468841553, 'Time and Work': 3.957003173127305e-06, 'Profit & Loss': 9.940904766025938e-10, 'Partnerships': 1.230486823544652e-09, 'Averages': 7.564062798337545e-08, 'Ratios & Proportions': 5.551119119928671e-09, 'Probability': 2.9690188085851332e-08, 'Ages': 6.943558922278825e-11, 'Permutations & Combinations': 8.87945006411428e-09}
What is the highest mountain in the world? {'Interest': 0.09256020188331604, 'Time and Distance': 5.7360184655408375e-06, 'Time and Work': 0.020992867648601532, 'Profit & Loss': 0.3195217549800873, 'Partnerships': 1.347855118183361e-06, 'Averages': 8.777660696068779e-06, 'Ratios & Proportions': 0.01123235933482647, 'Probability': 0.5501468181610107, 'Ages': 0.005297920200973749, 'Permutations & Combinations': 0.00023231450177263469}


In [None]:
test_data = ["A man rides his bicycle for 45 min to cover a distance of 21 km. Find the speed at which he drives his bicycle ?", "From a group of 7 men and 6 women, five persons are to be selected to form a committee so that at least 3 men are there on the committee. In how many ways can it be done?"]
for idx, text in enumerate(test_data):
    doc = nlp(text)
    print(f"Question-{idx+1}: {text}")
    print()
    pred_topic = max(doc.cats, key=doc.cats.get)
    print(f"Prediction: {pred_topic}")
    print()
    print("="*150)
    print()