Importing the Libraries and Dataset

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the maths-problem dataset from a CSV path relative to the working directory.
# Later cells read its 'problem' column as the text and its last column as the target.
dataset = pd.read_csv("Maths_Problem_Category.csv")

Cleaning the Texts

In [3]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stop-word set and the stemmer once, outside the loop: re-reading
# the stop-word list for every single token is pure repeated work.
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Walk every row of the 'problem' column rather than a hard-coded row count,
# so the corpus always has exactly one entry per dataset row.
for problem in dataset['problem']:
    # Replace every non-letter with a space, lowercase, and split into tokens.
    problem = re.sub('[^a-zA-Z]', ' ', problem).lower().split()
    # Drop English stop words and reduce the remaining tokens to their Porter stem.
    problem = [ps.stem(word) for word in problem if word not in english_stopwords]
    corpus.append(' '.join(problem))

[nltk_data] Downloading package stopwords to C:\Users\Sudhanshu
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Inspect the cleaned, stemmed texts; note this prints the entire corpus list.
print(corpus)

['ring toss game carniv marco throw ring ring worth point worth point marco score total point mani point ring point ring marco toss', 'interact student follow step creat exploratori area model', 'follow direct creat equival fraction denomin interact student follow step creat exploratori area model', 'solv follow system substitut x x', 'solv follow system substitut x x', 'estim quotient', 'olivia went beauti store buy p bottl nail polish bottl nail polish cost much spend total bottl nail polish write express use p', 'jeff marbl collect robert time mani marbl jeff mani marbl robert write express use', 'paxton marbl clean room found marbl bed mani total marbl paxton write express', 'estim product', 'interact student follow step creat exploratori area model', 'violet eri take surf lesson violet pay lesson per hour rent surfboard eri pay lesson per hour rent surfboard write two equat slope intercept form repres person total cost term number hour x', 'violet eri take surf lesson violet pay l

Creating a Bag of Words

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model: each row of X holds the token counts for one cleaned
# problem text, restricted to at most 550 vocabulary terms (max_features).
cv = CountVectorizer(max_features=550)
X = cv.fit_transform(corpus).toarray()

# The target is the last column of the dataset. Slice it to the number of
# documents in the corpus so X and y always have the same number of rows,
# even if the corpus was built from only the leading rows of the dataset.
y = dataset.iloc[:len(corpus), -1].values

Splitting the Dataset into Training and Test Sets

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for evaluation; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

Training the Random Forest Classifier

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 20 trees using the entropy split criterion; the fixed
# random_state keeps the fitted model reproducible.
classifier = RandomForestClassifier(
    n_estimators=20,
    criterion='entropy',
    random_state=0,
)
classifier.fit(X_train, y_train)

Predicting the Test Set Results

In [8]:
# Predict a category for every held-out sample.
y_pred = classifier.predict(X_test)

# Show predictions (left column) next to the true labels (right column).
print(np.column_stack((y_pred, y_test)))

[[12 12]
 [24 24]
 [ 6  6]
 [ 7 21]
 [ 1  1]
 [12 12]
 [ 3  3]
 [10 10]
 [ 3  3]
 [ 8  8]
 [ 5  5]
 [15 15]
 [18 18]
 [ 5  2]
 [18 18]
 [ 7  7]
 [19 19]
 [20 20]
 [ 9  9]
 [24 24]
 [21 21]
 [ 5  2]
 [20 20]
 [ 6  6]
 [ 9  9]
 [11 11]
 [ 8  8]
 [ 8  8]
 [ 4  4]
 [24 24]
 [22 22]
 [11 11]
 [14 13]
 [14 13]
 [ 1  2]
 [11 11]
 [16 16]
 [24 24]
 [ 5  5]
 [16 16]
 [10 16]
 [21 18]
 [23 23]
 [17 17]
 [ 7  7]
 [14 14]
 [14 14]
 [17 17]
 [20 20]
 [24 24]
 [24 24]
 [15 15]
 [16 16]
 [18 18]
 [23 23]
 [21 19]
 [21  7]
 [14 13]
 [ 7  7]
 [ 8  8]
 [ 9  9]
 [ 2  2]
 [10 10]]


Checking the Accuracy

In [9]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Compute both metrics up front, then report them.
# Confusion matrix: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(cm)
print("\n \n")
print("The accuracy of the model is: ", accuracy)


[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 3 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0