In [16]:
import pandas as pd
import numpy as np
import plotly.express as px
import re

In [17]:
# Load the labelled question corpus; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="subjects-questions.csv")
# Preview the first five rows.
data.head(5)

Unnamed: 0,eng,Subject
0,An anti-forest measure is\nA. Afforestation\nB...,Biology
1,"Among the following organic acids, the acid pr...",Chemistry
2,If the area of two similar triangles are equal...,Maths
3,"In recent year, there has been a growing\nconc...",Biology
4,Which of the following statement\nregarding tr...,Physics


In [18]:
# (rows, columns) of the loaded frame.
data.shape

(122519, 2)

In [19]:
# Distinct labels present in the target column.
data.Subject.unique()

array(['Biology', 'Chemistry', 'Maths', 'Physics'], dtype=object)

In [20]:
# Integer code for each subject label; the classifier is trained on these codes.
subject_dictionary = {"Biology": 1, "Chemistry": 2, "Maths": 3, "Physics": 4}

# Encode the labels. Values that are not keys of the dictionary (in particular the
# integer codes produced by a previous run of this cell) are passed through
# unchanged, so re-executing the cell no longer turns the whole column into NaN.
data['Subject'] = data['Subject'].map(lambda label: subject_dictionary.get(label, label))

# Sanity check: should list only the integer codes defined above.
data.Subject.unique()

array([1, 2, 3, 4])

In [21]:
# Class balance: how many questions fall under each encoded subject.
fig = px.histogram(data_frame=data, x="Subject", nbins=50)
fig.show()

In [22]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Preview the English stop-word list that text_cleaning() filters out.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be available locally
# (e.g. via nltk.download('stopwords')) — confirm on a fresh environment.
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [23]:
# Cleaning the text data

In [24]:
def text_cleaning():
    """Normalise the ``eng`` column of the notebook-level ``data`` frame in place.

    Steps, in order:
      1. lower-case the text;
      2. drop NLTK English stop words (tokens obtained by whitespace split);
      3. delete punctuation, then every character that is not an ASCII letter,
         digit or space (characters are removed, not replaced by a space, so
         "anti-forest" becomes "antiforest");
      4. replace every remaining non-letter with a space, drop stop words again
         and Porter-stem each surviving token.

    Returns:
        The same ``data`` DataFrame object, with ``eng`` overwritten by the
        cleaned text.

    Note:
        The function mutates ``data``. Calling it a second time re-processes the
        already cleaned text, so run it once per freshly loaded frame.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would be scanned linearly for every token of every question.
    stop = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    cleaned = data['eng'].str.lower()
    cleaned = cleaned.apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop)
    )
    # regex=True must be explicit: the implicit default is deprecated and, once it
    # flips to False, the pattern would be treated as a literal string and this
    # step would silently stop removing punctuation.
    cleaned = cleaned.str.replace(r'[^\w\s]+', '', regex=True)
    cleaned = cleaned.replace('[^a-zA-Z0-9 ]', '', regex=True)
    # Digits become token separators here; stop words are filtered a second time
    # because deleting punctuation above can create new stop-word tokens.
    cleaned = cleaned.apply(
        lambda x: " ".join(
            stemmer.stem(i)
            for i in re.sub("[^a-zA-Z]", " ", x).split()
            if i not in stop
        ).lower()
    )

    data['eng'] = cleaned
    return data

In [25]:
# Cleans data['eng'] in place; the returned frame is shown as this cell's output.
text_cleaning()


The default value of regex will change from True to False in a future version.



Unnamed: 0,eng,Subject
0,antiforest measur afforest b select graze c cl...,1
1,among follow organ acid acid present rancid bu...,2
2,area two similar triangl equal equilater b iso...,3
3,recent year grow concern gradual increas avera...,1
4,follow statement regard transform incorrect tr...,4
...,...,...
122514,follow group charact present chordat stage lif...,1
122515,light year light emit sun one year b time take...,4
122516,member dipnoi nativ india b africa australia c...,1
122517,one averagelif half activ nuclei decay b less ...,4


In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer


In [31]:
# Target: the encoded subject of every question.
sub = data.Subject

# Features: bag-of-words token counts learned from the cleaned question text.
count_vec = CountVectorizer()
text = count_vec.fit_transform(data.eng)


In [32]:


# Hold out 35 % of the questions for evaluation. A fixed random_state makes the
# split (and therefore the reported metrics) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    text, sub, test_size=0.35, random_state=42
)

# Seed the forest as well: bootstrap sampling and feature selection are random.
clf = RandomForestClassifier(random_state=42)

model = clf.fit(X_train, y_train)

# Predict once and reuse the result for both reports instead of scoring the
# held-out set twice.
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.93      0.74      0.82      4566
           2       0.86      0.93      0.89     13226
           3       0.95      0.96      0.96     11713
           4       0.93      0.91      0.92     13377

    accuracy                           0.91     42882
   macro avg       0.92      0.88      0.90     42882
weighted avg       0.91      0.91      0.91     42882

[[ 3376   906   127   157]
 [  188 12276   181   581]
 [   10   182 11301   220]
 [   53   846   344 12134]]
