<a href="https://colab.research.google.com/github/SnkhchyanV/NLP/blob/main/TextClsf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from google.colab import drive
import requests
import io
from sklearn.model_selection import train_test_split

import gensim
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import pandas as pd
data_path = '/content/drive/MyDrive/DataSets/TextDatasets/subjects-questions.csv'


data = pd.read_csv(data_path)
data.head()

Unnamed: 0,eng,Subject
0,An anti-forest measure is\nA. Afforestation\nB...,Biology
1,"Among the following organic acids, the acid pr...",Chemistry
2,If the area of two similar triangles are equal...,Maths
3,"In recent year, there has been a growing\nconc...",Biology
4,Which of the following statement\nregarding tr...,Physics


In [4]:
labels = pd.get_dummies(data.Subject)
data = pd.concat([data.eng, labels],axis=1)
data.head()

Unnamed: 0,eng,Biology,Chemistry,Maths,Physics
0,An anti-forest measure is\nA. Afforestation\nB...,1,0,0,0
1,"Among the following organic acids, the acid pr...",0,1,0,0
2,If the area of two similar triangles are equal...,0,0,1,0
3,"In recent year, there has been a growing\nconc...",1,0,0,0
4,Which of the following statement\nregarding tr...,0,0,0,1


In [5]:
text = data['eng'].values
labels = data[['Biology','Chemistry', 'Maths', 'Physics']].values

In [6]:
text_train, text_test, labels_train, labels_text = train_test_split(text, labels, test_size = 0.1)

In [7]:
## Text preprocesssing, make all character lower, and delete the characters in the list
def sentence_preprocessing(sentences):
    translation_table = str.maketrans('', '', "«»()+-=-,՝.․։՜՛֊՟՚\\!")
    sentences = [sentence.lower().translate(translation_table) for sentence in sentences]
    return sentences


In [8]:
sentences = sentence_preprocessing(text_train)


In [9]:
for i in range(10):
  print(sentences[i],"\t",labels_train[i])


which of the following is a mixed oxide?
a  f e_{2} o_{3} 
в  p b o_{2} 
 mathbf{c} cdot b a o_{2} 
 mathbf{d} cdot p b_{3} o_{4}  	 [0 1 0 0]
the second's hand of a watch has  6 mathrm{cm} 
length the speed of tis tip and magnitude of difference in velocity of its at any two perpendicular positions will be respectively:
 mathbf{a} cdot 2 pi  and  0 mathrm{mm} / mathrm{s} 
b  2 sqrt{2} pi  and  444 mathrm{mm} / mathrm{s} 
 mathrm{c} cdot 2 sqrt{2} pi  and  2 pi mathrm{mm} / mathrm{s} 
d  2 pi  and  2 sqrt{3} pi mathrm{mm} / mathrm{s}  	 [0 0 0 1]
two zinc rods are placed in contact with
dil  h n o_{3}  and conc  h n o_{3}  in two
separate containers a and b the characteristics of gases evolved in the containers are respectively:
a the gas in container a turns  k_{2} c r_{2} o_{7}  green and the gas in container b turns lead acetate black
b a  rightarrow  pungent smelling gas b  rightarrow  colourless odourless gass
 mathrm{c} cdot mathrm{a} rightarrow operatorname{inflammable} operator

In [10]:
list_of_all_words = []
for sentence in sentences:
  sentence_s = sentence.split()
  list_of_tokens = []
  for word in sentence_s:
    list_of_tokens.append(word)
  list_of_all_words.append(list_of_tokens)


In [11]:
print(list_of_all_words[:10])

[['which', 'of', 'the', 'following', 'is', 'a', 'mixed', 'oxide?', 'a', 'f', 'e_{2}', 'o_{3}', 'в', 'p', 'b', 'o_{2}', 'mathbf{c}', 'cdot', 'b', 'a', 'o_{2}', 'mathbf{d}', 'cdot', 'p', 'b_{3}', 'o_{4}'], ['the', "second's", 'hand', 'of', 'a', 'watch', 'has', '6', 'mathrm{cm}', 'length', 'the', 'speed', 'of', 'tis', 'tip', 'and', 'magnitude', 'of', 'difference', 'in', 'velocity', 'of', 'its', 'at', 'any', 'two', 'perpendicular', 'positions', 'will', 'be', 'respectively:', 'mathbf{a}', 'cdot', '2', 'pi', 'and', '0', 'mathrm{mm}', '/', 'mathrm{s}', 'b', '2', 'sqrt{2}', 'pi', 'and', '444', 'mathrm{mm}', '/', 'mathrm{s}', 'mathrm{c}', 'cdot', '2', 'sqrt{2}', 'pi', 'and', '2', 'pi', 'mathrm{mm}', '/', 'mathrm{s}', 'd', '2', 'pi', 'and', '2', 'sqrt{3}', 'pi', 'mathrm{mm}', '/', 'mathrm{s}'], ['two', 'zinc', 'rods', 'are', 'placed', 'in', 'contact', 'with', 'dil', 'h', 'n', 'o_{3}', 'and', 'conc', 'h', 'n', 'o_{3}', 'in', 'two', 'separate', 'containers', 'a', 'and', 'b', 'the', 'characteristic

In [12]:
dct = gensim.corpora.Dictionary(list_of_all_words)

word2idx = dct.token2id
idx2word = [dct[i] for i in range(len(dct.token2id))]
idx2word = ['PAD', 'UNK'] + idx2word  # Add a token for unknown words and padding
word2idx = {w: i for i, w in enumerate(idx2word)}

print(f'Unique words: {len(idx2word)}')
print(idx2word[:30])

with open('idx2word.txt', 'w', encoding='utf-8') as idx2word_file:
  idx2word_file.write('\n'.join(idx2word))

Unique words: 139869
['PAD', 'UNK', 'a', 'b', 'b_{3}', 'cdot', 'e_{2}', 'f', 'following', 'is', 'mathbf{c}', 'mathbf{d}', 'mixed', 'o_{2}', 'o_{3}', 'o_{4}', 'of', 'oxide?', 'p', 'the', 'which', 'в', '/', '0', '2', '444', '6', 'and', 'any', 'at']


In [13]:
print(dct)

Dictionary<139867 unique tokens: ['a', 'b', 'b_{3}', 'cdot', 'e_{2}']...>


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(vocabulary=idx2word)

# Calculate TF-IDF scores
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

# Define a threshold
threshold = 0.5

# Filter and delete low TF-IDF words
filtered_words = [word for word, tfidf_score in zip(idx2word, tfidf_matrix.max(axis=0).toarray()[0]) if tfidf_score >= threshold]




In [15]:

print(f'Unique words: {len(filtered_words)}')
print(filtered_words[:30])

idx2word = ['PAD', 'UNK'] + filtered_words

print(f'Unique words: {len(idx2word)}')
print(idx2word[:30])


Unique words: 13256
['cdot', 'following', 'mixed', 'of', 'the', 'and', 'any', 'at', 'difference', 'hand', 'has', 'in', 'its', 'length', 'magnitude', 'perpendicular', 'pi', 'speed', 'tip', 'tis', 'two', 'velocity', 'watch', 'will', 'acetate', 'all', 'black', 'characteristics', 'colourless', 'conc']
Unique words: 13258
['PAD', 'UNK', 'cdot', 'following', 'mixed', 'of', 'the', 'and', 'any', 'at', 'difference', 'hand', 'has', 'in', 'its', 'length', 'magnitude', 'perpendicular', 'pi', 'speed', 'tip', 'tis', 'two', 'velocity', 'watch', 'will', 'acetate', 'all', 'black', 'characteristics']


In [16]:
VOCAB_SIZE = 13185
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(idx2word)

In [17]:
print(sentences[1])
encoded_example = encoder(sentences[1])
print(encoded_example)

the second's hand of a watch has  6 mathrm{cm} 
length the speed of tis tip and magnitude of difference in velocity of its at any two perpendicular positions will be respectively:
 mathbf{a} cdot 2 pi  and  0 mathrm{mm} / mathrm{s} 
b  2 sqrt{2} pi  and  444 mathrm{mm} / mathrm{s} 
 mathrm{c} cdot 2 sqrt{2} pi  and  2 pi mathrm{mm} / mathrm{s} 
d  2 pi  and  2 sqrt{3} pi mathrm{mm} / mathrm{s} 
tf.Tensor(
[ 1708  2707  7161  4632 11665   848  7139     1     1  5920  1708  2293
  4632  1542  1544 11172  5627  4632  8851  6673  1011  4632  6338 10925
 11088  1264  4148     1   756     1  3044     1 10081     1  4041 11172
     1     1     1     1     1     1  4041 11172     1     1     1     1
 10081     1     1  4041 11172     1  4041     1     1     1     1  4041
 11172     1     1  4041     1     1], shape=(66,), dtype=int64)


In [18]:
print(encoder.get_vocabulary())

['', '[UNK]', 'question', '高广', '田田', '彥光', 'ルiva', 'กก', '२ᅵ', 'яа', 'чuestion', 'уочт', 'уочr', 'уоиг', 'туре', 'те', 'твр', 'ст', 'сс', 'сосс', 'соснс', 'сосн', 'соос', 'соон', 'соок', 'со', 'снон', 'сно', 'сн', 'см', 'си', 'сд', 'сансо', 'сам', 'сlсоон', 'сl', 'сfc', 'рос', 'рнз', 'рнву', 'ра', 'пт', 'ос', 'он', 'ова', 'ов', 'о931цн931ин', 'ну', 'нсоосн', 'нсоон', 'нсно', 'нсн', 'нс', 'нр', 'нос', 'нопо', 'но', 'не', 'нвт', 'нz', 'нp', 'нgон', 'нg', 'мпо4', 'мп', 'мо', 'мнz', 'мн', 'ми', 'меон', 'мв', 'мауbе', 'мау', 'маон', 'маdр', 'ма', 'мn', 'мineral', 'мgмg', 'мg', 'мd', 'ку', 'ксl', 'кон', 'кмпо', 'ка', 'кrf', 'кgf', 'еtон', 'еts', 'еdta', 'гос', 'го', 'вслз', 'вс', 'внс', 'вм', 'ве', 'вг', 'ва', 'вlue', 'атр', 'амр', 'ам', 'аи', 'аlме', 'аl', 'аin', 'zzx', 'zzww', 'zz', 'zymase', 'zygote', 'zy30', 'zy2', 'zxy', 'zxi', 'zx', 'zwitter', 'zw', 'zsin', 'zright', 'zrand', 'zr', 'zpi', 'zpansion', 'zones', 'zone', 'zomega', 'zns', 'zno', 'znc', 'zn', 'zlefta', 'zinç', 'zinc', 'zieg

In [19]:
model = tf.keras.Sequential([encoder, tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(4, activation = 'softmax')

])

In [20]:
import numpy as np

In [44]:
modeladdress = '/content/drive/MyDrive/SavedModels/Silsar/SilsarTextClsf'
model = tf.keras.models.load_model(modeladdress)

In [22]:
model.compile(loss='categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(0.01),
              metrics=['accuracy'])
model.fit(np.array(sentences), np.array(labels_train), epochs = 10, validation_split=0.2)
history = model.history

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
model.save(modeladdress)




In [28]:
print(text_test[1])

I.C.S.H. in male acts on
A. Cells of Leydig
B. Sertoli cells
c. Spermatids
D. spermatogonia


In [43]:
prediction = model.predict(text_test)




In [42]:
print(text_test[14])
print(np.argmax(np.array(prediction[14])))
print(np.argmax(labels_text[14]))


Two discs have same mass and
thickness. Their materials are of
densities \( \pi_{1} \) and \( \pi_{2} . \) The ratio of their
moment of inertia about central axis will be
\( \mathbf{A} \cdot \pi_{1}: \pi_{2} \)
в. \( \pi_{1} \pi_{2}: \) ।
\( \mathbf{D} \cdot \pi_{2}: \pi_{1} \)
3
3
