In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stopwords.words('english')
import string
string.punctuation
from nltk.stem.porter import PorterStemmer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import callbacks
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,GlobalAveragePooling1D
import matplotlib.pyplot as plt

In [None]:
# Load the subject/question dataset from its CSV file and display it.
DATA_PATH = '/content/subjects-questions.csv'
df = pd.read_csv(DATA_PATH)
df


In [None]:
# Distinct subject labels present in the data.
df['Subject'].unique()

In [None]:
# Number of questions per subject label.
df['Subject'].value_counts()

In [None]:
# Count of missing values in each column.
df.isna().sum()


In [None]:
# Class-balance check: one bar per subject label.
plt.figure(figsize = (8, 8))
# Name the column by keyword. Recent seaborn releases treat the first positional
# argument as `data`, so a bare Series passed positionally is not read as the
# x variable.
sns.countplot(data = df, x = 'Subject')

In [None]:
# Preview the first rows of the loaded frame.
df.head()


In [None]:
# One-hot encode the subject label: each distinct subject becomes its own
# indicator column, and the original 'Subject' column is dropped.
category = pd.get_dummies(df['Subject'])
df = pd.concat([df, category], axis = 'columns').drop(columns = ['Subject'])

df

In [None]:
# Preview the frame after 'Subject' was replaced by its one-hot columns.
df.head()

In [None]:
# Stop-word sets keyed by language, filled on first use so the NLTK corpus is
# read once rather than once per row when the function is used with `.apply`.
_STOPWORD_CACHE = {}


def remove_Stopwords(text):
    """Lower-case `text`, tokenize it, and drop English stop words.

    Args:
        text: The raw string to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stop_words = _STOPWORD_CACHE.get('english')
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        _STOPWORD_CACHE['english'] = stop_words

    words = word_tokenize(text.lower())
    return " ".join(w for w in words if w not in stop_words)


def lemmatize_text(text):
    """Lemmatize every token of `text` with WordNet.

    The text is first split into sentences, each sentence is split into word
    tokens, and every token is passed through `WordNetLemmatizer.lemmatize`.

    Args:
        text: The string to lemmatize.

    Returns:
        All lemmatized tokens, in order, joined by single spaces.
    """
    wnl = WordNetLemmatizer()
    lemmas = [
        wnl.lemmatize(token)
        for sent in sent_tokenize(text)
        for token in word_tokenize(sent)
    ]
    return ' '.join(lemmas)

def clean_text(text):
    """Strip punctuation, collapse whitespace, and lower-case `text`.

    Every character listed in `string.punctuation` is deleted (not replaced by
    a space), runs of whitespace are collapsed to a single space, and leading
    or trailing whitespace is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    tokens = text.translate(strip_punct).split()
    return ' '.join(tokens).lower()

def stemSentence(text):
    """Reduce every token of `text` to its Porter stem.

    Args:
        text: The string to stem.

    Returns:
        The stems in order, each followed by one space. A non-empty result
        therefore ends with a trailing space; empty input gives "".
    """
    stemmer = PorterStemmer()
    return "".join(stemmer.stem(tok) + " " for tok in word_tokenize(text))

In [None]:
# Normalise the question text row by row, in a fixed order:
# stop-word removal -> lemmatisation -> punctuation stripping / lower-casing.
df['eng'] = (
    df['eng']
    .apply(remove_Stopwords)
    .apply(lemmatize_text)
    .apply(clean_text)
)
df

In [None]:
# Final text step: replace every cleaned question with its stemmed form.
df['eng'] = df['eng'].map(stemSentence)


In [None]:
# Preview the frame after the 'eng' column has been cleaned and stemmed.
df.head()

In [None]:
# Longest processed question, measured in characters (`.str.len()` counts
# characters per string, not words).
length = df['eng'].str.len().max()
length

In [None]:
# Features are the processed question strings; targets are the four one-hot
# subject columns, in the order listed here.
subject_columns = ['Biology', 'Chemistry', 'Maths', 'Physics']
ques = df['eng'].values
subs = df[subject_columns].values
subs, ques

In [None]:
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
ques_train, ques_test, subs_train, subs_test = train_test_split(
    ques,
    subs,
    test_size = 0.2,
    random_state = 123,
)

In [None]:
# Shapes of the four split arrays: train/test questions, then train/test labels.
tuple(split.shape for split in (ques_train, ques_test, subs_train, subs_test))

In [None]:
# Build the vocabulary from the training questions only. Fitting on the test
# split as well would leak test-set vocabulary into the model; words that only
# occur in the test split are encoded as the '<OOV>' token instead.
# NOTE(review): `length` is computed earlier as the longest question in
# characters, so using it as `num_words` (a cap on vocabulary size) looks
# unintended — confirm the intended vocabulary limit.
tokenizer = Tokenizer(num_words = length, oov_token = '<OOV>')
tokenizer.fit_on_texts(ques_train)

# +1 because word indices start at 1; index 0 is what pad_sequences pads with.
vocab_size = len(tokenizer.word_index) + 1

# Encode both splits with the train-fitted vocabulary.
sequences_train = tokenizer.texts_to_sequences(ques_train)
sequences_test = tokenizer.texts_to_sequences(ques_test)

# Every question is padded or cut at the end to exactly 5 token ids.
padded_train = pad_sequences(sequences_train,
                             maxlen = 5,
                             padding = 'post',
                             truncating = 'post')
padded_test = pad_sequences(sequences_test,
                            maxlen = 5,
                            padding = 'post',
                            truncating = 'post')

print("Vocabulary size:", vocab_size)
padded_train

In [None]:
# Stacked-LSTM classifier:
#   token ids -> 100-d embeddings -> LSTM(128) -> LSTM(64) -> LSTM(32)
#   -> Dense(64, relu) -> Dense(32, relu) -> 4-way softmax.
# The first two LSTMs return the full sequence so the following LSTM receives
# one vector per timestep; the last LSTM returns only its final output.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100),
    LSTM(128, return_sequences=True),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(4, activation='softmax'),
])

In [None]:
# Categorical cross-entropy matches the one-hot label rows and softmax output.
model.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
from tensorflow.keras.callbacks import Callback, EarlyStopping

# Stop training once validation accuracy has gone 4 consecutive epochs without
# any improvement (min_delta = 0 means any increase counts as improvement).
earlystopping = EarlyStopping(
    monitor = 'val_accuracy',
    patience = 4,
    min_delta = 0,
    mode = 'auto',
    verbose = 1,
)

In [None]:
# Train with early stopping, validating on the held-out split after each epoch.
# `steps_per_epoch` / `validation_steps` are intentionally not passed: Keras
# documents them for dataset-style inputs, and with NumPy arrays they limited
# every epoch to 30 batches and every validation pass to 50 batches. Leaving
# them unset lets each epoch cover all of the supplied arrays.
history = model.fit(
    padded_train,
    subs_train,
    batch_size = 300,
    epochs = 100,
    validation_data = (padded_test, subs_test),
    callbacks = [earlystopping],
    verbose = 1,
)

In [None]:
# Classify a single hand-written question with the trained model.

# Sample custom question
custom_question = "area two similar triangl equal equilater b iso...\t"

# Apply the same cleaning functions, in the same order, that were applied to
# the 'eng' column before training. The raw text stays in `custom_question`
# so it can be shown next to the prediction.
custom_question_clean = remove_Stopwords(custom_question)
custom_question_clean = lemmatize_text(custom_question_clean)
custom_question_clean = clean_text(custom_question_clean)
custom_question_clean = stemSentence(custom_question_clean)

# Encode with the tokenizer exactly as it was fitted for training. It must not
# be re-fitted here: calling fit_on_texts() on a bare string iterates over its
# characters and updates the fitted vocabulary the model was trained against.
custom_sequence = tokenizer.texts_to_sequences([custom_question_clean])

# Same padding settings as the training inputs.
padded_cq = pad_sequences(custom_sequence, maxlen = 5, padding = 'post', truncating = 'post')

# Make predictions; the result has one row per input question.
predictions = model.predict(padded_cq)

# Class names in the same order as the one-hot label columns used for training
# ('Biology', 'Chemistry', 'Maths', 'Physics').
classes = ['biology', 'chemistry', 'maths', 'physics']
# Only one question was submitted, so read the scores of row 0.
predicted_class_index = int(np.argmax(predictions[0]))
predicted_class = classes[predicted_class_index]

# Display the results
print(f"Custom Question: {custom_question}")
print(predictions)
print(f"Predicted Class: {predicted_class}")
