In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [10]:
df = pd.read_csv(r'C:\Users\Sanket\Desktop/Job_project/Assig1 redo sutram/dataset/classification_dataset.csv')

In [12]:
# Parameters
vocab_size = 5000
max_length = 20  # max words per text snippet
oov_token = "<OOV>"

In [14]:
# Tokenizer
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(df['text'])

In [16]:

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(df['text'])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

In [18]:
# Encode labels
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(df['label'])
labels_categorical = to_categorical(labels_encoded)

In [20]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels_categorical, test_size=0.2, random_state=42)

In [22]:
# Info for later
print(f"Vocabulary size: {len(tokenizer.word_index)}")
print(f"Label mapping: {dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))}")
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Vocabulary size: 319
Label mapping: {'English': 0, 'History': 1, 'Math': 2, 'Science': 3}
Train shape: (64, 20), Test shape: (16, 20)


In [24]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam

In [26]:
# Define model parameters
embedding_dim = 64
rnn_units = 64
num_classes = y_train.shape[1]

In [30]:
# Build the RNN model
model = Sequential([
    Embedding(input_dim=5000, output_dim=embedding_dim),
    SimpleRNN(rnn_units),
    Dense(num_classes, activation='softmax')
])

In [32]:
# Compile model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [34]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, validation_split=0.2, batch_size=16)


Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 575ms/step - accuracy: 0.2624 - loss: 1.3768 - val_accuracy: 0.2308 - val_loss: 1.3718
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 133ms/step - accuracy: 0.5702 - loss: 1.2121 - val_accuracy: 0.3077 - val_loss: 1.3361
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 132ms/step - accuracy: 0.8038 - loss: 1.0089 - val_accuracy: 0.2308 - val_loss: 1.2845
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step - accuracy: 0.8659 - loss: 0.7698 - val_accuracy: 0.4615 - val_loss: 1.2214
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 157ms/step - accuracy: 0.8597 - loss: 0.5724 - val_accuracy: 0.6154 - val_loss: 1.1609
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 126ms/step - accuracy: 0.9697 - loss: 0.4615 - val_accuracy: 0.5385 - val_loss: 1.1533
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━

In [35]:
# Evaluate on test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"\n✅ Test Accuracy: {test_accuracy:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step - accuracy: 0.5000 - loss: 1.1476

✅ Test Accuracy: 0.5000


In [42]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Load science corpus
with open("C:/Users/Sanket/Desktop/Job_project/Assig1 redo sutram/dataset/science_corpus.txt", "r") as f:
    corpus = f.read().lower()

# Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])

# Convert text to sequences
sequences = tokenizer.texts_to_sequences([corpus])[0]

# Prepare sequences of 5 words and their next word
sequence_length = 5
X = []
y = []

for i in range(sequence_length, len(sequences)):
    X.append(sequences[i-sequence_length:i])  # input sequence of 5 words
    y.append(sequences[i])  # target: next word

X = np.array(X)
y = np.array(y)

# One-hot encode the target labels
y = to_categorical(y, num_classes=len(tokenizer.word_index) + 1)

# Model input shape
print(f"X shape: {X.shape}, y shape: {y.shape}")

# Save tokenizer for later use in the model
import pickle
with open("tokenizer.pickle", "wb") as f:
    pickle.dump(tokenizer, f)


X shape: (486, 5), y shape: (486, 270)


In [46]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam

# Define model parameters
embedding_dim = 64
rnn_units = 128
sequence_length = 5  # Number of words in input sequence
vocab_size = len(tokenizer.word_index) + 1  # Size of the vocabulary (unique words)

# Build the RNN model for next word prediction
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    SimpleRNN(rnn_units),
    Dense(vocab_size, activation='softmax')  # Output layer for next word prediction
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X, y, epochs=20, batch_size=64, validation_split=0.2)

# Save the model
model.save('next_word_model.h5')


Epoch 1/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 262ms/step - accuracy: 0.0054 - loss: 5.5959 - val_accuracy: 0.0306 - val_loss: 5.5918
Epoch 2/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.1377 - loss: 5.5196 - val_accuracy: 0.0714 - val_loss: 5.5825
Epoch 3/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.1203 - loss: 5.4158 - val_accuracy: 0.0714 - val_loss: 5.5719
Epoch 4/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.0933 - loss: 5.1768 - val_accuracy: 0.0714 - val_loss: 5.7345
Epoch 5/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.0922 - loss: 4.9012 - val_accuracy: 0.0714 - val_loss: 6.0243
Epoch 6/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.0893 - loss: 4.7534 - val_accuracy: 0.0714 - val_loss: 5.9663
Epoch 7/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━



In [48]:
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the trained model and tokenizer
model = load_model('next_word_model.h5')

# Load the tokenizer (to map word indices to actual words)
import pickle
with open("tokenizer.pickle", "rb") as f:
    tokenizer = pickle.load(f)

# Function to predict the next word(s) given a seed sequence
def predict_next_words(seed_text, next_words, model, tokenizer, sequence_length):
    output_text = seed_text
    for _ in range(next_words):
        # Convert the text to sequence of integers
        sequence = tokenizer.texts_to_sequences([output_text])[0]
        
        # Pad the sequence to ensure it has the correct shape
        sequence = pad_sequences([sequence], maxlen=sequence_length, padding='pre')
        
        # Predict the next word
        predicted_probabilities = model.predict(sequence, verbose=0)
        predicted_word_index = np.argmax(predicted_probabilities)
        
        # Get the word corresponding to the predicted index
        predicted_word = tokenizer.index_word[predicted_word_index]
        
        # Append the predicted word to the output
        output_text += " " + predicted_word
        
    return output_text

# Example: Provide an initial seed text and generate the next 20 words
seed_text = "photosynthesis is the"
predicted_text = predict_next_words(seed_text, 20, model, tokenizer, sequence_length=5)
print(f"Generated Text: {predicted_text}")




Generated Text: photosynthesis is the to the which of the vibrations and as blood the the in is and and the which to in and
