In [None]:
!pip install np_utils

In [None]:
import numpy as np
import re
from keras.layers import Dense, LSTM, Input, Embedding, Dropout
from keras.models import Model, Sequential
from keras.optimizers import RMSprop
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import LambdaCallback
#import np_utils
from keras.models import load_model, save_model
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

In [None]:
# --- Run configuration ---
# Input files: the reference character set and the training corpus.
filename_charachters = "/kaggle/input/ascii-printable/ascii_printable_characters.txt"
filename = "/kaggle/input/1m-set/passwords.txt"

# Number of consecutive characters in each input window.
seq_length = 20

# Workflow switches: load a previously saved model instead of building one,
# and whether the guarded training cell should run.
load_saved_model = False
train_model = True


In [None]:
# Build the model vocabulary from the reference file of allowed characters.
with open(filename_charachters, encoding='utf-8') as f:
    # Strip line terminators only. A bare .strip() would also remove a leading
    # or trailing space, yet the space character (ASCII 32) survives the corpus
    # cleaning step and therefore has to stay in the vocabulary; otherwise
    # encoding the corpus raises KeyError on the first space.
    # NOTE: if this changes total_chars, models saved with the old vocabulary
    # size are no longer shape-compatible.
    text_chars = f.read().strip('\r\n')

# Sorted so that the character <-> index mapping is deterministic across runs.
chars = sorted(set(text_chars))
total_chars = len(chars)
print("Unique characters in the text:", total_chars)

# Forward and reverse lookup tables used for one-hot encoding and decoding.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))


In [None]:
# Read the whole corpus file into one string, trimming surrounding whitespace.
with open(filename, mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read().strip()

In [None]:
def remove_non_ascii_printable(file_path, output_path=None):
    """
    Remove every byte outside the printable ASCII range (32-126) from a file.

    The file is processed as raw bytes rather than decoded text, so input that
    is not valid in the platform's default encoding is cleaned instead of
    aborting with a decode error. Line breaks and tabs are outside 32-126 and
    are therefore removed as well: the result is one continuous run of
    printable characters.

    :param file_path: Path to the text file to be processed.
    :param output_path: Where to write the filtered data. Defaults to
        ``file_path`` (overwrite in place); pass a different path when the
        input location is read-only.
    :return: True if the file was processed successfully, False otherwise.
    """
    # Every byte value that is NOT in the printable ASCII range 32..126.
    non_printable = bytes(b for b in range(256) if not 32 <= b <= 126)
    target_path = file_path if output_path is None else output_path

    try:
        with open(file_path, 'rb') as file:
            raw = file.read()

        # bytes.translate with table=None only deletes the listed bytes.
        filtered = raw.translate(None, non_printable)

        with open(target_path, 'wb') as file:
            file.write(filtered)

        return True
    except Exception as e:
        # Deliberate best-effort contract: report the problem and signal
        # failure through the return value instead of raising.
        print(f"An error occurred: {e}")
        return False

# Clean the corpus file in place and report the outcome.
result = remove_non_ascii_printable(filename)
print(
    "Non-ASCII printable characters have been removed."
    if result
    else "An error occurred during processing."
)



In [None]:
# Reload `text` after the cleaning cell so it reflects the file as it is on
# disk now (rewritten only if the cleaning step reported success).
with open(filename, mode='r', encoding='utf-8') as cleaned_file:
    text = cleaned_file.read().strip()

In [None]:
# One-hot encode every sliding window of `seq_length` characters (X) together
# with the character that immediately follows it (y).
n_samples = len(text) - seq_length
X = np.zeros((n_samples, seq_length, total_chars), dtype=bool)
y = np.zeros((n_samples, total_chars), dtype=bool)

# Map each character to its vocabulary index exactly once, instead of
# repeating the dictionary lookup for every window that contains it.
encoded = np.array([char_indices[char] for char in text], dtype=np.intp)

# Window i holds text[i + t] at position t, so time step t of X is just
# `encoded` shifted by t. Filling one time step at a time with array indexing
# replaces len(text) * seq_length Python-level iterations with seq_length
# vectorized assignments.
rows = np.arange(n_samples)
for t in range(seq_length):
    X[rows, t, encoded[t:t + n_samples]] = True

# Target for window i is the character right after it: text[i + seq_length].
y[rows, encoded[seq_length:]] = True


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out part of the encoded windows for validation.
# `train_size` is the fraction kept for training (80%); the fixed
# random_state makes the shuffled split reproducible.
train_size = 0.8

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=train_size,
    random_state=42,
)


In [None]:
def sample(preds, temperature=1.0):
    """
    Draw one class index from a probability vector reshaped by a temperature.

    :param preds: 1-D sequence of class probabilities (e.g. a softmax output).
    :param temperature: Scaling applied to the log-probabilities. Values
        below 1 sharpen the distribution, values above 1 flatten it. A value
        <= 0 picks the most likely class deterministically.
    :return: Index of the sampled class.
    """
    preds = np.asarray(preds).astype('float64')

    # Temperature 0 is the greedy limit; dividing by it would produce inf/nan.
    if temperature <= 0:
        return np.argmax(preds)

    # log(0) = -inf is the correct log-probability of an impossible class,
    # so silence the divide-by-zero warning rather than altering the values.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift by the maximum before exponentiating. Softmax is unchanged by the
    # shift, but the largest term becomes exp(0) = 1, so a small temperature
    # can no longer underflow every term to 0 and yield 0/0 = nan.
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate_text(length, diversity):
    """
    Generate text one character at a time from the trained model.

    A random `seq_length`-character slice of `text` is used as the seed. At
    each step the current window is one-hot encoded, the model predicts the
    next-character distribution, and `sample` draws from it using `diversity`
    as the temperature. The window then slides forward by one character.

    :param length: Number of characters to generate after the seed.
    :param diversity: Temperature forwarded to `sample`.
    :return: The seed followed by the `length` generated characters.
    """
    seed_start = np.random.randint(0, len(text) - seq_length - 1)
    window = text[seed_start: seed_start + seq_length]
    pieces = [window]

    for _ in range(length):
        # One-hot encode the current window as a batch of size 1.
        x_pred = np.zeros((1, seq_length, total_chars))
        for position, symbol in enumerate(window):
            x_pred[0, position, char_indices[symbol]] = 1.

        distribution = model.predict(x_pred, verbose=0)[0]
        next_char = indices_char[sample(distribution, diversity)]

        # Drop the oldest character and append the new one.
        window = window[1:] + next_char
        pieces.append(next_char)

    return ''.join(pieces)


In [None]:
# Either build a fresh network or restore a previously saved one.
if not load_saved_model:
    # Three stacked LSTM layers over one-hot character windows, followed by a
    # dense head that outputs a probability for every vocabulary character.
    model = Sequential([
        LSTM(256, input_shape=(seq_length, total_chars), return_sequences=True),
        LSTM(512, return_sequences=True),
        LSTM(512),
        Dropout(0.2),
        Dense(512, activation='relu'),
        Dense(total_chars, activation='softmax'),
    ])
    optimizer = Adam(learning_rate=0.002)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy')
else:
    model = load_model('/content/drive/MyDrive/model-saveLSTM.h5')

model.summary()

In [None]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Early stopping: end training once validation loss stops improving and roll
# back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',         # Quantity watched for improvement
    patience=5,                 # Epochs without improvement before stopping
    verbose=1,
    restore_best_weights=True   # Restore weights from the best epoch
)

# Learning-rate schedule: shrink the rate slightly when validation loss plateaus.
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',  # Quantity watched for improvement
    factor=0.97,         # new_lr = lr * factor
    patience=3,          # Epochs without improvement before reducing the rate
    verbose=1,
    min_lr=0.0001        # Lower bound on the learning rate
)

# Train on the training split only. Fitting on the full (X, y) would include
# the validation samples in the training data, so val_loss would no longer
# measure generalization and both callbacks above would react to a
# contaminated signal.
history = model.fit(
    X_train, y_train,
    epochs=50,  # Upper bound; early stopping may end training sooner
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, lr_scheduler]
)


In [None]:
if train_model:
    # Fixed-length training run on the full data set (no validation, no callbacks).
    history = model.fit(X, y, batch_size=32, epochs=35)

    # Persist the trained network.
    model_save_path = '/kaggle/working/universal-ascii-model.h5'
    model.save(model_save_path)
    print(f"Model saved to {model_save_path}")

    # Plot the per-epoch training loss, save it to disk, then display it.
    loss_plot_path = '/kaggle/working/loss.png'
    fig, ax = plt.subplots()
    ax.plot(history.history['loss'])
    ax.set_title('Model Loss Over Epochs')
    ax.set_ylabel('Loss')
    ax.set_xlabel('Epoch')
    fig.savefig(loss_plot_path)
    print(f"Loss plot saved to {loss_plot_path}")
    plt.show()


In [None]:
# Preview: a short sample drawn at a high temperature (diversity=4).
generated_text = generate_text(length=200, diversity=4)
print(generated_text)

# Write a longer sample, drawn at a lower temperature, to a text file.
with open('LSTM-output.txt', mode='w') as out_file:
    out_file.write(generate_text(length=1000, diversity=0.5))
