In [1]:
pip install tensorflow pandas scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load the dataset
data = pd.read_csv('encrypted_file.csv')

# Fill NaN values with an empty string or some default text
data['encrypted_hypothesis'] = data['encrypted_hypothesis'].fillna('')

# Ensure all values are strings (this also handles non-string types like floats)
data['encrypted_hypothesis'] = data['encrypted_hypothesis'].astype(str)

# Extract the ciphered text and encryption method columns
X = data['encrypted_hypothesis'].values
y = data['encryption_method'].values

# Encode the labels (encryption methods)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Tokenize the ciphered text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

# Determine the maximum sequence length dynamically
max_sequence_length = max([len(seq) for seq in X_sequences])

# Pad the sequences to ensure uniform input size
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42)

# Get the vocabulary size for the input layer
vocab_size = len(tokenizer.word_index) + 1

# Create the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.LSTM(128),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')  # Number of unique encryption methods
])

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy*100:.2f}%')

# Make predictions (optional)
predictions = model.predict(X_test[:5])
predicted_labels = label_encoder.inverse_transform(predictions.argmax(axis=1))

print(f'Predicted: {predicted_labels}')
print(f'Actual: {label_encoder.inverse_transform(y_test[:5])}')




Epoch 1/5




[1m6877/6877[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2576s[0m 374ms/step - accuracy: 0.6053 - loss: 0.8159 - val_accuracy: 0.6227 - val_loss: 0.7526
Epoch 2/5
[1m6877/6877[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2576s[0m 375ms/step - accuracy: 0.7372 - loss: 0.5095 - val_accuracy: 0.5761 - val_loss: 0.9733
Epoch 3/5
[1m6877/6877[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2563s[0m 373ms/step - accuracy: 0.9804 - loss: 0.0553 - val_accuracy: 0.5149 - val_loss: 2.8145
Epoch 4/5
[1m6877/6877[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2582s[0m 375ms/step - accuracy: 0.9995 - loss: 0.0018 - val_accuracy: 0.5148 - val_loss: 2.9404
Epoch 5/5
[1m6877/6877[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2558s[0m 372ms/step - accuracy: 0.9999 - loss: 2.6375e-04 - val_accuracy: 0.5159 - val_loss: 3.3413
[1m3439/3439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.5144 - loss: 3.3647
Test Accuracy: 51.59%
[1m1/1[0m [32m━━━━━━━━━━━━━━━