In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from scipy.stats import entropy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from scipy.sparse import hstack

In [None]:
# Load the cipher dataset; later cells read its "Key", "Text" and "Type" columns.
DATASET_CSV = "/content/updated_large_meaningful_cipher_dataset.csv"
df = pd.read_csv(DATASET_CSV)

In [None]:
def calc_entropy(text):
    """Return the Shannon entropy of ``text`` in bits per character.

    The character frequencies of ``text`` are turned into a probability
    distribution whose base-2 entropy is returned.

    Args:
        text: Value to measure. Only non-empty ``str`` values are analysed.

    Returns:
        The entropy as a float, or ``0`` when ``text`` is not a string or is
        the empty string.
    """
    if isinstance(text, str) and text:
        total_chars = len(text)
        # One probability per distinct character.
        char_probs = np.array([hits / total_chars for hits in Counter(text).values()])
        return entropy(char_probs, base=2)
    return 0

# Per-row character statistics for the key and the ciphertext.
# Entropy is computed on the raw cell value (calc_entropy returns 0 for
# non-strings); length is measured on the value's string form.
key_column, cipher_column = df["Key"], df["Text"]
df["KeyEntropy"] = key_column.map(calc_entropy)
df["CipherEntropy"] = cipher_column.map(calc_entropy)
df["KeyLength"] = key_column.astype(str).map(len)
df["CipherLength"] = cipher_column.astype(str).map(len)

In [None]:
# Relative size and relative entropy of key versus ciphertext.
# The denominator is shifted by 1 so it can never be zero.
df["KeyToCipherRatio"] = df["KeyLength"].div(df["CipherLength"].add(1))
df["EntropyRatio"] = df["KeyEntropy"].div(df["CipherEntropy"].add(1))

In [None]:
# Encode the target labels as integers in df["Type"].
# The untouched labels are kept in "TypeLabel" and the encoder is always fitted
# from that column, so re-running this cell gives the same result. Without the
# copy, a second run would fit the encoder on the already-encoded integers and
# label_enc.classes_ would no longer hold the original label values.
if "TypeLabel" not in df.columns:
    df["TypeLabel"] = df["Type"]

label_enc = LabelEncoder()
df["Type"] = label_enc.fit_transform(df["TypeLabel"])

In [None]:
# Two independent TF-IDF vectorizers (one for keys, one for ciphertexts) built
# from the same settings: unigrams + bigrams, capped at 5000 features each.
tfidf_settings = {"max_features": 5000, "ngram_range": (1, 2)}
key_vectorizer = TfidfVectorizer(**tfidf_settings)
cipher_vectorizer = TfidfVectorizer(**tfidf_settings)

In [None]:
# Fit each vectorizer on its column (cast to str) and build the TF-IDF matrices.
# NOTE(review): the vectorizers are fitted on the whole dataset before the
# train/test split below, so test rows contribute to the vocabulary and IDF
# weights. Consider fitting on the training rows only.
key_documents = df["Key"].astype(str)
cipher_documents = df["Text"].astype(str)
key_features = key_vectorizer.fit_transform(key_documents)
cipher_features = cipher_vectorizer.fit_transform(cipher_documents)

In [None]:
# Numeric features appended after the TF-IDF blocks. predict_algorithm builds
# its feature row in this same column order.
meta_feature_columns = [
    "KeyLength",
    "CipherLength",
    "KeyEntropy",
    "CipherEntropy",
    "KeyToCipherRatio",
    "EntropyRatio",
]
extra_features = df[meta_feature_columns].to_numpy()

# Integer-encoded target labels.
y = df["Type"]

In [None]:
# Full design matrix: key TF-IDF | ciphertext TF-IDF | numeric features.
feature_blocks = [key_features, cipher_features, extra_features]
X = hstack(feature_blocks)

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Base learners for the voting ensemble.
# `use_label_encoder` was dropped: the installed XGBoost ignores it and only
# emits a 'Parameters: { "use_label_encoder" } are not used.' warning.
# All three learners now share the same explicit seed.
xgb_model = XGBClassifier(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=8,
    eval_metric="mlogloss",
    random_state=42,
)
rf_model = RandomForestClassifier(n_estimators=150, max_depth=20, random_state=42)
lgbm_model = LGBMClassifier(n_estimators=150, learning_rate=0.1, max_depth=8, random_state=42)

In [None]:
# Soft-voting ensemble: class probabilities are averaged using the weights
# below, so XGBoost and LightGBM each count three times as much as the forest.
weighted_estimators = [
    ("xgb", xgb_model, 3),
    ("rf", rf_model, 1),
    ("lgbm", lgbm_model, 3),
]
voting_clf = VotingClassifier(
    estimators=[(label, estimator) for label, estimator, _ in weighted_estimators],
    weights=[weight for _, _, weight in weighted_estimators],
    voting="soft",
)

In [None]:
# Train the ensemble on the training split.
voting_clf.fit(X=X_train, y=y_train)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.246482 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1803
[LightGBM] [Info] Number of data points in the train set: 144000, number of used features: 319
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759


In [None]:
# Score the fitted ensemble on the held-out split and report accuracy as a percentage.
voting_predictions = voting_clf.predict(X_test)
voting_accuracy = accuracy_score(y_true=y_test, y_pred=voting_predictions)
print(f"\n🔍 Voting Ensemble Accuracy: {voting_accuracy * 100:.2f}%\n")




🔍 Voting Ensemble Accuracy: 89.54%



In [None]:
# Per-model evaluation: fit each base learner on the training split and print
# its classification report for the test split.
# `classification_report` is already imported in the imports cell, so the
# duplicate import was dropped, and the three copy-pasted fit/predict/report
# stanzas are folded into one loop (same order, same printed titles).
report_predictions = {}
for report_title, base_model in (
    ("XGBoost Classification Report:", xgb_model),
    ("Random Forest Classification Report:", rf_model),
    ("LightGBM Classification Report:", lgbm_model),
):
    base_model.fit(X_train, y_train)
    report_predictions[report_title] = base_model.predict(X_test)
    print(report_title)
    print(
        classification_report(
            y_test,
            report_predictions[report_title],
            target_names=label_enc.classes_,
        )
    )

# Keep the original per-model prediction names available to other cells.
xgb_preds, rf_preds, lgbm_preds = report_predictions.values()

Parameters: { "use_label_encoder" } are not used.



XGBoost Classification Report:
              precision    recall  f1-score   support

        3DES       0.67      0.92      0.77      6000
         AES       0.81      0.56      0.66      6000
    ChaCha20       0.98      0.95      0.96      6000
         DES       0.99      1.00      0.99      6000
   Plaintext       1.00      1.00      1.00      6000
         RC4       1.00      0.93      0.97      6000

    accuracy                           0.90     36000
   macro avg       0.91      0.90      0.89     36000
weighted avg       0.91      0.90      0.89     36000

Random Forest Classification Report:
              precision    recall  f1-score   support

        3DES       0.49      0.68      0.57      6000
         AES       0.42      0.05      0.09      6000
    ChaCha20       0.63      0.95      0.76      6000
         DES       0.75      1.00      0.86      6000
   Plaintext       1.00      1.00      1.00      6000
         RC4       0.78      0.51      0.62      6000

    accur



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.243650 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1803
[LightGBM] [Info] Number of data points in the train set: 144000, number of used features: 319
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759




LightGBM Classification Report:
              precision    recall  f1-score   support

        3DES       0.68      0.88      0.77      6000
         AES       0.78      0.60      0.68      6000
    ChaCha20       0.98      0.95      0.96      6000
         DES       0.99      1.00      0.99      6000
   Plaintext       1.00      1.00      1.00      6000
         RC4       1.00      0.93      0.97      6000

    accuracy                           0.89     36000
   macro avg       0.90      0.89      0.89     36000
weighted avg       0.90      0.89      0.89     36000



In [None]:
def predict_algorithm(ciphertext, key):
    """Predict which algorithm produced ``ciphertext`` and print a report.

    Builds one feature row in the same layout as the training matrix (key
    TF-IDF, ciphertext TF-IDF, then six numeric features), scores it with
    ``voting_clf`` and prints the feature values, the predicted algorithm and
    the matching ``crypto_info`` entry.

    Args:
        ciphertext: Ciphertext string to classify.
        key: Key string that accompanies the ciphertext.

    Returns:
        None. Everything is printed.

    Note:
        Relies on the notebook globals ``key_vectorizer``,
        ``cipher_vectorizer``, ``calc_entropy``, ``voting_clf``, ``label_enc``
        and ``crypto_info``. ``crypto_info`` is not defined in the cells shown
        alongside this function and must exist before this is called.
    """
    # TF-IDF features from the vectorizers fitted during training
    key_input = key_vectorizer.transform([key])
    cipher_input = cipher_vectorizer.transform([ciphertext])

    # Numeric feature extraction
    key_length = len(key)
    cipher_length = len(ciphertext)
    key_entropy = calc_entropy(key)
    cipher_entropy = calc_entropy(ciphertext)
    # These must match the training-time formulas exactly, i.e.
    # KeyLength / (CipherLength + 1) and KeyEntropy / (CipherEntropy + 1).
    # Dividing by the raw values here would feed the model ratios on a
    # different scale from the ones it was trained on.
    key_to_cipher_ratio = key_length / (cipher_length + 1)
    entropy_ratio = key_entropy / (cipher_entropy + 1)

    # Show feature values
    print("\n📊 Feature Breakdown:")
    print(f"🔑 Key Length:            {key_length}")
    print(f"🔒 Ciphertext Length:     {cipher_length}")
    print(f"🔑 Key Entropy:           {key_entropy:.4f}")
    print(f"🔒 Ciphertext Entropy:    {cipher_entropy:.4f}")
    print(f"📏 Key/Cipher Ratio:      {key_to_cipher_ratio:.4f}")
    print(f"📏 Entropy Ratio:         {entropy_ratio:.4f}")

    # Combine all features in the same column order used for training
    meta_features = np.array([[key_length, cipher_length, key_entropy, cipher_entropy, key_to_cipher_ratio, entropy_ratio]])
    input_data = hstack([key_input, cipher_input, meta_features])

    # Score the single row with the ensemble. The most probable column is
    # mapped through classes_ instead of being used directly as the label.
    final_probs = voting_clf.predict_proba(input_data)
    best_column = int(np.argmax(final_probs, axis=1)[0])
    final_prediction = voting_clf.classes_[best_column]
    predicted_label = label_enc.inverse_transform([final_prediction])[0]

    # Get crypto info, with a generic fallback for labels that have no entry
    info = crypto_info.get(predicted_label, {
        "weakness": "No known weakness documented.",
        "recommendation": "Consider using AES or ChaCha20."
    })

    print(f"\n🔐 Predicted Encryption Algorithm: {predicted_label}")
    print(f"⚠️ Detected Weakness: {info['weakness']}")
    print(f"✅ Recommended Alternative: {info['recommendation']}")

In [None]:
# Interactive demo: read a ciphertext/key pair from the user and classify it.
ciphertext = input("Enter Ciphertext: ")
key = input("Enter Key: ")
predict_algorithm(ciphertext=ciphertext, key=key)

Enter Ciphertext: MDGwc/Fr9KJ2HgRMRZfnairhQbtW1tn36R7/7EMwawtuJecBywjzA9BhdPZrHkrI
Enter Key: e33779e362ede1ee868315333518e4140396a7b86946dd6e9188f44962eb7f85

📊 Feature Breakdown:
🔑 Key Length:            64
🔒 Ciphertext Length:     64
🔑 Key Entropy:           3.6536
🔒 Ciphertext Entropy:    5.1834
📏 Key/Cipher Ratio:      1.0000
📏 Entropy Ratio:         0.7049

🔐 Predicted Encryption Algorithm: AES
⚠️ Detected Weakness: Very high entropy and fixed block size make it easy to identify. Secure, but side-channel attacks exist if improperly implemented.
✅ Recommended Alternative: Use AES-256 in GCM mode or ChaCha20 for excellent security and speed.


