# AI-Powered Cybersecurity Lab Notebook
This notebook contains the following sections:
1. Phishing Detection
2. Behavioral Analytics / Anomaly Detection
3. Autoencoders & Deepfakes (PCA demo)
4. AI vs AI Defense Simulation
5. AI-Driven Security Operations


In [None]:
# Section 1 — AI-Powered Phishing Detection
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
import matplotlib.pyplot as plt
import numpy as np

# Toy labelled corpus: ten short messages with a 0/1 label each.
data = {
    "text": [
        "Your account has been suspended, click here to verify.",
        "Please listen to the attached message carefully.",
        "Meeting at 10am, see you soon.",
        "Win a free gift card now!",
        "Your invoice is attached.",
        "System update completed successfully.",
        "Urgent: verify your account immediately.",
        "Lunch at 1pm?",
        "Attachment: payroll report",
        "Weekly newsletter - company updates.",
    ],
    # 1 = phishing, 0 = legit
    "label": [1, 1, 0, 1, 1, 0, 1, 0, 1, 0],
}
df = pd.DataFrame(data)

# Features are bag-of-words token counts; the target is the label column.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["text"])
y = df["label"]

# Fit the classifier on the whole corpus.
model = LogisticRegression().fit(X, y)

# Predictions are made on the same rows the model was fitted on (there is no
# held-out split in this cell), so the figures below are training metrics.
y_pred = model.predict(X)
cm = confusion_matrix(y, y_pred)
acc = accuracy_score(y, y_pred)
print(f"Accuracy: {acc:.2f}")

ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap="Blues")
plt.title("Phishing Detection Confusion Matrix")
plt.show()

# Rank vocabulary terms by their coefficient and keep the three largest
# (listed in ascending coefficient order).
feature_names = np.asarray(vectorizer.get_feature_names_out())
coef_order = np.argsort(model.coef_[0])
top_words = feature_names[coef_order[-3:]]
print("Top phishing-indicative words:", top_words)


In [None]:
# Section 2 — Behavioral Analytics / Anomaly Detection
from sklearn.ensemble import IsolationForest

# Synthetic baseline login data: 50 rows drawn from a fixed seed so the cell
# is reproducible.
np.random.seed(0)
login_data = pd.DataFrame({
    "hour": np.random.randint(6, 23, 50),
    "country": np.random.choice(["US", "UK", "IN", "DE"], 50),
    "device": np.random.choice(["PC", "Mobile"], 50),
    "failed_attempts": np.random.randint(0, 3, 50)
})

# One-hot encode the categorical columns; drop_first removes one level per
# column, which then acts as the implicit baseline category.
login_data_enc = pd.get_dummies(login_data, drop_first=True)

# Train IsolationForest. fit_predict returns +1 (inlier) / -1 (outlier)
# labels, which are stored next to the raw rows.
iso = IsolationForest(contamination=0.1, random_state=0)
scores = iso.fit_predict(login_data_enc)
login_data["anomaly"] = scores

# Hand-crafted suspicious records to score against the fitted model.
suspicious = pd.DataFrame({
    "hour": [2, 3, 23],
    "country": ["RU", "CN", "BR"],
    "device": ["PC", "Mobile", "PC"],
    "failed_attempts": [5, 4, 6]
})

# Encode WITHOUT drop_first: the level that drop_first removes depends on
# which categories happen to be present in *this* frame, not on the training
# baseline, so it could discard a column the model actually uses. Aligning to
# the training columns afterwards keeps only the features the model was
# fitted on and zero-fills the ones missing here.
# NOTE: categories never seen in training (e.g. these countries) have no
# matching column, so after the reindex such rows carry all-zero country
# indicators, i.e. they look like the baseline category to the model.
suspicious_enc = pd.get_dummies(suspicious).reindex(
    columns=login_data_enc.columns, fill_value=0
)
suspicious["anomaly"] = iso.predict(suspicious_enc)
print("Suspicious flagged:\n", suspicious)

# Visualize the decision-function scores of the baseline data.
plt.hist(iso.decision_function(login_data_enc), bins=20, color='skyblue')
plt.title("Anomaly Scores Histogram")
plt.xlabel("Score")
plt.ylabel("Frequency")
plt.show()


In [None]:
# Section 3 — PCA Autoencoder Demo
from sklearn.decomposition import PCA
from sklearn.datasets import load_digits

# Load the digits data as flat rows and divide by 16.0
# (presumably pixel intensities range 0-16 -- confirm against the dataset docs).
digits = load_digits()
X = digits.data / 16.0

# Wide code: 64 components, the same number as values per image (8 x 8).
# NOTE: X_recon_large is computed here but not shown in the figure below.
pca_large = PCA(n_components=64)
codes_large = pca_large.fit_transform(X)
X_recon_large = pca_large.inverse_transform(codes_large)

# Narrow code: only 8 components, so the reconstruction is lossy.
pca_small = PCA(n_components=8)
codes_small = pca_small.fit_transform(X)
X_recon_small = pca_small.inverse_transform(codes_small)

# Two rows of ten images: originals on top, 8-component reconstructions below.
fig, axes = plt.subplots(2, 10, figsize=(10, 2))
for row, images in enumerate((X, X_recon_small)):
    for i in range(10):
        ax = axes[row, i]
        ax.imshow(images[i].reshape(8, 8), cmap='gray')
        ax.axis('off')
plt.suptitle("Top: Original | Bottom: Reconstructed (8 comps)")
plt.show()


In [None]:
# Section 4 — AI vs AI in Cyber Defense (Toy Simulation)
from sklearn.model_selection import train_test_split

# Toy corpus as (phrase, label) pairs; label 1 marks the phrases treated as
# malicious and 0 the benign ones.
samples = [
    ("verify account now", 1),
    ("click to reset password", 1),
    ("your account locked", 1),
    ("meeting schedule", 0),
    ("project report ready", 0),
    ("lunch with team", 0),
    ("free prize claim", 1),
    ("update details", 1),
    ("important message", 1),
    ("please call back", 0),
]
texts = [phrase for phrase, _ in samples]
labels = [label for _, label in samples]

# Hold out 30% of the phrases for the attack rounds; fixed seed for
# reproducibility.
train_texts, test_texts, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.3,
    random_state=42,
)

def mutate_phrases(texts):
    """Return a new list of phrases with the attacker's word swaps applied.

    For each string in ``texts``, every occurrence of "account" is replaced
    with "profile", and then every occurrence of "click" with "tap". The
    replacement is a case-sensitive substring match. The input iterable is
    not modified.
    """
    mutated = []
    for phrase in texts:
        swapped = phrase.replace("account", "profile")
        mutated.append(swapped.replace("click", "tap"))
    return mutated

# Defender: fit an initial bag-of-words classifier on the training phrases only.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_texts)
model = LogisticRegression()
model.fit(X_train, y_train)

# Attack/defend rounds. The loop variable is deliberately not called `round`:
# at notebook scope that would rebind the builtin and break any later
# round(...) call.
n_rounds = 5
accs = []
for round_num in range(1, n_rounds + 1):
    # Attacker rewrites the held-out phrases.
    test_texts = mutate_phrases(test_texts)

    # Defender scores the mutated phrases with the current model/vocabulary.
    X_test = vectorizer.transform(test_texts)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accs.append(acc)

    # Defender adapts: refit vocabulary and model on everything seen so far.
    # NOTE: from the second round on, the model has already been trained on
    # these test phrases, so later accuracies are measured on seen data.
    model.fit(vectorizer.fit_transform(train_texts + test_texts), y_train + y_test)
    print(f"Round {round_num} Accuracy: {acc:.2f}")

plt.plot(range(1, n_rounds + 1), accs, marker='o')
plt.title("Defender Accuracy Over Rounds")
plt.xlabel("Round")
plt.ylabel("Accuracy")
plt.show()


In [None]:
# Section 5 — AI-Driven Security Operations Simulation
import datetime as dt

# Simulate one week of failed-login counts for three users; fixed seed so the
# generated alerts are reproducible.
np.random.seed(42)
days = pd.date_range("2025-01-01", periods=7)
alerts = []

for day in days:
    for user in ["alice", "bob", "charlie"]:
        failed = np.random.randint(0, 5)
        # Three or more failed attempts triggers an automatic account lock.
        if failed >= 3:
            alerts.append({"date": day, "user": user, "failed_attempts": failed, "action": "lock_account"})

# Explicit columns keep the schema (and the CSV header) stable even when no
# alert fired; without them an empty frame has no "date" column and the
# groupby below would raise KeyError.
alert_columns = ["date", "user", "failed_attempts", "action"]
alerts_df = pd.DataFrame(alerts, columns=alert_columns)
alerts_df.to_csv("alerts_log.csv", index=False)
print("Alerts logged to alerts_log.csv")

# Plot daily frequency. Reindex over every simulated day so that days with
# zero alerts appear as empty bars instead of silently vanishing from the axis.
daily = (
    alerts_df.groupby("date")
    .size()
    .reindex(days, fill_value=0)
    .rename_axis("date")
)
daily.plot(kind="bar", color="salmon", title="Daily Incident Frequency")
plt.ylabel("Count")
plt.show()
