In [1]:
pip install pandas numpy matplotlib seaborn plotly scikit-learn folium geoip2 tqdm openpyxl


Collecting geoip2
  Downloading geoip2-5.1.0-py3-none-any.whl.metadata (19 kB)
Collecting maxminddb<3.0.0,>=2.7.0 (from geoip2)
  Downloading maxminddb-2.8.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (5.1 kB)
Downloading geoip2-5.1.0-py3-none-any.whl (27 kB)
Downloading maxminddb-2.8.2-cp311-cp311-macosx_11_0_arm64.whl (35 kB)
Installing collected packages: maxminddb, geoip2
Successfully installed geoip2-5.1.0 maxminddb-2.8.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/anaconda3/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
# ================================================================
# AWS CloudTrail Mini CSPM Project (GitHub‑Friendly Version)
# Author: Nowshika Mirza R
# Dataset: AWS CloudTrails Dataset from flaws.cloud (Kaggle)
# Purpose: Use minimal CSV sample for demonstration & GitHub upload
# ================================================================

# Install dependencies (only if not already installed)
# !pip install pandas numpy matplotlib seaborn plotly scikit-learn tqdm

# ================================================================
# 1. Imports & Initial Config
# ================================================================
import os, json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Select seaborn's "whitegrid" style.
sns.set(style="whitegrid")

# Central settings read by the rest of this cell: the sample CSV path, the
# output folder, and the two IsolationForest parameters.
CONFIG = dict(
    INPUT_CSV="cloudtrail_logs.csv",
    OUTPUT_DIR="cspm_outputs",
    ANOMALY_RANDOM_STATE=42,
    ISOLATION_FOREST_CONTAMINATION=0.05,
)

# Create the output folder up front; exist_ok keeps re-runs from failing.
os.makedirs(CONFIG["OUTPUT_DIR"], exist_ok=True)

# ================================================================
# 2. Create Mini Dataset from Kaggle File
# ================================================================
# If you download the Kaggle file (dec12_18features.csv) and keep it here,
# this cell creates a small subset safe for GitHub.
FULL_FILE = "dec12_18features.csv"
SAMPLE_FILE = CONFIG["INPUT_CSV"]

# Without the full Kaggle export there is nothing to sample from, so only
# tell the user; otherwise keep the first 500 rows as the GitHub-sized sample.
if not os.path.exists(FULL_FILE):
    print("⚠️ Kaggle full dataset not found. Please place 'dec12_18features.csv' in this folder.")
else:
    df_small = pd.read_csv(FULL_FILE, nrows=500)
    df_small.to_csv(SAMPLE_FILE, index=False)
    print(f"Sample created and saved as: {SAMPLE_FILE}")

# ================================================================
# 3. Load Sample Dataset
# ================================================================
# Load the sample CSV when it exists; otherwise synthesise a small demo
# dataset, persist it under the same name, and continue with that.
if os.path.exists(SAMPLE_FILE):
    df = pd.read_csv(SAMPLE_FILE)
else:
    # Seeded generator: the synthetic fallback is identical on every run,
    # so the scores and charts produced below are reproducible.
    demo_rng = np.random.default_rng(CONFIG["ANOMALY_RANDOM_STATE"])
    n_demo_rows = 50
    df = pd.DataFrame({
        # Lowercase "h" is the hourly alias; the uppercase spelling is
        # deprecated in recent pandas releases.
        "eventTime": pd.date_range("2025-01-01", periods=n_demo_rows, freq="h"),
        "eventName": demo_rng.choice(["ConsoleLogin","CreateUser","PutBucketAcl"], n_demo_rows),
        "awsRegion": demo_rng.choice(["us-east-1","eu-west-1"], n_demo_rows),
        "sourceIPAddress": demo_rng.choice(["192.168.1.1","54.12.123.10"], n_demo_rows),
        # None marks a successful call; any other value is treated as an error.
        "errorCode": demo_rng.choice([None,"AccessDenied"], n_demo_rows)
    })
    df.to_csv(SAMPLE_FILE, index=False)
    print(f"No Kaggle data found — demo dataset '{SAMPLE_FILE}' generated.")

print("Dataset loaded:", len(df), "rows")

# ================================================================
# 4. Rule‑Based Flags (CSPM simulation)
# ================================================================
# Rule 1: the event name is on the watch list.
is_watched_event = df["eventName"].isin(["ConsoleLogin","CreateUser","PutBucketAcl"])
# Rule 2: a ConsoleLogin event that carries any errorCode value.
is_failed_login = (df["eventName"] == "ConsoleLogin") & df["errorCode"].notna()

df["suspicious_event"] = is_watched_event
df["failed_login"] = is_failed_login
# rule_score is the number of rules that fired for the row (0, 1 or 2).
df["rule_score"] = df[["suspicious_event","failed_login"]].sum(axis=1)

# ================================================================
# 5. Risk Scoring & Simple Anomaly Detection
# ================================================================
# Base score: 10 points per triggered rule, plus 20 when the call carries an
# error code.
df["risk_score"] = df["rule_score"]*10 + df["errorCode"].notna()*20

# Per-IP event volume is the only feature given to the anomaly model.
# groupby drops null keys by default, so events without a sourceIPAddress
# have no row in `agg`.
agg = df.groupby("sourceIPAddress").agg(event_count=("eventName","count")).reset_index()
scaler = StandardScaler()
iso = IsolationForest(contamination=CONFIG["ISOLATION_FOREST_CONTAMINATION"],
                      random_state=CONFIG["ANOMALY_RANDOM_STATE"])
if agg.empty:
    # No usable source IPs: there is nothing to fit on, so nothing is flagged.
    agg["flag"] = pd.Series(dtype="int64")
else:
    # fit_predict labels each IP with -1 (outlier) or 1 (inlier).
    agg["flag"] = iso.fit_predict(scaler.fit_transform(agg[["event_count"]]))
agg["ip_anomaly"] = agg["flag"]==-1

# Flag an event when its IP is in the anomalous set. Unlike a left merge this
# always yields a plain boolean column: events whose sourceIPAddress is null
# become False instead of NaN, so the integer conversion below cannot fail.
anomalous_ips = agg.loc[agg["ip_anomaly"], "sourceIPAddress"]
df["ip_anomaly"] = df["sourceIPAddress"].isin(anomalous_ips)
df["risk_score"] += df["ip_anomaly"].astype(int)*30

def classify(score):
    """Map a numeric risk score to its risk band.

    Bands are checked from the highest threshold down: 60 and above is
    "Critical", 40 and above is "High", 20 and above is "Medium"; any score
    that meets none of the thresholds is "Low".
    """
    for threshold, label in ((60, "Critical"), (40, "High"), (20, "Medium")):
        if score >= threshold:
            return label
    return "Low"
# Attach the risk band to every event.
risk_levels = df["risk_score"].apply(classify)
df["Risk_Level"] = risk_levels

# ================================================================
# 6. Basic Reporting
# ================================================================
# One row per risk band with the number of events in it.
summary = (
    df["Risk_Level"]
    .value_counts()
    .rename_axis("Risk_Level")
    .reset_index(name="Count")
)
print(summary)

# Persist both the band counts and the fully annotated event table.
summary.to_csv(os.path.join(CONFIG["OUTPUT_DIR"],"risk_summary.csv"), index=False)
df.to_csv(os.path.join(CONFIG["OUTPUT_DIR"],"annotated_cloudtrail.csv"), index=False)

# Fixed colour per band so the pie reads the same regardless of slice order.
risk_colors = {"Low":"green","Medium":"orange","High":"red","Critical":"darkred"}
fig = px.pie(
    summary,
    names="Risk_Level",
    values="Count",
    title="Risk Level Distribution (Mini Sample)",
    color="Risk_Level",
    color_discrete_map=risk_colors,
)
fig.show()

print("\nAll outputs saved in:", CONFIG["OUTPUT_DIR"])


NameError: name 'cloudtrail_logs' is not defined

In [4]:
# ================================================================
# AWS CloudTrail CSPM Mini Project — Full GitHub‑Size Edition
# Author: Nowshika Mirza R
# ================================================================

# !pip install pandas numpy matplotlib seaborn plotly scikit-learn tqdm

# ================================================================
# 1. Setup & Imports
# ================================================================
import os, json, numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns, plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from tqdm import tqdm

# Select seaborn's "whitegrid" style.
sns.set(style="whitegrid")

# Settings for this cell: sample CSV path, output folder, and the
# IsolationForest contamination rate and random seed.
CONFIG = dict(
    INPUT_CSV="cloudtrail_logs_sample.csv",
    OUTPUT_DIR="cspm_outputs",
    ISOLATION_FOREST_CONTAMINATION=0.05,
    ANOMALY_RANDOM_STATE=42,
)

# Make sure the output folder exists before anything is written to it.
os.makedirs(CONFIG["OUTPUT_DIR"], exist_ok=True)

# ================================================================
# 2. Create / Load Lightweight Dataset
# ================================================================
FULL_FILE = "dec12_18features.csv"
SAMPLE_FILE = CONFIG["INPUT_CSV"]

# Refresh the lightweight sample from the full Kaggle export when it is
# available; otherwise fall back to a sample written by an earlier run.
if os.path.exists(FULL_FILE):
    df_small = pd.read_csv(FULL_FILE, nrows=800)
    df_small.to_csv(SAMPLE_FILE, index=False)
    print(f"Sample saved as {SAMPLE_FILE}")
else:
    print("⚠️ Please place Kaggle file dec12_18features.csv here first.")

# Fail with an actionable message instead of letting read_csv raise on a
# path that was never created.
if not os.path.exists(SAMPLE_FILE):
    raise FileNotFoundError(
        f"Neither '{FULL_FILE}' nor a previously generated '{SAMPLE_FILE}' exists "
        "in the working directory; download the Kaggle file and re-run this cell."
    )

df = pd.read_csv(SAMPLE_FILE)
print("Rows:", len(df), "Columns:", df.columns.tolist()[:10])

# ================================================================
# 3. Basic Cleaning
# ================================================================
# Strip stray whitespace from the header names so the lookups below match.
df = df.rename(columns=str.strip)

# Parse timestamps when the column is present; unparseable values become NaT.
if "eventTime" in df.columns:
    df["eventTime"] = pd.to_datetime(df["eventTime"], errors="coerce")

# Guarantee that every column the later sections read exists, filling any
# absent one with None.
core = ["eventTime","eventName","awsRegion","sourceIPAddress","errorCode"]
for absent_col in [name for name in core if name not in df.columns]:
    df[absent_col] = None

# Rows without an event name cannot be scored. The in-place form is kept on
# purpose so `df` remains the same object for the assignments that follow.
df.dropna(subset=["eventName"], inplace=True)

# ================================================================
# 4. CSPM‑Style Rule Flags
# ================================================================
# Event names that count as security-relevant for the rule score.
watched_events = [
    "ConsoleLogin",
    "AuthorizeSecurityGroupIngress",
    "AttachRolePolicy",
    "CreateUser",
    "DeleteBucket",
    "PutBucketAcl",
    "PutBucketPolicy",
]
df["suspicious_event"] = df["eventName"].isin(watched_events)
# A failed login is a ConsoleLogin event that carries any errorCode value.
df["failed_login"] = df["eventName"].eq("ConsoleLogin") & df["errorCode"].notna()
# rule_score is the number of flags that fired for the row (0, 1 or 2).
df["rule_score"] = df[["suspicious_event","failed_login"]].sum(axis=1)

# ================================================================
# 5. Risk Scoring & Anomaly Detection (IsolationForest)
# ================================================================
# Base score: 10 points per triggered rule, plus 20 when the call carries an
# error code.
df["risk_score"] = df["rule_score"]*10 + df["errorCode"].notna()*20

# Per-IP event volume is the only feature given to the anomaly model.
# groupby drops null keys by default, so events without a sourceIPAddress
# (including the case where the whole column was missing from the input and
# filled with None) have no row in `agg`.
agg = df.groupby("sourceIPAddress").agg(event_count=("eventName","count")).reset_index()
scaler = StandardScaler()
iso = IsolationForest(random_state=CONFIG["ANOMALY_RANDOM_STATE"],
                      contamination=CONFIG["ISOLATION_FOREST_CONTAMINATION"])
if agg.empty:
    # No usable source IPs: there is nothing to fit on, so nothing is flagged.
    agg["flag"] = pd.Series(dtype="int64")
else:
    # fit_predict labels each IP with -1 (outlier) or 1 (inlier).
    agg["flag"] = iso.fit_predict(scaler.fit_transform(agg[["event_count"]]))
agg["ip_anomaly"] = agg["flag"]==-1

# Flag an event when its IP is in the anomalous set. Unlike a left merge this
# always yields a plain boolean column: events whose sourceIPAddress is null
# become False instead of NaN, so the integer conversion below cannot fail.
anomalous_ips = agg.loc[agg["ip_anomaly"], "sourceIPAddress"]
# Give the frame a clean 0..n-1 index for the steps that follow.
df = df.reset_index(drop=True)
df["ip_anomaly"] = df["sourceIPAddress"].isin(anomalous_ips)
df["risk_score"] += df["ip_anomaly"].astype(int)*30

def classify(score):
    """Return the risk band for a numeric risk score.

    "Critical" for 60 and above, "High" for 40 and above, "Medium" for 20 and
    above, and "Low" for everything else.
    """
    if score >= 60:
        return "Critical"
    if score >= 40:
        return "High"
    return "Medium" if score >= 20 else "Low"
# Attach the risk band to every event.
df["Risk_Level"] = df["risk_score"].apply(classify)

# ================================================================
# 6. Region‑wise Compliance Simulation
# ================================================================
# An event is non-compliant when it changes a bucket ACL/policy or when it
# carries an error code; everything else counts as compliant.
touches_bucket_policy = df["eventName"].isin(["PutBucketAcl","PutBucketPolicy"])
has_error_code = df["errorCode"].notna()
df["Compliant"] = ~(touches_bucket_policy | has_error_code)

# Per-region roll-up: event volume, mean risk score, and the share of
# compliant events expressed as a percentage.
region_summary = (
    df.groupby("awsRegion")
      .agg(
          events=("eventName","count"),
          avg_risk=("risk_score","mean"),
          compliance_pct=("Compliant", lambda flags: flags.mean()*100),
      )
      .reset_index()
)

# ================================================================
# 7. Save Key Outputs
# ================================================================
# Write the annotated events and the per-region roll-up into the output folder.
out_dir = CONFIG["OUTPUT_DIR"]
os.makedirs(out_dir, exist_ok=True)
for frame, filename in ((df, "annotated_cloudtrail.csv"),
                        (region_summary, "region_summary.csv")):
    frame.to_csv(os.path.join(out_dir, filename), index=False)

print("All processed files saved to:", out_dir)

# ================================================================
# 8. Visualizations (5 charts)
# ================================================================
# 8.1 Risk Level Pie
# Fixed colour per band so the pie reads the same regardless of slice order.
risk_colors = {"Low":"green","Medium":"orange","High":"red","Critical":"darkred"}
fig1 = px.pie(
    df,
    names="Risk_Level",
    title="Overall Risk Distribution",
    color="Risk_Level",
    color_discrete_map=risk_colors,
)
fig1.show()

# 8.2 Top Event Types by Frequency & Risk
# Count and mean risk per event name, then keep the 15 most frequent.
event_stats = df.groupby("eventName").agg(
    count=("eventName","count"),
    avg_risk=("risk_score","mean"),
).reset_index()
top_events = event_stats.sort_values("count",ascending=False).head(15)
fig2 = px.bar(
    top_events,
    x="eventName",
    y="count",
    color="avg_risk",
    title="Top Event Types (by Count and Average Risk)",
    color_continuous_scale="Reds",
)
fig2.update_layout(xaxis_tickangle=-45)
fig2.show()

# 8.3 Timeline of Events per Day
# Only draw the timeline when eventTime actually holds parsed timestamps.
# Checking for the column alone is not enough: the cleaning step creates an
# all-None eventTime column when the input lacks one, and resample() raises
# on a non-datetime index.
event_times = df["eventTime"] if "eventTime" in df.columns else None
if (event_times is not None
        and pd.api.types.is_datetime64_any_dtype(event_times)
        and event_times.notna().any()):
    # Rows whose timestamp failed to parse (NaT) cannot be assigned to a day,
    # so they are left out of the daily counts.
    tl = (df.dropna(subset=["eventTime"])
            .set_index("eventTime")
            .resample("D")
            .size()
            .reset_index(name="daily_events"))
    fig3 = px.line(tl, x="eventTime", y="daily_events",
                   title="Events Per Day (Timeline)")
    fig3.show()
else:
    print("Skipping timeline chart: no parseable eventTime values.")

# 8.4 Compliance % by Region
# One bar per region, coloured by the same compliance percentage it plots.
fig4 = px.bar(
    region_summary,
    x="awsRegion",
    y="compliance_pct",
    title="Compliance Percentage by Region",
    color="compliance_pct",
    color_continuous_scale="Tealgrn",
)
fig4.show()

# 8.5 Top Source IPs by Anomalous Behavior
# Per-IP event count, mean risk, and whether any of its events was flagged;
# keep the 20 IPs with the highest mean risk.
ip_stats = df.groupby("sourceIPAddress").agg(
    events=("eventName","count"),
    risk_mean=("risk_score","mean"),
    anomaly=("ip_anomaly","any"),
).reset_index()
ip_summary = ip_stats.sort_values("risk_mean",ascending=False).head(20)
fig5 = px.bar(
    ip_summary,
    x="sourceIPAddress",
    y="risk_mean",
    color="anomaly",
    title="Top IPs by Average Risk (Flag = Anomaly = True)",
)
fig5.update_layout(xaxis_tickangle=-45)
fig5.show()

# ================================================================
# 9. Mini Report Summary
# ================================================================
# Headline numbers for the run, cast to plain ints so json.dumps accepts them.
is_high_risk = df["Risk_Level"].isin(["High","Critical"])
is_non_compliant = ~df["Compliant"]
summary = {
    "total_events": int(len(df)),
    "unique_IPs": int(df["sourceIPAddress"].nunique()),
    "high_risk_events": int(is_high_risk.sum()),
    "non_compliant_events": int(is_non_compliant.sum()),
}
print(json.dumps(summary, indent=2))

print("\nCSPM Mini‑Project Completed — 5 interactive charts generated ✅")


Sample saved as cloudtrail_logs_sample.csv
Rows: 800 Columns: ['eventID', 'eventTime', 'sourceIPAddress', 'userAgent', 'eventName', 'eventSource', 'awsRegion', 'eventVersion', 'userIdentitytype', 'eventType']
All processed files saved to: cspm_outputs



The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



{
  "total_events": 800,
  "unique_IPs": 5,
  "high_risk_events": 85,
  "non_compliant_events": 100
}

CSPM Mini‑Project Completed — 5 interactive charts generated ✅
