In [5]:
# Install necessary dependencies
!pip install transformers pandas google-colab reportlab

Collecting reportlab
  Downloading reportlab-4.3.1-py3-none-any.whl.metadata (1.7 kB)
Collecting jedi>=0.16 (from ipython==7.34.0->google-colab)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading reportlab-4.3.1-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab, jedi
Successfully installed jedi-0.19.2 reportlab-4.3.1


In [6]:
import io
import os
import re
from getpass import getpass

import google.generativeai as genai
import pandas as pd
from google.colab import files
from transformers import pipeline

In [7]:
# Configure the Gemini client.
# The API key is read from the GOOGLE_API_KEY environment variable and, if that
# is not set, requested interactively without echoing it. It must never be
# written into the notebook itself.
# SECURITY: a literal key was previously stored in this cell; treat that key as
# leaked and revoke/rotate it in the Google Cloud console.
GEMINI_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass("Enter your Gemini API key: ")
if not GEMINI_API_KEY:
    raise ValueError("A Gemini API key is required to continue.")
genai.configure(api_key=GEMINI_API_KEY)

In [8]:
# Upload CSV file via the Colab file picker; the result maps each uploaded
# file name to that file's raw bytes.
uploaded = files.upload()
if not uploaded:
    # The picker was cancelled: fail with a clear message instead of an IndexError.
    raise ValueError("No file was uploaded; please select the CSV containing the CTI report.")

# Read the first uploaded file directly from the returned bytes. This avoids
# depending on the name Colab saved it under, which may differ from the
# original name (e.g. "name (1).csv") when a file of that name already exists.
file_name = next(iter(uploaded))  # Get the uploaded file name
df = pd.read_csv(io.BytesIO(uploaded[file_name]))  # Read CSV

# Extract text (assuming the first column contains the unstructured CTI report).
# Empty cells are dropped so they do not end up as literal "nan" in the text.
raw_cti_report = " ".join(df.iloc[:, 0].dropna().astype(str))  # Convert to string


Saving raw_cti_report.csv to raw_cti_report (1).csv


In [9]:
# Preprocess the extracted text
# NOTE(review): `preprocess_text` is not defined in any cell visible in this
# notebook. Unless it is defined elsewhere, this cell raises NameError on a
# fresh kernel (Restart & Run All) — presumably its defining cell was removed;
# restore that definition above this cell.
cleaned_text = preprocess_text(raw_cti_report)
print("Cleaned Text:\n", cleaned_text)

Cleaned Text:
 Hey team, just got some reports about a phishing campaign hitting financial orgs and government agencies in APAC. Looks like it started early March, some of the emails weve seen have subjects like Urgent Account Verification Required and Payroll Update  March 2025. The attachments are fake invoices Invoice_March2025.pdf.exe and macroenabled Word docs payment_receipt_2025.docm. If the user enables macros, it executes a PowerShell script update.ps1 which then downloads a secondstage payload from hxxpc2server.xyzsecondstage.bin. Looks like a RAT. Seeing multiple reports of outbound connections to 185.220.101.3 over HTTPS, possibly used as a C2. This IP was previously linked to APT29 activity but not confirmed yet. Also found some new domains registered last monthsecureloginbanking.com, clientupdate.xyz, govdocumentsauth.com. These were all set up within the last 30 days, so likely burner domains. Checking logs from a compromised endpoint, shows user opened the email, launch

In [13]:
# Define a fully autonomous prompt to Gemini model to extract relevant details.
# The cleaned CTI text is interpolated at the end of the prompt.
prompt = f"""
You are a Cybersecurity Threat Intelligence Analyst. Analyze the following Cyber Threat Intelligence (CTI) report and extract the following:

- ATT&CK Tactics
- ATT&CK Techniques (TXXXX format)
- Indicators of Compromise (IOCs)
- Any other relevant details that can help in understanding the attack
- Severity score (Low, Medium, High, Critical)
-Last give a small summary on how the attack has been taken by making it as a description

Text: {cleaned_text}
"""

# Use Gemini API to generate structured intelligence
model = genai.GenerativeModel("gemini-1.5-pro")
response = model.generate_content(prompt)

# Extract structured output from Gemini response.
# `response.text` raises ValueError when the response holds no usable text
# (for example when it was blocked), so re-raise with the prompt feedback
# attached to make the cause visible.
try:
    structured_output = response.text
except ValueError as err:
    raise ValueError(
        "Gemini returned no usable text (the response may have been blocked or empty). "
        f"Prompt feedback: {response.prompt_feedback}"
    ) from err

# Postprocess the model output before it is placed in the report
def postprocess_output(output):
    """
    Normalises the raw text returned by the model.

    Only leading and trailing whitespace is removed; the content itself is
    passed through unchanged.

    Args:
        output: Raw model text. ``None`` is treated as an empty response.

    Returns:
        The text without surrounding whitespace ("" when ``output`` is None).
    """
    if output is None:
        return ""
    return str(output).strip()

# Run the Gemini answer through the post-processing step
structured_text = postprocess_output(output=structured_output)

# Show the result; identifying IOCs and tactics is left entirely to the model
print("\nFormatted Structured Report:\n", structured_text)


Formatted Structured Report:
 ## CTI Report Analysis

**ATT&CK Tactics:**

* **Initial Access:** Phishing
* **Execution:** Command and Scripting Interpreter (PowerShell)
* **Persistence:** Registry Run Keys / Startup Folder, Scheduled Task/Job
* **Command and Control:**  Application Layer Protocol (HTTPS), DNS
* **Exfiltration:** DNS

**ATT&CK Techniques:**

* **T1566:** Phishing
* **T1059:** Command and Scripting Interpreter
* **T1547:** Boot or Logon Autostart Execution
* **T1053:** Scheduled Task/Job
* **T1071:** Application Layer Protocol
* **T1071.004:** DNS

**Indicators of Compromise (IOCs):**

* **Email Subjects:** `Urgent Account Verification Required`, `Payroll Update March 2025`
* **Attachments:** `Invoice_March2025.pdf.exe`, `payment_receipt_2025.docm`
* **PowerShell Script:** `update.ps1`
* **Payload URL:** `hxxpc2server.xyz/secondstage.bin`
* **C2 IP:** `185.220.101.3` (HTTPS)
* **Domains:** `secureloginbanking.com`, `clientupdate.xyz`, `govdocumentsauth.com`


**Other R

In [15]:
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

def generate_pdf_report(structured_text, filename="CTI_Structured_Report.pdf"):
    """
    Converts structured threat intelligence text into a PDF report.

    The text is written line by line beneath a bold title. Lines wider than the
    printable area are wrapped at word boundaries, and a new page is started
    whenever the bottom margin is reached.

    Args:
        structured_text: Report text; newline characters separate output lines.
            ``None`` is treated as empty text.
        filename: Path of the PDF file to create.

    Side effects:
        Writes the PDF to ``filename``, prints a confirmation message and, when
        the ``google.colab`` module is importable, triggers a browser download.
    """
    # Local import so this function does not depend on edits to other cells.
    from reportlab.lib.utils import simpleSplit

    left_margin = 100    # x position where every line starts
    right_margin = 50    # space kept free at the right edge
    bottom_margin = 40   # lowest y position text may occupy
    line_height = 14     # Line spacing

    c = canvas.Canvas(filename, pagesize=letter)
    width, height = letter
    max_text_width = width - left_margin - right_margin

    # Title
    c.setFont("Helvetica-Bold", 16)
    c.drawString(left_margin, height - 50, "Cyber Threat Intelligence Report")
    c.setFont("Helvetica", 12)
    y_position = height - 80  # Starting height for content, below the title

    # Function to write text while handling line wrapping and page overflow
    def write_text(text, font="Helvetica", size=12):
        nonlocal y_position
        c.setFont(font, size)

        for line in text.split("\n"):
            if c.stringWidth(line, font, size) <= max_text_width:
                # Fits as-is: draw unchanged (keeps indentation and blank lines).
                pieces = [line]
            else:
                # Too wide: break at word boundaries. A single word wider than
                # the page cannot be split this way and is still drawn whole.
                pieces = simpleSplit(line, font, size, max_text_width) or [""]

            for piece in pieces:
                if y_position - line_height < bottom_margin:  # Avoid cutting off text at bottom
                    c.showPage()  # New page
                    c.setFont(font, size)  # Font state is reset on a new page
                    y_position = height - 50  # Reset position at the top

                c.drawString(left_margin, y_position, piece)
                y_position -= line_height  # Move cursor down

    # Write the structured report into the PDF
    write_text("" if structured_text is None else str(structured_text))

    # Save PDF
    c.save()
    print(f"✅ Report saved as '{filename}'")

    # Download PDF (Colab only); outside Colab the file simply stays on disk.
    try:
        from google.colab import files
    except ImportError:
        return
    files.download(filename)

# Render the structured report to a PDF, naming the output file explicitly
generate_pdf_report(structured_text, filename="CTI_Structured_Report.pdf")


✅ Report saved as 'CTI_Structured_Report.pdf'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>