In [None]:
!pip install cvss

import pandas as pd
from cvss import CVSS3
import pickle

Collecting cvss
  Downloading cvss-3.3-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading cvss-3.3-py2.py3-none-any.whl (30 kB)
Installing collected packages: cvss
Successfully installed cvss-3.3


In [None]:
import google.generativeai as genai
import csv
import time
from itertools import islice

# Function to generate an answer using Google Generative AI
def generate_answer(prompt, api_keys):
    """Generate text for ``prompt``, trying each API key until one succeeds.

    Each key is tried in order. A key is abandoned when the request raises
    (for example on a quota error) or when the response carries no content,
    and the next key is tried.

    Args:
        prompt: Text sent to the generative model.
        api_keys: Sequence of API key strings to try in order.

    Returns:
        The stripped text of the first part of the first candidate, or the
        sentinel string "Failed to generate content with all provided API
        keys." when no key produced content.

    Raises:
        ValueError: If ``api_keys`` is empty or None.
    """
    if not api_keys:
        raise ValueError("No API keys provided.")

    # Configuration for the model (identical for every key, so built once)
    generation_config = {
        "temperature": 0.7,
        "max_output_tokens": 150,
    }

    for api_key in api_keys:
        # Never print the full credential: notebook outputs are saved and
        # shared, so only a short suffix is shown to tell the keys apart.
        key_text = str(api_key)
        key_label = f"...{key_text[-4:]}" if len(key_text) > 8 else "****"

        try:
            # Configure the API
            genai.configure(api_key=api_key)

            # Call the generative model
            model = genai.GenerativeModel(model_name="gemini-1.0-pro", generation_config=generation_config)
            response = model.generate_content(prompt)

            # Check if the response has candidates and content
            if response.candidates and len(response.candidates[0].content.parts) > 0:
                return response.candidates[0].content.parts[0].text.strip()

            print(f"API key {key_label} returned no content, trying next key...")
        except Exception as e:
            # Deliberately broad: any failure with this key falls through to the next one.
            print(f"Error with API key {key_label}: {e}, trying next key...")

    return "Failed to generate content with all provided API keys."

# Main function to process a TSV file of CVE descriptions and save the results to a file
def process_cve_file(filename, api_keys, limit=10, output_file="results.txt", delay_seconds=5):
    """Send CVE descriptions from a TSV file to the model and record the replies.

    Reads at most ``limit`` rows from ``filename``, builds a CWE-mapping prompt
    from each row's 'Description' column, and writes one
    "<description>\\t<generated text>" record per row to ``output_file`` after
    a header line. The generated text is written verbatim, so one record may
    span several lines. Each result is also printed to the console.

    Args:
        filename: Path to a tab-separated file with a 'Description' column.
        api_keys: API keys passed through to ``generate_answer``.
        limit: Maximum number of input rows to read.
        output_file: Path of the text file to (over)write with the results.
        delay_seconds: Pause after each request, to stay under API rate limits.

    Raises:
        KeyError: If the input has no 'Description' column.
    """
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(filename, mode='r', encoding='utf-8', newline='') as file:
        csv_reader = csv.DictReader(file, delimiter='\t')  # Specify '\t' as the delimiter for TSV
        # Use islice to limit the number of rows processed
        limited_rows = islice(csv_reader, limit)

        # Open output file to write results
        with open(output_file, mode='w') as out_file:
            out_file.write("CVE Description\tGenerated Output\n")  # Header

            for row in limited_rows:
                raw_description = row['Description']  # Use the 'Description' column
                # Short or blank rows carry nothing to classify; skip them
                # rather than spending an API request on an empty prompt.
                if raw_description is None or not raw_description.strip():
                    continue
                cve_description = raw_description.strip()

                # Create the prompt
                prompt = f"""You are a cybersecurity expert specializing in cyber threat intelligence.
                Analyze the following CVE description and map it to the appropriate CWE.
                Provide a brief justification for your choice.
                Ensure the last line of your response contains only the CWE ID.
                CVE Description: {cve_description}"""

                # Generate the response
                generated_text = generate_answer(prompt, api_keys)

                # Write the result to the output file
                out_file.write(f"{cve_description}\t{generated_text}\n")

                # Print the result to console (optional)
                print(f"CVE Description: {cve_description}")
                print(f"Generated Output: {generated_text}\n")

                # Pause before making the next request
                if delay_seconds > 0:
                    time.sleep(delay_seconds)

# Example usage
if __name__ == "__main__":
    import os

    # Credentials must not live in the notebook source: read them from the
    # GEMINI_API_KEYS environment variable (comma-separated list of keys).
    # NOTE: any key that was ever committed in this file should be treated as
    # compromised and revoked.
    api_keys = [
        key.strip()
        for key in os.environ.get("GEMINI_API_KEYS", "").split(",")
        if key.strip()
    ]
    if not api_keys:
        raise ValueError("No API keys provided. Set the GEMINI_API_KEYS environment variable.")

    # Process the first 200 rows and save the responses to results.txt
    process_cve_file("/content/drive/MyDrive/dataset/data/cti-rcm.tsv", api_keys, limit=200, output_file="results.txt")


API key AIzaSyA6C0BzRxHy75Z3nW2eULOMfdlOo4BNqaM returned no content, trying next key...
API key AIzaSyDYcgZQKq-xL97xcLg2lCrJQvO5wZau4Xg returned no content, trying next key...
API key AIzaSyDhBBf2ompDiRdr2caqBVtB5ClFWqT8zTM returned no content, trying next key...
CVE Description: In the Linux kernel through 6.7.1, there is a use-after-free in cec_queue_msg_fh, related to drivers/media/cec/core/cec-adap.c and drivers/media/cec/core/cec-api.c.
Generated Output: Failed to generate content with all provided API keys.

CVE Description: IBM OpenPages with Watson 8.3 and 9.0 could provide weaker than expected security in a OpenPages environment using Native authentication. If OpenPages is using Native authentication an attacker with access to the OpenPages database could through a series of specially crafted steps could exploit this weakness and gain unauthorized access to other OpenPages accounts. IBM X-Force ID: 262594.
Generated Output: The CVE description indicates that an attacker could 



Error with API key AIzaSyA6C0BzRxHy75Z3nW2eULOMfdlOo4BNqaM: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.0-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota)., trying next key...
API key AIzaSyDYcgZQKq-xL97xcLg2lCrJQvO5wZau4Xg returned no content, trying next key...
CVE Description: IBM WebSphere Application Server 8.5 and 9.0 could provide weaker than expected security for outbound TLS connections caused by a failure to honor user configuration.  IBM X-Force ID:  274812.
Generated Output: The CVE description indicates that the IBM WebSphere Application Server fails to honor user configuration for outbound TLS connections, leading to weaker security. This suggests a configuration error that allows an attacker to bypass security controls. The appropriate CWE is therefore:

CWE-200: Information Exposure due to Improper Input Handling

CVE Description: An invalid memory write issue in Jasper-Software Jasper v.



Error with API key AIzaSyDYcgZQKq-xL97xcLg2lCrJQvO5wZau4Xg: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.0-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota)., trying next key...




Error with API key AIzaSyDhBBf2ompDiRdr2caqBVtB5ClFWqT8zTM: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.0-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota)., trying next key...
CVE Description: In the Linux kernel, the following vulnerability has been resolved:  tls: fix race between async notify and socket close  The submitting thread (one which called recvmsg/sendmsg) may exit as soon as the async crypto handler calls complete() so any code past that point risks touching already freed data.  Try to avoid the locking and extra flags altogether. Have the main thread hold an extra reference, this way we can depend solely on the atomic ref counter for synchronization.  Don't futz with reiniting the completion, either, we are now tightly controlling when completion fires.
Generated Output: Failed to generate content with all provided API keys.



In [None]:
# Function to clean the text file by removing "No content generated" and "Justification" entries
def clean_text_file(input_file, output_file):
    """Copy ``input_file`` to ``output_file`` without marker lines and their neighbours.

    A marker line is one containing "No content generated." or
    "Justification". For each marker line, three lines are dropped: the most
    recently kept line, the marker line itself, and the line that follows it.
    Every other line is written out stripped of surrounding whitespace and
    terminated by a newline.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the cleaned text file to (over)write.
    """
    with open(input_file, 'r') as file:
        lines = file.readlines()

    cleaned_lines = []
    skip_next = False

    for raw_line in lines:
        line = raw_line.strip()

        if skip_next:
            skip_next = False
            continue

        if "No content generated." in line or "Justification" in line:
            # Remove the line kept just before the marker. A marker can be the
            # first line of the file, in which case there is nothing to remove.
            if cleaned_lines:
                cleaned_lines.pop()
            skip_next = True
        else:
            cleaned_lines.append(line)

    # Write the cleaned data to a new file
    with open(output_file, 'w') as file:
        file.writelines(f"{line}\n" for line in cleaned_lines)

# Example usage
input_file = 'results.txt'  # Raw model responses (the output_file name used for process_cve_file above)
output_file = 'cleaned_file.txt'    # Destination for the filtered lines
clean_text_file(input_file, output_file)

# Confirmation message shown once the cleaned file has been written
print(f"Cleaned file '{output_file}' created successfully.")


Cleaned file 'cleaned_file.txt' created successfully.


In [None]:
import csv
import re

# Function to process the input file and structure the dataset
def process_cve_to_csv(input_file, output_file):
    """Split a cleaned results file into a two-column CSV.

    Every line is stripped and classified:
      * a line that starts with "CWE-<digits>" contributes that ID to the
        'CWE ID' column;
      * any other non-empty line is treated as description text, unless it
        contains "Failed to generate content" or "CVE Description".

    The two columns are collected independently and paired by position only;
    the shorter one is padded with empty strings at the end.

    Args:
        input_file: Path of the cleaned text file to read.
        output_file: Path of the CSV file to (over)write.
    """
    cwe_pattern = re.compile(r"(CWE-\d+)")

    with open(input_file, 'r') as source:
        stripped_lines = [raw.strip() for raw in source]

    descriptions = []
    identifiers = []

    for text in stripped_lines:
        found = cwe_pattern.match(text)
        if found:
            # Line begins with a CWE identifier: keep only the ID itself
            identifiers.append(found.group(1))
            continue
        if not text:
            continue
        # Skip failure messages and the header line
        if "Failed to generate content" in text or "CVE Description" in text:
            continue
        descriptions.append(text)

    # Pad the shorter column so both have the same number of rows
    row_count = max(len(descriptions), len(identifiers))
    descriptions.extend([""] * (row_count - len(descriptions)))
    identifiers.extend([""] * (row_count - len(identifiers)))

    with open(output_file, 'w', newline='') as target:
        writer = csv.writer(target)
        writer.writerow(['CVE Description', 'CWE ID'])  # Header row
        writer.writerows(zip(descriptions, identifiers))

# Input and output file paths
input_file_path = 'cleaned_file.txt'  # Text file produced by the cleaning cell above
output_file_path = 'structured_cve_output.csv'  # Two-column CSV: 'CVE Description', 'CWE ID'

# Build the structured CSV from the cleaned text file
process_cve_to_csv(input_file_path, output_file_path)

# Last expression of the cell: displays the output path in the notebook
output_file_path


'structured_cve_output.csv'

In [None]:
pip install fuzzywuzzy[speedup]


Collecting fuzzywuzzy[speedup]
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-levenshtein>=0.12 (from fuzzywuzzy[speedup])
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Do

In [None]:
import pandas as pd
from fuzzywuzzy import process

def fuzzy_merge(file1, file2, output_file, threshold=80):
    """Fuzzy-match descriptions in ``file1`` to ``file2`` and write the joined rows.

    Both description columns are stripped and lower-cased. Each distinct
    description from ``file1`` is matched against all descriptions in
    ``file2`` with ``process.extractOne``, and matches scoring at least
    ``threshold`` are kept. The result is an inner join of ``file1``, the
    match table, and ``file2``, written to ``output_file`` as CSV.

    Args:
        file1: Path to a CSV file with a 'CVE Description' column.
        file2: Path to a tab-separated file with a 'Description' column.
        output_file: Path of the CSV file to (over)write.
        threshold: Minimum match score (inclusive) for a match to be kept.
    """
    # Read the datasets
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2, sep='\t')  # Assuming the second file is tab-separated

    # Rows without a description can never be matched. Dropping them up front
    # also keeps the `.str` accessor and the matcher from failing on NaN.
    df1 = df1.dropna(subset=['CVE Description']).copy()

    # Normalize columns
    df1['CVE Description'] = df1['CVE Description'].astype(str).str.strip().str.lower()
    df2['Description'] = df2['Description'].str.strip().str.lower()

    # Candidate strings to match against (missing descriptions excluded)
    choices = df2['Description'].dropna()

    # Match each distinct description once. Duplicate descriptions would
    # otherwise create duplicate match rows that multiply in the merge below.
    matches = []
    for desc in df1['CVE Description'].unique():
        match_data = process.extractOne(desc, choices)
        if match_data:
            match, score, _ = match_data  # Unpack match, score, and index
            if score >= threshold:
                matches.append({'CVE Description': desc, 'Description': match, 'Score': score})

    # Explicit columns keep the merges valid even when nothing matched
    match_df = pd.DataFrame(matches, columns=['CVE Description', 'Description', 'Score'])

    # Merge results with the original data
    merged_df = pd.merge(df1, match_df, on='CVE Description', how='inner')
    merged_df = pd.merge(merged_df, df2, on='Description', how='inner')

    # Write to a CSV
    merged_df.to_csv(output_file, index=False)
    print(f"Fuzzy matched data has been written to {output_file}")


# Paths

file1 = 'structured_cve_output.csv'  # Model predictions: 'CVE Description' and 'CWE ID' columns
file2 = '/content/drive/MyDrive/dataset/data/cti-rcm.tsv'  # Tab-separated dataset that was fed to process_cve_file
output_file = 'merged_output.csv'  # Destination CSV for the joined rows
# Run fuzzy matching merge (default threshold of 80)
fuzzy_merge(file1, file2, output_file)




Fuzzy matched data has been written to merged_output.csv


In [None]:
import pandas as pd

def compare_columns(file_path, column1, column2):
    """Return the percentage of rows where two string columns agree and disagree.

    The comparison is case-insensitive (both columns are upper-cased first).

    Args:
        file_path: Path of the CSV file to load.
        column1: Name of the first column to compare.
        column2: Name of the second column to compare.

    Returns:
        A ``(match_percentage, difference_percentage)`` tuple computed over
        all rows of the file.
    """
    frame = pd.read_csv(file_path)

    # Upper-case both sides so that "cwe-79" and "CWE-79" compare equal
    left = frame[column1].str.upper()
    right = frame[column2].str.upper()

    row_total = len(frame)
    equal_count = (left == right).sum()
    unequal_count = (left != right).sum()

    # Express both counts as a share of all rows
    return (equal_count / row_total) * 100, (unequal_count / row_total) * 100

def main():
    """Print the match and difference rates between 'CWE ID' and 'GT' in the merged CSV."""
    comparison_args = {
        "file_path": '/content/merged_output.csv',  # CSV file to evaluate
        "column1": 'CWE ID',  # The first column to compare
        "column2": 'GT',  # The second column to compare
    }

    # "taux" is French for "rate": both values are percentages of all rows
    match_taux, difference_taux = compare_columns(**comparison_args)

    print(f"Match Taux: {match_taux:.2f}%")
    print(f"Difference Taux: {difference_taux:.2f}%")


if __name__ == "__main__":
    main()


Match Taux: 27.21%
Difference Taux: 72.79%


In [None]:
import pandas as pd

def compute_rcm_accuracy(fname, pred_col):
    """Compute the accuracy of CWE predictions against the 'GT' column.

    Predictions and ground truth are compared as stripped, upper-cased
    strings. A prediction is valid when it starts with "CWE-". Invalid
    predictions are logged and excluded from both the numerator and the
    denominator, so the result is always between 0 and 100.

    Args:
        fname: Path of the CSV file to load.
        pred_col: Name of the column holding the model's predictions.

    Returns:
        Percentage of valid predictions that equal the ground truth, or 0
        when the file contains no valid prediction.

    Raises:
        KeyError: If ``pred_col`` or 'GT' is not a column of the file.
    """
    # Load the dataset
    df = pd.read_csv(fname)

    # Print the available columns to verify
    print("Available columns:", df.columns)

    # Ensure the required columns exist in the dataframe
    if pred_col not in df.columns:
        raise KeyError(f"Column '{pred_col}' not found in the dataset.")
    if 'GT' not in df.columns:
        raise KeyError("'GT' column not found in the dataset.")

    correct = 0
    total = 0
    for idx, row in df.iterrows():
        # Convert the prediction and ground truth to normalized uppercase strings
        pred = str(row[pred_col]).strip().upper()
        gt = str(row['GT']).strip().upper()

        # Handle invalid predictions
        if not pred.startswith('CWE-'):
            print(f"Invalid response at row {idx+1}: {pred}")  # Log invalid responses
            continue

        total += 1
        # Only valid predictions may count as correct. Otherwise two missing
        # values ("NAN" == "NAN") would add to `correct` but not to `total`.
        if pred == gt:
            correct += 1

    if total == 0:
        print("No valid predictions found to calculate accuracy.")
        return 0  # Prevent division by zero

    accuracy = (correct / total) * 100
    return accuracy

# Example usage
fname = '/content/merged_output.csv'  # Merged CSV holding both predictions and the 'GT' column
pred_col = 'CWE ID'  # Column with the model's predicted CWE IDs
accuracy = compute_rcm_accuracy(fname, pred_col)
# Report the accuracy as a percentage with two decimals
print(f"Accuracy: {accuracy:.2f}%")


Available columns: Index(['CVE Description', 'CWE ID', 'Description', 'Score', 'URL', 'Prompt',
       'GT'],
      dtype='object')
Invalid response at row 91: NAN
Invalid response at row 92: NAN
Invalid response at row 93: NAN
Invalid response at row 94: NAN
Invalid response at row 95: NAN
Invalid response at row 96: NAN
Invalid response at row 97: NAN
Invalid response at row 98: NAN
Invalid response at row 99: NAN
Invalid response at row 101: NAN
Invalid response at row 102: NAN
Invalid response at row 103: NAN
Invalid response at row 104: NAN
Invalid response at row 105: NAN
Invalid response at row 106: NAN
Invalid response at row 107: NAN
Invalid response at row 108: NAN
Invalid response at row 109: NAN
Invalid response at row 110: NAN
Invalid response at row 111: NAN
Invalid response at row 112: NAN
Invalid response at row 113: NAN
Invalid response at row 114: NAN
Invalid response at row 115: NAN
Invalid response at row 116: NAN
Invalid response at row 117: NAN
Invalid response at