In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive (Colab-only step;
# relies on the `drive` helper imported from google.colab in the previous cell).
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
import json
import logging
import os
import re
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
# Configure logging for the notebook: INFO level and above, each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Logger shared by the classes and functions defined in the cells below.
logger = logging.getLogger(__name__)

In [26]:
class VulnerabilityDatabase:
    """Base class for vulnerability database interactions.

    Stores the settings shared by every source (display name, base URL and
    HTTP headers) together with the records collected so far in ``self.data``.
    Subclasses implement ``parse_data`` and may override ``load_sample_data``.
    """

    # Browser-like User-Agent used when the caller supplies no headers.
    DEFAULT_USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    )

    def __init__(self, name, base_url, headers=None):
        """Create a handler for one vulnerability source.

        Args:
            name: Display name; also used for log messages and the default
                CSV filename.
            base_url: URL that ``fetch_data`` requests.
            headers: Optional mapping of HTTP headers. When omitted (or empty)
                a single browser-like ``User-Agent`` header is used.
        """
        self.name = name
        self.base_url = base_url
        # Always keep a private copy: subclasses update ``self.headers`` in
        # place, which must not leak into a dict owned by the caller.
        self.headers = dict(headers) if headers else {'User-Agent': self.DEFAULT_USER_AGENT}
        self.data = pd.DataFrame()

    def fetch_data(self, params=None, delay=1, timeout=10):
        """Issue a GET request against ``base_url``.

        Args:
            params: Optional query-string parameters.
            delay: Seconds to sleep before the request (simple rate limiting).
            timeout: Seconds passed to ``requests.get`` as its timeout.

        Returns:
            The ``requests`` response on success, or ``None`` when the request
            raised a ``RequestException`` (including HTTP error statuses); the
            error is logged rather than propagated.
        """
        try:
            time.sleep(delay)  # Rate limiting to avoid being blocked
            response = requests.get(self.base_url, headers=self.headers, params=params, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching data from {self.name}: {e}")
            return None

    def parse_data(self, response):
        """Parse a response into records; must be implemented by subclasses.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def save_data(self, filename=None):
        """Write ``self.data`` to a CSV file.

        Args:
            filename: Target path. Defaults to
                ``"<name lower-cased>_vulnerabilities.csv"``.

        Nothing is written when no records were collected; a warning is
        logged instead.
        """
        if filename is None:
            filename = f"{self.name.lower()}_vulnerabilities.csv"

        if not self.data.empty:
            self.data.to_csv(filename, index=False)
            logger.info(f"Saved {len(self.data)} records to {filename}")
        else:
            logger.warning(f"No data available to save for {self.name}")

    def load_sample_data(self):
        """Return fallback sample records for when the source is unavailable.

        The base implementation only logs and returns an empty list;
        subclasses extend it with their own records.
        """
        logger.info(f"Loading sample data for {self.name}")
        return []

In [27]:
class USNVD(VulnerabilityDatabase):
    """US National Vulnerability Database handler (NVD CVE API 2.0)."""

    def __init__(self):
        """Point the handler at the NVD CVE 2.0 REST endpoint."""
        super().__init__("US-NVD", "https://services.nvd.nist.gov/rest/json/cves/2.0")

    def fetch_vulnerabilities(self, start_index=0, results_per_page=2000):
        """Fetch one page of vulnerabilities from the US-NVD API.

        Args:
            start_index: Value sent as the ``startIndex`` query parameter.
            results_per_page: Value sent as ``resultsPerPage``.

        Returns:
            Number of records appended to ``self.data`` by this call. When the
            request fails, the built-in sample records are used instead, but
            only those whose ``cve_id`` is not already in ``self.data``, so
            several failing pages do not append the same sample rows again.
        """
        params = {
            'startIndex': start_index,
            'resultsPerPage': results_per_page
        }
        response = self.fetch_data(params)
        if response is not None:
            return self.parse_data(response)

        # API unavailable: fall back to sample rows, skipping ones already loaded.
        known_ids = set(self.data['cve_id']) if 'cve_id' in self.data.columns else set()
        fresh = [rec for rec in self.load_sample_data() if rec.get('cve_id') not in known_ids]
        if not fresh:
            return 0
        self.data = pd.concat([self.data, pd.DataFrame(fresh)], ignore_index=True)
        return len(fresh)

    @staticmethod
    def _english_description(cve_item):
        """Return the first description whose ``lang`` is ``'en'``, else ``""``."""
        for desc in cve_item.get('descriptions', []):
            if desc.get('lang') == 'en':
                return desc.get('value', '')
        return ""

    @staticmethod
    def _cvss_v3_data(metrics):
        """Return ``cvssData`` of the first CVSS v3.1 metric, else v3.0, else ``{}``."""
        for key in ('cvssMetricV31', 'cvssMetricV30'):
            entries = metrics.get(key)
            if entries:
                return entries[0].get('cvssData', {})
        return {}

    def parse_data(self, response):
        """Parse a US-NVD API response and append its records to ``self.data``.

        Args:
            response: Object exposing ``.json()`` that returns the API payload
                (a dict with a ``vulnerabilities`` list).

        Returns:
            Number of records parsed, or ``0`` if parsing raised; the error is
            logged and ``self.data`` is left unchanged in that case.
        """
        try:
            payload = response.json()
            vulnerabilities = []

            for entry in payload.get('vulnerabilities', []):
                cve_item = entry.get('cve', {})
                cvss_v3 = self._cvss_v3_data(cve_item.get('metrics', {}))

                vulnerabilities.append({
                    'cve_id': cve_item.get('id'),
                    'description': self._english_description(cve_item),
                    'published_date': cve_item.get('published', ''),
                    'last_modified': cve_item.get('lastModified', ''),
                    'cvss_v3_score': cvss_v3.get('baseScore', None),
                    'cvss_v3_severity': cvss_v3.get('baseSeverity', ''),
                    'source': 'US-NVD'
                })

            new_data = pd.DataFrame(vulnerabilities)
            self.data = pd.concat([self.data, new_data], ignore_index=True)
            return len(vulnerabilities)
        except Exception as e:
            logger.error(f"Error parsing US-NVD data: {e}")
            return 0

    def load_sample_data(self):
        """Return three hard-coded sample NVD records for use when the API fails."""
        super().load_sample_data()

        # Create sample data with realistic fields
        return [
            {
                'cve_id': 'CVE-2023-1234',
                'description': 'Buffer overflow vulnerability in XYZ software allowing remote code execution',
                'published_date': '2023-08-15T14:30:00.000',
                'last_modified': '2023-08-16T09:15:00.000',
                'cvss_v3_score': 8.5,
                'cvss_v3_severity': 'HIGH',
                'source': 'US-NVD'
            },
            {
                'cve_id': 'CVE-2023-5678',
                'description': 'SQL injection vulnerability in ABC application leading to unauthorized data access',
                'published_date': '2023-07-20T10:45:00.000',
                'last_modified': '2023-07-22T16:30:00.000',
                'cvss_v3_score': 7.2,
                'cvss_v3_severity': 'HIGH',
                'source': 'US-NVD'
            },
            {
                'cve_id': 'CVE-2023-9101',
                'description': 'Cross-site scripting vulnerability in DEF web interface',
                'published_date': '2023-09-05T08:20:00.000',
                'last_modified': '2023-09-05T15:10:00.000',
                'cvss_v3_score': 5.4,
                'cvss_v3_severity': 'MEDIUM',
                'source': 'US-NVD'
            }
        ]

In [28]:
class CNNVD(VulnerabilityDatabase):
    """Chinese National Vulnerability Database handler"""

    def __init__(self):
        """Point the handler at the CNNVD search page and add browser-like headers."""
        # Updated URL to use the public access point
        super().__init__("CNNVD", "https://www.cnnvd.org.cn/home/vulSearch")
        # Add additional headers that might be required
        self.headers.update({
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Language': 'en-US,en;q=0.9',
            'Referer': 'https://www.cnnvd.org.cn/home/vulSearch'
        })

    def fetch_vulnerabilities(self, page=1, size=100):
        """Load CNNVD records into ``self.data``.

        No network request is made: this always uses the built-in sample
        records. ``page`` and ``size`` are accepted for interface
        compatibility but are currently unused.

        Returns:
            Number of sample records appended by this call. Records whose
            ``cnnvd_id`` is already present are skipped, so requesting several
            "pages" does not duplicate the sample rows.
        """
        logger.info("Using sample data for CNNVD due to authentication requirements")
        known_ids = set(self.data['cnnvd_id']) if 'cnnvd_id' in self.data.columns else set()
        fresh = [rec for rec in self.load_sample_data() if rec.get('cnnvd_id') not in known_ids]
        if not fresh:
            return 0
        self.data = pd.concat([self.data, pd.DataFrame(fresh)], ignore_index=True)
        return len(fresh)

    @staticmethod
    def _selected_text(item, selector):
        """Return the stripped text of the first match for ``selector``, or ``None``."""
        node = item.select_one(selector)
        return node.text.strip() if node is not None else None

    def parse_data(self, response):
        """Parse a CNNVD HTML response with BeautifulSoup and append the records.

        Args:
            response: Object exposing the HTML document as ``.text``.

        Returns:
            Number of records parsed, or ``0`` if parsing raised (the error is
            logged).
        """
        try:
            soup = BeautifulSoup(response.text, 'html.parser')

            # This is a simplified version - actual implementation would need to match the website structure
            vulnerabilities = [
                {
                    'cnnvd_id': self._selected_text(item, '.cnnvd-id'),
                    'cve_id': self._selected_text(item, '.cve-id'),
                    'title': self._selected_text(item, '.title'),
                    'source': 'CNNVD'
                }
                for item in soup.select('.vulnerability-item')
            ]

            new_data = pd.DataFrame(vulnerabilities)
            self.data = pd.concat([self.data, new_data], ignore_index=True)
            return len(vulnerabilities)
        except Exception as e:
            logger.error(f"Error parsing CNNVD data: {e}")
            return 0

    def load_sample_data(self):
        """Return three hard-coded sample CNNVD records."""
        super().load_sample_data()

        # Create sample data with realistic fields
        return [
            {
                'cnnvd_id': 'CNNVD-202308-123',
                'cve_id': 'CVE-2023-1234',
                'title': 'XYZ软件缓冲区溢出漏洞',
                'description': 'XYZ软件存在缓冲区溢出漏洞，攻击者可以利用该漏洞执行远程代码',
                'source': 'CNNVD'
            },
            {
                'cnnvd_id': 'CNNVD-202307-456',
                'cve_id': 'CVE-2023-5678',
                'title': 'ABC应用程序SQL注入漏洞',
                'description': 'ABC应用程序存在SQL注入漏洞，可能导致未授权数据访问',
                'source': 'CNNVD'
            },
            {
                'cnnvd_id': 'CNNVD-202309-789',
                'cve_id': 'CVE-2023-9012',
                'title': 'GHI系统权限提升漏洞',
                'description': 'GHI系统中存在权限提升漏洞，本地用户可以获取系统管理员权限',
                'source': 'CNNVD'
            }
        ]


In [30]:
class JVN(VulnerabilityDatabase):
    """Japanese Vulnerability Notes handler"""

    # Identifier patterns searched for in an item's description text. Matching
    # the whole identifier keeps the "JVNDB-" prefix and excludes trailing
    # punctuation such as "," or ")".
    _JVNDB_ID_RE = re.compile(r'JVNDB-\d{4}-\d+')
    _CVE_ID_RE = re.compile(r'CVE-\d{4}-\d{4,}')

    def __init__(self):
        """Point the handler at the JVN RSS/RDF feed."""
        # Updated URL to point to the correct JVN feed
        super().__init__("JVN", "https://jvn.jp/en/rss/jvn.rdf")

    def fetch_vulnerabilities(self):
        """Load JVN records into ``self.data``.

        No network request is made: this always uses the built-in sample
        records.

        Returns:
            Number of sample records appended by this call. Records whose
            ``jvn_id`` is already present are skipped, so calling this twice
            does not duplicate rows.
        """
        logger.info("Using sample data for JVN - skipping fetch attempt")
        known_ids = set(self.data['jvn_id']) if 'jvn_id' in self.data.columns else set()
        fresh = [rec for rec in self.load_sample_data() if rec.get('jvn_id') not in known_ids]
        if not fresh:
            return 0
        self.data = pd.concat([self.data, pd.DataFrame(fresh)], ignore_index=True)
        return len(fresh)

    @staticmethod
    def _tag_text(item, tag_name):
        """Return the text of the first ``tag_name`` child of ``item``, or ``None``."""
        tag = item.find(tag_name)
        return tag.text if tag is not None else None

    @classmethod
    def _first_match(cls, pattern, text):
        """Return the first match of ``pattern`` in ``text``, or ``None``."""
        if not text:
            return None
        found = pattern.search(text)
        return found.group(0) if found else None

    def parse_data(self, response):
        """Parse a JVN RSS feed and append one record per ``<item>``.

        Args:
            response: Object exposing the feed XML as ``.text``.

        Returns:
            Number of items parsed, or ``0`` if parsing raised (the error is
            logged).

        ``jvn_id`` and ``cve_id`` are the first full ``JVNDB-YYYY-N`` and
        ``CVE-YYYY-N`` identifiers found in the item description, prefix
        included, or ``None`` when absent.
        """
        try:
            soup = BeautifulSoup(response.text, 'lxml-xml')
            vulnerabilities = []

            for item in soup.find_all('item'):
                description = self._tag_text(item, 'description')

                vulnerabilities.append({
                    'jvn_id': self._first_match(self._JVNDB_ID_RE, description),
                    'cve_id': self._first_match(self._CVE_ID_RE, description),
                    'title': self._tag_text(item, 'title'),
                    'description': description,
                    'published_date': self._tag_text(item, 'dc:date'),
                    'link': self._tag_text(item, 'link'),
                    'source': 'JVN'
                })

            new_data = pd.DataFrame(vulnerabilities)
            self.data = pd.concat([self.data, new_data], ignore_index=True)
            return len(vulnerabilities)
        except Exception as e:
            logger.error(f"Error parsing JVN data: {e}")
            return 0

    def load_sample_data(self):
        """Return three hard-coded sample JVN records."""
        super().load_sample_data()

        # Create sample data with realistic fields
        return [
            {
                'jvn_id': 'JVNDB-2023-123456',
                'cve_id': 'CVE-2023-1234',
                'title': 'Buffer overflow vulnerability in XYZ software',
                'description': 'A buffer overflow vulnerability exists in XYZ software that could allow an attacker to execute arbitrary code.',
                'published_date': '2023-08-15T14:30:00+09:00',
                'link': 'https://jvndb.jvn.jp/en/contents/2023/JVNDB-2023-123456.html',
                'source': 'JVN'
            },
            {
                'jvn_id': 'JVNDB-2023-654321',
                'cve_id': 'CVE-2023-5678',
                'title': 'SQL injection vulnerability in ABC application',
                'description': 'ABC application contains an SQL injection vulnerability that may lead to unauthorized data access.',
                'published_date': '2023-07-20T10:45:00+09:00',
                'link': 'https://jvndb.jvn.jp/en/contents/2023/JVNDB-2023-654321.html',
                'source': 'JVN'
            },
            {
                'jvn_id': 'JVNDB-2023-987654',
                'cve_id': 'CVE-2023-9101',
                'title': 'Cross-site scripting vulnerability in DEF web interface',
                'description': 'DEF web interface contains a cross-site scripting vulnerability that may allow attackers to inject malicious scripts.',
                'published_date': '2023-09-05T08:20:00+09:00',
                'link': 'https://jvndb.jvn.jp/en/contents/2023/JVNDB-2023-987654.html',
                'source': 'JVN'
            }
        ]

In [31]:
class CorrelationEngine:
    """Correlate vulnerabilities across databases.

    Databases are registered by name; their ``data`` frames are stacked into
    ``combined_data``, which the correlation methods operate on.
    """

    def __init__(self):
        """Start with no registered databases and an empty combined frame."""
        self.databases = {}
        self.combined_data = pd.DataFrame()

    def add_database(self, database):
        """Register ``database`` under its ``name`` (replacing any same-named entry)."""
        self.databases[database.name] = database

    def combine_data(self):
        """Rebuild ``combined_data`` by stacking every registered database's ``data``.

        With no databases registered the result is an empty DataFrame
        (``pd.concat`` would otherwise raise ``ValueError`` on an empty list).
        """
        frames = [db.data for db in self.databases.values()]
        self.combined_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    def correlate_by_cve_id(self):
        """Find CVE IDs reported by more than one source.

        Returns:
            DataFrame with one row per shared CVE ID and the columns
            ``cve_id``, ``sources`` (comma-separated), ``count`` (rows with
            that ID) and ``sources_count``. Empty (no columns) when nothing
            is shared or there is no ``cve_id`` column.
        """
        if self.combined_data.empty:
            self.combine_data()

        correlations = []

        # Only process if there are CVE IDs in the data
        if 'cve_id' in self.combined_data.columns:
            # Rows without a CVE ID cannot be matched across sources.
            valid_data = self.combined_data.dropna(subset=['cve_id'])

            for cve_id, group in valid_data.groupby('cve_id'):
                sources = group['source'].unique()
                if cve_id and len(sources) > 1:
                    correlations.append({
                        'cve_id': cve_id,
                        'sources': ', '.join(sources),
                        'count': len(group),
                        'sources_count': len(sources)
                    })

        return pd.DataFrame(correlations)

    def find_text_similarity(self, min_similarity=0.7):
        """Find cross-source record pairs whose descriptions are similar.

        Descriptions are vectorised with TF-IDF (English stop words removed)
        and compared by cosine similarity.

        Args:
            min_similarity: Minimum cosine similarity for a pair to be kept.

        Returns:
            DataFrame with the columns ``id1``, ``id2``, ``source1``,
            ``source2`` and ``similarity``; one row per pair from different
            sources at or above the threshold. Empty when there is no
            ``description`` column, fewer than two described records, no
            qualifying pair, or an error occurred (which is logged).
        """
        if self.combined_data.empty:
            self.combine_data()

        # Clean data - remove rows without descriptions
        if 'description' not in self.combined_data.columns:
            logger.warning("No text similarities found - description column missing")
            return pd.DataFrame()  # Return empty dataframe if no descriptions available

        data = self.combined_data.dropna(subset=['description']).reset_index(drop=True)

        if len(data) < 2:
            logger.warning("No text similarities found - insufficient data for comparison")
            return pd.DataFrame()  # Need at least 2 items for comparison

        try:
            vectorizer = TfidfVectorizer(stop_words='english')
            tfidf_matrix = vectorizer.fit_transform(data['description'])
            cosine_sim = cosine_similarity(tfidf_matrix)

            sources = data['source'].to_numpy()
            ids = data['cve_id'].to_numpy() if 'cve_id' in data.columns else None

            # Select qualifying pairs with array operations instead of a
            # Python-level double loop: keep the strict upper triangle
            # (each unordered pair once, never a record with itself) and
            # require the two records to come from different sources.
            from_different_sources = sources[:, None] != sources[None, :]
            selected = np.triu(cosine_sim >= min_similarity, k=1) & from_different_sources
            # np.nonzero walks row by row, i.e. the same (i, j) order as
            # nested ``for i`` / ``for j > i`` loops.
            first_idx, second_idx = np.nonzero(selected)

            similar_pairs = [
                {
                    'id1': ids[i] if ids is not None else 'Unknown',
                    'id2': ids[j] if ids is not None else 'Unknown',
                    'source1': sources[i],
                    'source2': sources[j],
                    'similarity': cosine_sim[i, j]
                }
                for i, j in zip(first_idx, second_idx)
            ]

            if not similar_pairs:
                logger.warning("No text similarities above threshold found")

            return pd.DataFrame(similar_pairs)
        except Exception as e:
            logger.error(f"Error in text similarity analysis: {e}")
            return pd.DataFrame()

In [32]:
class StatisticalAnalysis:
    """Statistical analysis of vulnerability data.

    Wraps a DataFrame (expected to carry a ``source`` column for the
    per-source methods). None of the methods modify the wrapped frame.
    """

    def __init__(self, data):
        """Keep a reference to ``data``, the DataFrame to analyse."""
        self.data = data

    def descriptive_statistics(self):
        """Summarise the numeric columns.

        Returns:
            ``DataFrame.describe()`` of the numeric columns, or an explanatory
            string when there are none or an error occurred.
        """
        try:
            numeric_data = self.data.select_dtypes(include=[np.number])
            if not numeric_data.empty:
                return numeric_data.describe()
            else:
                return "No numeric data available for analysis"
        except Exception as e:
            logger.error(f"Error in descriptive statistics: {e}")
            return "Error generating descriptive statistics"

    def analyze_by_source(self, field):
        """Compare ``field`` across sources.

        Returns:
            DataFrame indexed by ``source`` with ``count``, ``mean``, ``std``,
            ``min`` and ``max`` of ``field``; or an explanatory string when
            the field is missing or aggregation failed.
        """
        if field not in self.data.columns:
            return f"Field '{field}' not found in data"

        try:
            return self.data.groupby('source')[field].agg(['count', 'mean', 'std', 'min', 'max'])
        except Exception as e:
            logger.error(f"Error in source analysis: {e}")
            return f"Error analyzing {field} by source"

    def perform_pca(self, numeric_fields=None):
        """Run PCA (at most two components) on standardised numeric fields.

        Args:
            numeric_fields: Column names to use; defaults to every numeric
                column. Names not present in the data are ignored.

        Returns:
            ``(pca_df, explained_variance_ratio)`` where ``pca_df`` has the
            columns ``PC1`` (and ``PC2`` when two components exist) plus
            ``source``; or ``(None, None)`` when there is nothing usable or
            an error occurred.
        """
        if numeric_fields is None:
            # Use all numeric fields if none specified
            numeric_fields = self.data.select_dtypes(include=[np.number]).columns.tolist()

        # Filter to only include numeric fields that actually exist in the data
        valid_fields = [f for f in numeric_fields if f in self.data.columns]

        if not valid_fields:
            logger.warning("No valid numeric fields for PCA")
            return None, None

        try:
            data_subset = self.data[valid_fields].dropna()

            if len(data_subset) < 2:
                logger.warning("Not enough data for PCA")
                return None, None

            # A constant column has zero standard deviation; dividing by it
            # would fill the column with NaN and make PCA fail, so such
            # columns are left out.
            spread = data_subset.std()
            varying_fields = spread[spread > 0].index.tolist()
            if not varying_fields:
                logger.warning("Not enough data for PCA")
                return None, None
            data_subset = data_subset[varying_fields]

            # Standardize the data
            data_scaled = (data_subset - data_subset.mean()) / spread[varying_fields]

            # Apply PCA
            n_components = min(2, len(varying_fields))
            pca = PCA(n_components=n_components)
            principal_components = pca.fit_transform(data_scaled)

            # Create DataFrame with principal components
            pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'][:n_components])
            # dropna() kept the original index labels, so they select the matching sources.
            pca_df['source'] = self.data.loc[data_subset.index, 'source'].values

            return pca_df, pca.explained_variance_ratio_
        except Exception as e:
            logger.error(f"Error in PCA: {e}")
            return None, None

    def plot_pca_results(self, pca_df):
        """Scatter-plot PCA results coloured by source and save them as a PNG.

        Args:
            pca_df: Frame from ``perform_pca`` (``PC1``, optional ``PC2``,
                ``source``), or ``None``.

        Returns:
            ``'pca_analysis.png'`` on success, ``None`` when there is no data
            or plotting failed. The figure is always closed, even on error.
        """
        if pca_df is None:
            logger.warning("No PCA data available for plotting")
            return None

        fig = None
        try:
            fig, ax = plt.subplots(figsize=(10, 8))
            has_second_component = 'PC2' in pca_df.columns

            for source in pca_df['source'].unique():
                subset = pca_df[pca_df['source'] == source]
                # With a single component, draw the points along y = 0.
                y_values = subset['PC2'] if has_second_component else [0] * len(subset)
                ax.scatter(subset['PC1'], y_values, label=source, alpha=0.7)

            if has_second_component:
                ax.set_ylabel('Principal Component 2')

            ax.set_title('PCA of Vulnerability Data by Source')
            ax.set_xlabel('Principal Component 1')
            ax.legend()
            ax.grid(True, linestyle='--', alpha=0.7)

            fig.tight_layout()
            fig.savefig('pca_analysis.png')

            return 'pca_analysis.png'
        except Exception as e:
            logger.error(f"Error plotting PCA results: {e}")
            return None
        finally:
            if fig is not None:
                plt.close(fig)

    def time_series_analysis(self, date_column):
        """Count records per month and source.

        Args:
            date_column: Name of the column holding ISO 8601 timestamps.

        Returns:
            DataFrame indexed by month end (UTC) with one column per source
            holding record counts; or an explanatory string when the column
            is missing, no date could be parsed, or an error occurred.

        Timestamps are parsed element-wise as ISO 8601 and normalised to UTC,
        so values with and without a UTC offset (or fractional seconds) can
        be mixed; timestamps without an offset are interpreted as UTC.
        Unparseable values are dropped. Parsing works on a copy, so
        ``self.data`` keeps its original column values.
        """
        if date_column not in self.data.columns:
            return f"Date column '{date_column}' not found in data"

        try:
            parsed_dates = pd.to_datetime(
                self.data[date_column], errors='coerce', utc=True, format='ISO8601'
            )

            # Work on a copy so the caller's frame is not modified, then
            # drop rows with invalid dates.
            valid_data = self.data.assign(**{date_column: parsed_dates}).dropna(subset=[date_column])

            if valid_data.empty:
                return "No valid date data available for time series analysis"

            # Group by month and source
            monthly_data = valid_data.set_index(date_column).groupby([
                pd.Grouper(freq='ME'), 'source'  # Using 'ME' (month end) instead of deprecated 'M'
            ]).size().unstack(fill_value=0)

            return monthly_data
        except Exception as e:
            logger.error(f"Error in time series analysis: {e}")
            return f"Error performing time series analysis on {date_column}"


In [33]:
class VulnResearchFramework:
    """Top-level coordinator: collects records from every source, runs the
    analyses, renders the Markdown report and saves the CSV files."""

    def __init__(self):
        """Create one handler per source plus the correlation engine."""
        self.nvd = USNVD()
        self.cnnvd = CNNVD()
        self.jvn = JVN()
        self.correlation_engine = CorrelationEngine()

    def collect_all_data(self, nvd_pages=1, cnnvd_pages=1):
        """Fetch from US-NVD, CNNVD and JVN, then build the combined frame.

        Args:
            nvd_pages: Number of US-NVD pages to request (start index advances
                by 2000 per page).
            cnnvd_pages: Number of CNNVD pages to request (1-based).
        """
        logger.info("Starting data collection...")

        # US-NVD pages are addressed by start offset.
        for page_idx in range(nvd_pages):
            fetched = self.nvd.fetch_vulnerabilities(start_index=page_idx * 2000)
            logger.info(f"Collected {fetched} vulnerabilities from US-NVD (page {page_idx + 1})")

        # CNNVD pages are addressed by 1-based page number.
        for page_no in range(1, cnnvd_pages + 1):
            fetched = self.cnnvd.fetch_vulnerabilities(page=page_no)
            logger.info(f"Collected {fetched} vulnerabilities from CNNVD (page {page_no})")

        fetched = self.jvn.fetch_vulnerabilities()
        logger.info(f"Collected {fetched} vulnerabilities from JVN")

        # Register every handler with the engine, then stack their data.
        engine = self.correlation_engine
        for handler in (self.nvd, self.cnnvd, self.jvn):
            engine.add_database(handler)

        engine.combine_data()
        logger.info(f"Combined data: {len(engine.combined_data)} entries")

    def analyze_data(self):
        """Run every analysis on the combined data.

        Returns:
            Dict with ``combined_data``, ``cve_correlations`` and
            ``text_similarities``; plus ``cvss_stats`` (and ``pca_df`` /
            ``variance_ratio`` when PCA produced a result) if a
            ``cvss_v3_score`` column exists, and ``time_series`` if a
            ``published_date`` column exists.
        """
        logger.info("Starting data analysis...")

        engine = self.correlation_engine
        # Make sure we have combined data
        if engine.combined_data.empty:
            engine.combine_data()

        results = {'combined_data': engine.combined_data}

        logger.info("Finding CVE ID correlations...")
        shared_cves = engine.correlate_by_cve_id()
        results['cve_correlations'] = shared_cves
        logger.info(f"Found {len(shared_cves)} CVE correlations")

        logger.info("Finding text similarities...")
        similar_texts = engine.find_text_similarity()
        results['text_similarities'] = similar_texts
        logger.info(f"Found {len(similar_texts)} similar vulnerability descriptions")

        # CVSS statistics (and PCA) need the score column.
        if 'cvss_v3_score' in engine.combined_data.columns:
            logger.info("Performing statistical analysis on CVSS scores...")
            analyzer = StatisticalAnalysis(engine.combined_data)
            results['cvss_stats'] = analyzer.analyze_by_source('cvss_v3_score')

            # PCA is only attempted with more than three records.
            if len(engine.combined_data) > 3:
                logger.info("Performing PCA analysis...")
                components, explained = analyzer.perform_pca(['cvss_v3_score'])
                if components is not None:
                    results['pca_df'] = components
                    results['variance_ratio'] = explained
                    analyzer.plot_pca_results(components)

        if 'published_date' in engine.combined_data.columns:
            logger.info("Performing time series analysis...")
            analyzer = StatisticalAnalysis(engine.combined_data)
            results['time_series'] = analyzer.time_series_analysis('published_date')

        return results

    def generate_report(self, analysis_results):
        """Render the findings as Markdown, write them to
        ``vulnerability_analysis_report.md`` and return the text.

        Args:
            analysis_results: Dict as produced by ``analyze_data``; missing
                optional entries simply omit their report section.
        """
        all_records = analysis_results.get('combined_data', pd.DataFrame())
        shared_cves = analysis_results.get('cve_correlations', pd.DataFrame())
        similar_texts = analysis_results.get('text_similarities', pd.DataFrame())

        lines = [
            "# Vulnerability Database Cross-Reference Analysis Report",
            "\n## Data Collection Summary",
            f"- Total vulnerabilities collected: {len(all_records)}",
            f"- US-NVD vulnerabilities: {len(self.nvd.data)}",
            f"- CNNVD vulnerabilities: {len(self.cnnvd.data)}",
            f"- JVN vulnerabilities: {len(self.jvn.data)}",
            "\n## Direct Correlations",
            f"- Total CVE correlations found: {len(shared_cves)}",
        ]

        # Breakdown by number of sources, when that column is available.
        if not shared_cves.empty and 'sources_count' in shared_cves.columns:
            source_counts = shared_cves['sources_count']
            in_all_three = int((source_counts == 3).sum())
            in_two = int((source_counts == 2).sum())
            lines.append(f"- Vulnerabilities present in all three databases: {in_all_three}")
            lines.append(f"- Vulnerabilities present in two databases: {in_two}")

        lines.append("\n## Text Similarity Analysis")
        lines.append(f"- Potential matches based on description similarity: {len(similar_texts)}")
        if not similar_texts.empty and 'similarity' in similar_texts.columns:
            scores = similar_texts['similarity']
            lines.append(f"- Average similarity score: {scores.mean():.2f}")
            lines.append(f"- Highest similarity score: {scores.max():.2f}")

            # List the five most similar pairs.
            lines.append("\n### Top Similar Vulnerabilities")
            best = similar_texts.sort_values('similarity', ascending=False).head(5)
            pair_columns = best[['id1', 'source1', 'id2', 'source2', 'similarity']]
            for id1, source1, id2, source2, score in pair_columns.itertuples(index=False, name=None):
                lines.append(f"- {id1} ({source1}) and {id2} ({source2}): {score:.2f} similarity")

        lines.append("\n## Coverage Analysis")

        def cve_ids_of(handler):
            # Unique, non-missing CVE IDs of one source (empty if it has no such column).
            frame = handler.data
            if 'cve_id' not in frame.columns:
                return set()
            return set(frame['cve_id'].dropna())

        nvd_ids = cve_ids_of(self.nvd)
        cnnvd_ids = cve_ids_of(self.cnnvd)
        jvn_ids = cve_ids_of(self.jvn)

        lines.append(f"- Total unique CVEs: {len(nvd_ids | cnnvd_ids | jvn_ids)}")
        lines.append(f"- Unique to US-NVD: {len(nvd_ids - cnnvd_ids - jvn_ids)}")
        lines.append(f"- Unique to CNNVD: {len(cnnvd_ids - nvd_ids - jvn_ids)}")
        lines.append(f"- Unique to JVN: {len(jvn_ids - nvd_ids - cnnvd_ids)}")

        if 'cvss_stats' in analysis_results:
            cvss_table = analysis_results['cvss_stats']
            lines.append("\n## CVSS Score Analysis")
            lines.append("### CVSS Scores by Source")
            lines.append(f"```\n{cvss_table}\n```")

        if 'time_series' in analysis_results:
            monthly = analysis_results['time_series']
            lines.append("\n## Temporal Analysis")
            lines.append("- Temporal distribution analysis performed")
            # The analysis may have returned an error string instead of a frame.
            if isinstance(monthly, pd.DataFrame) and not monthly.empty:
                first_month = monthly.index.min().strftime('%Y-%m-%d')
                last_month = monthly.index.max().strftime('%Y-%m-%d')
                lines.append(f"- Data spans from {first_month} to {last_month}")
                peak_month = monthly.sum(axis=1).idxmax().strftime('%Y-%m')
                lines.append(f"- Peak month for vulnerabilities: {peak_month}")

        if 'pca_df' in analysis_results and 'variance_ratio' in analysis_results:
            explained = ', '.join(f'{share:.2%}' for share in analysis_results['variance_ratio'])
            lines.append("\n## Principal Component Analysis")
            lines.append("- PCA performed on numeric vulnerability features")
            lines.append(f"- Explained variance: {explained}")
            lines.append("- Visualization saved as: pca_analysis.png")

        lines.append("\n## Report Generated")
        lines.append(f"- Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        report_text = '\n'.join(lines)
        with open('vulnerability_analysis_report.md', 'w') as f:
            f.write(report_text)

        return report_text

    def save_all_data(self):
        """Write each source's records, and the combined records, to CSV files."""
        logger.info("Saving all collected data to files...")
        for handler in (self.nvd, self.cnnvd, self.jvn):
            handler.save_data()

        combined = self.correlation_engine.combined_data
        if not combined.empty:
            combined.to_csv('combined_vulnerabilities.csv', index=False)
            logger.info(f"Saved {len(combined)} records to combined_vulnerabilities.csv")

# Main execution logic
def main():
    """Run the whole pipeline once: collect, analyze, report, save and print a summary.

    Any exception is logged with its traceback and reported on stdout instead
    of being propagated.
    """
    logger.info("Starting Vulnerability Database Cross-Reference Framework")

    try:
        framework = VulnResearchFramework()

        # One page per paged source keeps the demonstration run short.
        framework.collect_all_data(nvd_pages=1, cnnvd_pages=1)

        analysis_results = framework.analyze_data()

        # The report text is also written to disk by generate_report.
        framework.generate_report(analysis_results)
        logger.info("Analysis complete. Report generated: vulnerability_analysis_report.md")

        framework.save_all_data()

        banner = "=" * 80
        print(banner)
        print("ANALYSIS SUMMARY")
        print(banner)
        summary_items = (
            ("Total vulnerabilities analyzed", 'combined_data'),
            ("CVE correlations found", 'cve_correlations'),
            ("Similar vulnerabilities identified", 'text_similarities'),
        )
        for label, key in summary_items:
            print(f"{label}: {len(analysis_results[key])}")
        print(banner)
        print("See vulnerability_analysis_report.md for full details")

    except Exception as e:
        # exc_info=True records the full stack trace in the log.
        logger.error(f"Error in main execution: {e}", exc_info=True)
        print(f"An error occurred: {e}")
        print("Check the log for details")

# Entry point: run the full pipeline when this code is executed directly
# (as opposed to being imported as a module).
if __name__ == "__main__":
    main()



ANALYSIS SUMMARY
Total vulnerabilities analyzed: 2006
CVE correlations found: 2
Similar vulnerabilities identified: 0
See vulnerability_analysis_report.md for full details
