In [61]:
import requests
import json

# Direct API keys (replace these with your actual keys)
ABUSEIPDB_API_KEY = '7293c9adfec5ae491651890e2ec3ddc43d4f8f3474051fc6c20a442aa27237a1ae53814a1446f145'  # Replace with your actual AbuseIPDB API key
VIRUSTOTAL_API_KEY = 'dd4f9d70190ae332a2749b2ef0413d9b15c6300ff711fc7c124cb6d417ff57dc'  # Replace with your actual VirusTotal API key

# Function to fetch data from AbuseIPDB
def fetch_abuseipdb_data(ip):
    url = f'https://api.abuseipdb.com/api/v2/check'
    headers = {
        'Key': ABUSEIPDB_API_KEY,
        'Accept': 'application/json'
    }
    params = {
        'ipAddress': ip,
        'maxAgeInDays': '90'  # Limit to data from the last 90 days
    }

    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()  # Check if the request was successful
        data = response.json()  # Parse JSON response
        return data
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from AbuseIPDB: {e}")
        return None

# Function to fetch data from VirusTotal
def fetch_virustotal_data(ip):
    url = f"https://www.virustotal.com/api/v3/ip_addresses/{ip}"
    headers = {
        'x-apikey': VIRUSTOTAL_API_KEY
    }

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Check if the request was successful
        data = response.json()  # Parse JSON response
        return data
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from VirusTotal: {e}")
        return None

# Example of fetching data for a specific IP address (can be done daily or on a schedule)
def fetch_and_process_data(ip):
    abuseipdb_data = fetch_abuseipdb_data(ip)
    virustotal_data = fetch_virustotal_data(ip)

    # Let's check the fetched data and process it (just printing for now)
    if abuseipdb_data:
        print("AbuseIPDB Data: ", json.dumps(abuseipdb_data, indent=4))

    if virustotal_data:
        print("VirusTotal Data: ", json.dumps(virustotal_data, indent=4))

# Example to fetch and process data for a specific IP
fetch_and_process_data('8.8.8.8')  # Example IP address (can be set dynamically or in a loop)


AbuseIPDB Data:  {
    "data": {
        "ipAddress": "8.8.8.8",
        "isPublic": true,
        "ipVersion": 4,
        "isWhitelisted": true,
        "abuseConfidenceScore": 0,
        "countryCode": "US",
        "usageType": "Content Delivery Network",
        "isp": "Google LLC",
        "domain": "google.com",
        "hostnames": [
            "dns.google"
        ],
        "isTor": false,
        "totalReports": 205,
        "numDistinctUsers": 54,
        "lastReportedAt": "2025-03-28T09:03:13+00:00"
    }
}
VirusTotal Data:  {
    "data": {
        "id": "8.8.8.8",
        "type": "ip_address",
        "links": {
            "self": "https://www.virustotal.com/api/v3/ip_addresses/8.8.8.8"
        },
        "attributes": {
            "country": "US",
            "whois": "NetRange: 8.8.8.0 - 8.8.8.255\nCIDR: 8.8.8.0/24\nNetName: GOGL\nNetHandle: NET-8-8-8-0-2\nParent: NET8 (NET-8-0-0-0-0)\nNetType: Direct Allocation\nOriginAS: \nOrganization: Google LLC (GOGL)\nRegDate: 2

In [62]:
import pandas as pd
import re
from datetime import datetime

# Function to validate IP addresses (basic validation for IPv4)
def validate_ip(ip):
    """Validate if the given string is a valid IP address (IPv4)"""
    return bool(re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip))

# Function to standardize and clean data
def clean_data(abuseipdb_data, virustotal_data):
    # Combine data from both APIs into a list
    combined_data = []

    # Extract relevant data from AbuseIPDB (fields: ipAddress, lastReportedAt, totalReports)
    if 'data' in abuseipdb_data:
        abuse_data = abuseipdb_data['data']
        combined_data.append({
            'ip_address': abuse_data.get('ipAddress', ''),
            'timestamp': abuse_data.get('lastReportedAt', ''),
            'reports_count': abuse_data.get('totalReports', 0),
            'abuse_confidence_score': abuse_data.get('abuseConfidenceScore', 0),
            'country_code': abuse_data.get('countryCode', ''),
            'usage_type': abuse_data.get('usageType', ''),
            'is_public': abuse_data.get('isPublic', False),
            'domain': abuse_data.get('domain', '')
        })

    # Extract relevant data from VirusTotal (fields: ip_address, last_analysis_date)
    if 'data' in virustotal_data:
        virustotal_data = virustotal_data['data']
        combined_data.append({
            'ip_address': virustotal_data.get('id', ''),
            'timestamp': virustotal_data['attributes'].get('last_analysis_date', ''),
            'last_https_certificate': virustotal_data['attributes'].get('last_https_certificate', {}).get('cert_signature', {}).get('signature', ''),
            'reputation': virustotal_data['attributes'].get('reputation', 0),
            'last_analysis_stats': virustotal_data['attributes'].get('last_analysis_stats', {}),
        })

    # Convert the combined data into a pandas DataFrame
    df = pd.DataFrame(combined_data)

    # Remove duplicates based on 'ip_address'
    df = df.drop_duplicates(subset=['ip_address'])

    # Validate IP address format and clean the 'ip_address' column
    df['ip_address'] = df['ip_address'].apply(lambda x: x if validate_ip(x) else None)

    # Normalize the 'timestamp' column into a consistent datetime format
    def standardize_timestamp(timestamp):
        try:
            # Attempt to parse timestamp (ISO 8601 format, e.g., "2025-03-28T08:02:57+00:00")
            return datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S%z") if timestamp else None
        except (ValueError, TypeError):
            return None

    df['timestamp'] = df['timestamp'].apply(standardize_timestamp)

    # Ensure consistency in field names
    df = df.rename(columns={'ip_address': 'IP Address', 'timestamp': 'Timestamp'})

    return df

# Example data from AbuseIPDB and VirusTotal (replace with actual data)
abuseipdb_data = {
    "data": {
        "ipAddress": "8.8.8.8",
        "isPublic": True,
        "ipVersion": 4,
        "isWhitelisted": True,
        "abuseConfidenceScore": 0,
        "countryCode": "US",
        "usageType": "Content Delivery Network",
        "isp": "Google LLC",
        "domain": "google.com",
        "hostnames": ["dns.google"],
        "isTor": False,
        "totalReports": 205,
        "numDistinctUsers": 54,
        "lastReportedAt": "2025-03-28T08:02:57+00:00"
    }
}

virustotal_data = {
    "data": {
        "id": "8.8.8.8",
        "type": "ip_address",
        "attributes": {
            "last_https_certificate": {
                "cert_signature": {
                    "signature_algorithm": "sha256RSA",
                    "signature": "9f4b71aa2a0f0b6612b0aee30f5c671c3f799b7eeeccee5faf6ac357f4ef208e6296b82c585a63ae774ac08d26bfe3606b1553530379562f2a75f1df439aed90903e71a6bb6aa54321d74d014e70a1090318fd9c4c8af44992f005f6729f16c2c18382a45f408092002862891e1c4c42ec1b00bbe0ebc683b0901da3332d94536ea74d2441c2a66cf61f1b6f0b5654c9e8374499778701fa50d89ef2d4414f0dc1b5fb535b6bc819cca7155006e792482ef1d86498515aab5d22d12a89c779124586eebeaf2dd7b1c2f14effc97eb8e1101b2921d5450c54afaa014a0333ff7075f16b735a90e68bfe6c08b6ca3bf46245b1f7b03eaa291d4a2fd21909d4a464"
                },
                "extensions": {
                    "key_usage": [
                        "digitalSignature",
                        "keyEncipherment"
                    ],
                    "extended_key_usage": [
                        "serverAuth"
                    ]
                },
                "validity": {
                    "not_after": "2025-06-02 08:37:49",
                    "not_before": "2025-03-10 08:37:50"
                },
                "reputation": 548,
                "last_analysis_stats": {
                    "malicious": 0,
                    "suspicious": 0,
                    "undetected": 31,
                    "harmless": 63
                }
            }
        }
    }
}

# Clean the data
cleaned_data = clean_data(abuseipdb_data, virustotal_data)

# Display the cleaned data
print("Cleaned Data: ")
print(cleaned_data)


Cleaned Data: 
  IP Address                 Timestamp  reports_count  abuse_confidence_score  \
0    8.8.8.8 2025-03-28 08:02:57+00:00          205.0                     0.0   

  country_code                usage_type is_public      domain  \
0           US  Content Delivery Network      True  google.com   

  last_https_certificate  reputation last_analysis_stats  
0                    NaN         NaN                 NaN  


In [63]:
pip install pymongo




In [64]:
!pip install --upgrade pymongo certifi




In [65]:
from pymongo import MongoClient

# Replace with your MongoDB Atlas connection string
uri = "mongodb+srv://Priyadharshini_N-1522:is9Ln6l0DRpOcBqG@cluster0.gdduvv6.mongodb.net/ip_data?retryWrites=true&w=majority"

# Connect to MongoDB using the connection string
client = MongoClient(uri)

# Access the 'ip_data' collection in the 'ip_data' database
db = client['ip_data']  # This is your database
collection = db['ip_data']  # This is your collection

# Example data to insert
cleaned_data = [
    {
        "ip_address": "8.8.8.8",
        "timestamp": "2025-03-28 08:02:57+00:00",
        "reports_count": 205,
        "abuse_confidence_score": 0.0,
        "country_code": "US",
        "usage_type": "Content Delivery Network",
        "is_public": True,
        "domain": "google.com",
        "last_https_certificate": None,
        "reputation": None,
        "last_analysis_stats": None
    }
]

# Insert the cleaned data into MongoDB
collection.insert_many(cleaned_data)

# Create an index on the IP address for efficient querying
collection.create_index('ip_address')

print("Data inserted and index created!")


ServerSelectionTimeoutError: SSL handshake failed: ac-maqnxcc-shard-00-01.gdduvv6.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms),SSL handshake failed: ac-maqnxcc-shard-00-02.gdduvv6.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms),SSL handshake failed: ac-maqnxcc-shard-00-00.gdduvv6.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 67e674ec4b3ca1c01ab6b062, topology_type: ReplicaSetNoPrimary, servers: [<ServerDescription ('ac-maqnxcc-shard-00-00.gdduvv6.mongodb.net', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('SSL handshake failed: ac-maqnxcc-shard-00-00.gdduvv6.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>, <ServerDescription ('ac-maqnxcc-shard-00-01.gdduvv6.mongodb.net', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('SSL handshake failed: ac-maqnxcc-shard-00-01.gdduvv6.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>, <ServerDescription ('ac-maqnxcc-shard-00-02.gdduvv6.mongodb.net', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('SSL handshake failed: ac-maqnxcc-shard-00-02.gdduvv6.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1006) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>