In [4]:
import ipaddress
import os
import pandas as pd
import json

# Folder layout for this step
input_folder = '../data/csv_chunks'  # source directory holding the CSV files
output_folder = '../data/ip_addresses'  # destination directory for the JSON files

# Create the destination directory up front; an already-existing one is left alone
os.makedirs(name=output_folder, exist_ok=True)
def is_ipv4(ip):
    """Return True only when ``ip`` is a string holding a valid IPv4 address.

    Args:
        ip: Candidate value. Anything that is not a ``str`` is rejected.

    Returns:
        bool: True if ``ip`` parses as an IPv4 address; False for IPv6
        addresses, malformed text, and non-string values.
    """
    # ipaddress.ip_address() also accepts integers (and therefore bools) below
    # 2**32 and 4-byte packed values, which would let e.g. 1 or True pass as
    # "IPv4". Only textual addresses are meaningful here, so reject the rest.
    if not isinstance(ip, str):
        return False
    try:
        return isinstance(ipaddress.ip_address(ip), ipaddress.IPv4Address)
    except ValueError:
        # Raised when the text is neither a valid IPv4 nor IPv6 address
        return False

# Convert every CSV file in the input folder into a JSON file that lists the
# distinct IPv4 values found in its CLNT_RMT_IP column
for file_name in os.listdir(input_folder):
    # Entries that are not CSV files are ignored
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(input_folder, file_name)

    # Only the CLNT_RMT_IP column is loaded
    df = pd.read_csv(file_path, usecols=['CLNT_RMT_IP'])

    # Drop missing values, de-duplicate, then keep what is_ipv4 accepts
    distinct_values = df['CLNT_RMT_IP'].dropna().unique()
    unique_ipv4_ips = list(filter(is_ipv4, distinct_values))

    # Payload layout: a single "ip_address" key holding the list
    data = {"ip_address": unique_ipv4_ips}

    # Output name is the CSV's base name plus a fixed suffix
    json_file_name = os.path.splitext(file_name)[0] + "_unique_ipv4_ips.json"
    json_file_path = os.path.join(output_folder, json_file_name)

    # Serialize with 4-space indentation
    with open(json_file_path, 'w') as json_file:
        json.dump(data, json_file, indent=4)

    print(f"Unique IPv4 IPs from {file_name} saved to {json_file_path}")


Unique IPv4 IPs from data_chunk_0.csv saved to ../data/ip_addresses\data_chunk_0_unique_ipv4_ips.json
Unique IPv4 IPs from data_chunk_1.csv saved to ../data/ip_addresses\data_chunk_1_unique_ipv4_ips.json
Unique IPv4 IPs from data_chunk_10.csv saved to ../data/ip_addresses\data_chunk_10_unique_ipv4_ips.json
Unique IPv4 IPs from data_chunk_11.csv saved to ../data/ip_addresses\data_chunk_11_unique_ipv4_ips.json
Unique IPv4 IPs from data_chunk_12.csv saved to ../data/ip_addresses\data_chunk_12_unique_ipv4_ips.json
Unique IPv4 IPs from data_chunk_13.csv saved to ../data/ip_addresses\data_chunk_13_unique_ipv4_ips.json
Unique IPv4 IPs from data_chunk_14.csv saved to ../data/ip_addresses\data_chunk_14_unique_ipv4_ips.json
Unique IPv4 IPs from data_chunk_15.csv saved to ../data/ip_addresses\data_chunk_15_unique_ipv4_ips.json
Unique IPv4 IPs from data_chunk_16.csv saved to ../data/ip_addresses\data_chunk_16_unique_ipv4_ips.json
Unique IPv4 IPs from data_chunk_17.csv saved to ../data/ip_addresses

In [1]:
import os
import json

# Folder holding the JSON files to scan, and the text file the matches go to
folder_path = '../results'
output_file = 'vpn_ips.txt'

# Keys whose entry has is_vpn == True, in the order they are encountered
vpn_ips = []

# sorted(): os.listdir() returns names in arbitrary order; sorting keeps the
# line order of the output file stable between runs and platforms.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.json'):
        continue

    file_path = os.path.join(folder_path, filename)

    # Decode explicitly as UTF-8: open() otherwise falls back to the locale
    # encoding, which can mis-decode or reject non-ASCII text in the JSON.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Each top-level key is collected as an IP; its value is queried with .get()
    for ip, info in data.items():
        # Equality (not identity) is kept so the set of matching values is unchanged
        if info.get('is_vpn') == True:
            vpn_ips.append(ip)

# Write the collected IPs to a text file, one per line
with open(output_file, 'w') as file:
    for ip in vpn_ips:
        file.write(ip + '\n')

print(f"IPs with is_vpn=true have been written to {output_file}.")


IPs with is_vpn=true have been written to vpn_ips.txt.
