In [1]:
import json

def parse_json_to_txt(json_file, txt_file):
    """
    Parses a JSON file to extract tweet IDs and coordinates, then saves them to a text file.

    Args:
        json_file (str): Path to the input JSON file.
        txt_file (str): Path to the output text file.
    """
    try:
        # Legge il file JSON
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Lista per salvare i risultati
        results = []

        def extract_locations(obj):
            """Recursively extracts tweet IDs and coordinates."""
            if isinstance(obj, dict):
                tweet_id = obj.get('_id')  # Assume che l'ID sia nel campo '_id'
                detected_locations = obj.get('detected_locations', [])

                # Cerca le coordinate in detected_locations
                if tweet_id and detected_locations:
                    for location in detected_locations:
                        geometry = location.get('geometry', {})
                        coordinates = geometry.get('coordinates')
                        if coordinates:
                            results.append(f"{tweet_id}: {coordinates}")

                # Esplora altri campi del dizionario
                for key, value in obj.items():
                    extract_locations(value)

            elif isinstance(obj, list):
                for item in obj:
                    extract_locations(item)

        # Esegue l'estrazione dei dati
        extract_locations(data)

        # Salva i risultati in un file di testo
        with open(txt_file, 'w', encoding='utf-8') as f:
            for result in results:
                f.write(result + '\n')

        print(f"Salvati {len(results)} tweet con location in {txt_file}.")

    except Exception as e:
        print(f"Errore: {e}")

# Percorsi dei file
json_input_path = r'C:\Users\samue\OneDrive\Desktop\japanEarthquakesDataset.json'
txt_output_path = r'C:\Users\samue\OneDrive\Desktop\locations.txt'

# Esegui il parsing
parse_json_to_txt(json_input_path, txt_output_path)


Salvati 55109 tweet con location in C:\Users\samue\OneDrive\Desktop\locations.txt.


In [None]:
import json
from geopy.distance import geodesic

invalid_coords = []

def parse_coordinates(location, tweet_id):
    """
    Validates and parses a string containing coordinates.
    
    Args:
        location (str): String containing the coordinates.
        tweet_id (str): ID of the tweet for reference.
    
    Returns:
        tuple: A tuple of (latitude, longitude, tweet_id) if valid, None otherwise.
    """
    try:
        location = location.strip('[]')  # Remove brackets
        lat, lon = map(float, location.split(','))

        # Adjust invalid latitude/longitude values
        if lat > 90:
            print(f"Latitude > 90 found for tweet ID {tweet_id}. Adjusting...")
            lat = 90
        if lon > 180:
            print(f"Longitude > 180 found for tweet ID {tweet_id}. Adjusting...")
            lon = 180

        # Validate latitude and longitude ranges
        if -90 <= lat <= 90 and -180 <= lon <= 180:
            return lat, lon, tweet_id
        else:
            invalid_coords.append(f"{tweet_id}: {location}")  # Save invalid coordinates
            print(f"Invalid range for coordinates: {tweet_id}: {location}")
            return None
    except ValueError:
        invalid_coords.append(f"{tweet_id}: {location}")  # Save invalid coordinates
        print(f"Error parsing coordinates: {tweet_id}: {location}")
        return None


def group_nearby_locations(txt_file, output_file, max_distance_km):
    """
    Groups nearby locations based on a maximum distance.
    
    Args:
        txt_file (str): Path to the text file containing tweet IDs and locations.
        output_file (str): Path to the output file for grouped locations.
        max_distance_km (float): Maximum distance in kilometers to group locations.
    """
    try:
        # Read tweet IDs and locations from the text file
        print(f"Reading tweet IDs and locations from: {txt_file}")
        with open(txt_file, 'r', encoding='utf-8') as f:
            lines = [line.strip() for line in f if line.strip()]

        if not lines:
            print("No tweet IDs or locations found in the input file.")
            return

        print(f"Found {len(lines)} total tweet-location pairs. Parsing...")

        # Parse valid tweet IDs and coordinates
        coordinates = []
        for line in lines:
            try:
                tweet_id, location = line.split(': ')
                parsed = parse_coordinates(location, tweet_id)
                if parsed:
                    coordinates.append(parsed)
            except ValueError:
                print(f"Invalid line format: {line}")

        if not coordinates:
            print("No valid tweet-location pairs found.")
            return

        print(f"Parsed {len(coordinates)} valid tweet-location pairs. Grouping...")

        # Group locations by proximity
        groups = []
        while coordinates:
            base_location = coordinates.pop(0)
            group = [base_location]

            for other_location in coordinates[:]:
                if geodesic(base_location[:2], other_location[:2]).kilometers <= max_distance_km:
                    group.append(other_location)
                    coordinates.remove(other_location)

            groups.append(group)

        # Write grouped locations to the output file
        print(f"Writing {len(groups)} groups to the output file: {output_file}")
        with open(output_file, 'w', encoding='utf-8') as f:
            for i, group in enumerate(groups, 1):
                f.write(f"Group {i}:\n")
                for loc in group:
                    f.write(f"  {loc[2]}: {loc[:2]}\n")  # Write tweet ID and coordinates
                f.write("\n")

        print(f"Successfully grouped tweet-location pairs and saved to {output_file}.")

        # Save invalid coordinates
        if invalid_coords:
            invalid_file = 'invalid_coordinates.txt'
            print(f"Saving invalid coordinates to: {invalid_file}")
            with open(invalid_file, 'w', encoding='utf-8') as f:
                for coord in invalid_coords:
                    f.write(coord + '\n')
            print(f"Saved {len(invalid_coords)} invalid coordinates.")

    except Exception as e:
        print(f"Error: {e}")


txt_input_path = r'C:\\Users\\samue\\OneDrive\\Desktop\\locations.txt'
output_grouped_path = r'C:\\Users\\samue\\OneDrive\\Desktop\\grouped_locations.txt'

max_distance = 10.0

group_nearby_locations(txt_input_path, output_grouped_path, max_distance)


Reading tweet IDs and locations from: C:\\Users\\samue\\OneDrive\\Desktop\\locations.txt
Found 55109 total tweet-location pairs. Parsing...
Latitude > 90 found for tweet ID 1366165675643822080. Adjusting...
Latitude > 90 found for tweet ID 1366161386187993089. Adjusting...
Latitude > 90 found for tweet ID 1366153608169840641. Adjusting...
Invalid range for coordinates: 1366153125225062400: -155.0815803, 19.7073734
Latitude > 90 found for tweet ID 1366153125225062400. Adjusting...
Latitude > 90 found for tweet ID 1366150331076288513. Adjusting...
Latitude > 90 found for tweet ID 1366147306249101312. Adjusting...
Latitude > 90 found for tweet ID 1366141759751815168. Adjusting...
Latitude > 90 found for tweet ID 1366139316435517440. Adjusting...
Invalid range for coordinates: 1366132251377958920: -122.2485492, 47.2640129
Latitude > 90 found for tweet ID 1366132251377958920. Adjusting...
Latitude > 90 found for tweet ID 1366128969033146370. Adjusting...
Latitude > 90 found for tweet ID 136

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

