In [2]:
# Use the explicit %pip magic: it installs into the environment of the running
# kernel and does not depend on IPython's automagic being enabled.
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [1]:
# Import the necessary libraries
import pandas as pd
import numpy as np

In [4]:
# Clean the fault lines raw data

def flatten_coordinates(fault_data):
    """Flatten a mix of coordinate pairs and lists of pairs into one list of pairs.

    Parameters
    ----------
    fault_data : iterable
        Each item is either a single coordinate pair (e.g. ``[a, b]``) or a
        list/tuple of such pairs (one extra level of nesting).

    Returns
    -------
    list
        The coordinate pairs in their original order, with the extra nesting
        level removed wherever it was present. Empty items are skipped.
    """
    flattened = []
    for item in fault_data:
        # An empty entry has no first element to inspect and carries no
        # coordinates, so skip it instead of failing with IndexError.
        if len(item) == 0:
            continue
        # A nested entry starts with another sequence; accept tuples as well as
        # lists so data written as [(a, b), (c, d)] is flattened too.
        if isinstance(item[0], (list, tuple)):
            flattened.extend(item)
        else:
            # Already a single coordinate pair
            flattened.append(item)
    return flattened

def convert_fault_lines_to_df(fault_data, fault_type):
    """Build a point table for one fault type.

    The coordinates in ``fault_data`` are first flattened with
    ``flatten_coordinates``; element 0 of every pair becomes ``latitude`` and
    element 1 becomes ``longitude``. Every row is labelled with ``fault_type``
    in the ``type`` column.

    Returns a DataFrame with the columns ``latitude``, ``longitude``, ``type``.
    """
    points = flatten_coordinates(fault_data)

    # One row per coordinate pair, all tagged with the same fault type
    return pd.DataFrame({
        'latitude': [point[0] for point in points],
        'longitude': [point[1] for point in points],
        'type': [fault_type for _ in points],
    })

# Load the fault line data from file

frames = []  # one DataFrame per successfully parsed fault type

with open('fault_lines_raw.txt', 'r') as file:
    content = file.read()

# Definitions are separated by semicolons; every chunk that contains '=' is
# treated as `<fault name> = <coordinate expression>`.
fault_definitions = [chunk.strip() for chunk in content.split(';')]

for definition in fault_definitions:
    if '=' not in definition:
        continue  # blank chunk or text that is not a definition

    # Split on the first '=' only, so an additional '=' further along cannot
    # break the two-way unpacking.
    fault_name, fault_data = definition.split('=', 1)
    fault_name = fault_name.strip()
    try:
        # NOTE(review): eval() executes whatever the file contains, so only run
        # this on a trusted file. If the data is purely list/number literals,
        # ast.literal_eval would be the safe replacement.
        fault_coords = eval(fault_data.strip())

        # Lower-case the name so fault types are spelled consistently
        fault_type = fault_name.lower()
        frames.append(convert_fault_lines_to_df(fault_coords, fault_type))
    except Exception as exc:
        # Keep going with the remaining fault types, but report why this one failed
        print(f"Could not process fault type: {fault_name} ({type(exc).__name__}: {exc})")

# Concatenate once instead of growing the frame inside the loop
if frames:
    fault_lines_df = pd.concat(frames, ignore_index=True)
else:
    # Nothing parsed: keep the expected columns so the summary below still works
    fault_lines_df = pd.DataFrame(columns=['latitude', 'longitude', 'type'])

# Display first few rows
print("First few rows:")
print(fault_lines_df.head())
print("\nUnique fault types:")
print(fault_lines_df['type'].unique())

# info() prints its report itself and returns None, so it is not wrapped in print()
fault_lines_df.info()
fault_lines_df.head()

In [5]:
# Load the fault line data from file
# NOTE(review): this cell repeats the loading logic of the previous cell;
# consider keeping only one of the two.
frames = []  # one DataFrame per successfully parsed fault type

with open('fault_lines_raw.txt', 'r') as file:
    content = file.read()

# Definitions are separated by semicolons; every chunk that contains '=' is
# treated as `<fault name> = <coordinate expression>`.
fault_definitions = [chunk.strip() for chunk in content.split(';')]

for definition in fault_definitions:
    if '=' not in definition:
        continue  # blank chunk or text that is not a definition

    # Split on the first '=' only, so an additional '=' further along cannot
    # break the two-way unpacking.
    fault_name, fault_data = definition.split('=', 1)
    fault_name = fault_name.strip()
    try:
        # NOTE(review): eval() executes whatever the file contains, so only run
        # this on a trusted file. If the data is purely list/number literals,
        # ast.literal_eval would be the safe replacement.
        fault_coords = eval(fault_data.strip())

        # Lower-case the name so fault types are spelled consistently
        fault_type = fault_name.lower()
        frames.append(convert_fault_lines_to_df(fault_coords, fault_type))
    except Exception as exc:
        # Keep going with the remaining fault types, but report why this one failed
        print(f"Could not process fault type: {fault_name} ({type(exc).__name__}: {exc})")

# Concatenate once instead of growing the frame inside the loop
if frames:
    fault_lines_df = pd.concat(frames, ignore_index=True)
else:
    # Nothing parsed: keep the expected columns for downstream cells
    fault_lines_df = pd.DataFrame(columns=['latitude', 'longitude', 'type'])

# Display first few rows
fault_lines_df.head()

Unnamed: 0,latitude,longitude,type
0,33.311475,35.739074,mainss
1,33.311052,35.738387,mainss
2,33.309585,35.736827,mainss
3,33.308451,35.735637,mainss
4,33.306772,35.733867,mainss


In [9]:
# Clean the data a bit
def clean_data(df):
    """Split the combined ``type`` label into ``secondary_type`` and ``main_type``.

    - ``reset_index()`` moves the current index into a regular column.
    - ``secondary_type`` is ``type`` with a leading ``main`` or ``quat_`` removed.
    - ``main_type`` is the first four characters of ``type``.
    - The original ``type`` column is dropped from the result.

    A new DataFrame is returned; the frame passed in is not modified.
    """
    indexed = df.reset_index()
    raw_type = indexed['type']

    # Derive both label columns from the raw type before discarding it
    labelled = indexed.assign(
        secondary_type=raw_type.str.replace(r'^main|^quat_', '', regex=True),
        main_type=raw_type.str[:4],
    )
    return labelled.drop(columns=['type'])

# Run the cleaning step on a copy of the loaded points and preview the result
fault_lines_df_clean = fault_lines_df.copy().pipe(clean_data)
fault_lines_df_clean.head()

Unnamed: 0,index,latitude,longitude,secondary_type,main_type
0,0,33.311475,35.739074,ss,main
1,1,33.311052,35.738387,ss,main
2,2,33.309585,35.736827,ss,main
3,3,33.308451,35.735637,ss,main
4,4,33.306772,35.733867,ss,main


In [None]:
def create_line_segments(df):
    """Convert points to line segments with start and end coordinates.

    Points are grouped by the (``main_type``, ``secondary_type``) pair. Within
    each group, every pair of consecutive rows becomes one segment, so a group
    of n points yields n - 1 segments and a group with a single point yields
    none.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``latitude``, ``longitude``, ``main_type`` and
        ``secondary_type``. Row order within a group defines the point order.

    Returns
    -------
    pandas.DataFrame
        Columns ``start_latitude``, ``start_longitude``, ``end_latitude``,
        ``end_longitude``, ``main_type``, ``secondary_type``. Groups appear in
        the order in which their type pair first occurs in ``df``.
    """
    segment_columns = [
        'start_latitude', 'start_longitude',
        'end_latitude', 'end_longitude',
        'main_type', 'secondary_type',
    ]

    # NOTE(review): points are chained purely by fault type. If one type holds
    # several separate traces, the last point of one trace is joined to the
    # first point of the next - confirm this is intended.
    pieces = []
    # sort=False keeps the groups in order of first appearance; a single
    # groupby pass replaces re-scanning the whole frame once per type pair.
    for (main_type, secondary_type), group in df.groupby(['main_type', 'secondary_type'], sort=False):
        if len(group) < 2:
            continue  # a lone point cannot form a segment

        latitudes = group['latitude'].to_numpy()
        longitudes = group['longitude'].to_numpy()

        # Pair point i with point i + 1 by slicing the arrays against each other
        pieces.append(pd.DataFrame({
            'start_latitude': latitudes[:-1],
            'start_longitude': longitudes[:-1],
            'end_latitude': latitudes[1:],
            'end_longitude': longitudes[1:],
            'main_type': main_type,
            'secondary_type': secondary_type,
        }))

    if not pieces:
        # No group had two or more points: return an empty table with the expected columns
        return pd.DataFrame(columns=segment_columns)

    return pd.concat(pieces, ignore_index=True)

# Turn the cleaned points into line segments. reset_index() then exposes the
# row number as a regular 'index' column of the segment table.
fault_lines_segments = create_line_segments(fault_lines_df_clean).reset_index()

# Preview the first few rows to verify the structure
fault_lines_segments.head()

Unnamed: 0,start_latitude,start_longitude,end_latitude,end_longitude,main_type,secondary_type
0,33.311475,35.739074,33.311052,35.738387,main,ss
1,33.311052,35.738387,33.309585,35.736827,main,ss
2,33.309585,35.736827,33.308451,35.735637,main,ss
3,33.308451,35.735637,33.306772,35.733867,main,ss
4,33.306772,35.733867,33.305874,35.733202,main,ss


In [18]:
# Looks OK! Save the segments to a CSV file for Tableau.
# NOTE(review): this is an absolute, machine-specific path; the export fails
# on any machine where this folder does not exist.
fault_lines_csv_path = "D:\\Projects\\Tableau\\My Creations\\Israel's Earthquakes\\fault_lines_cleaned.csv"
fault_lines_segments.to_csv(fault_lines_csv_path, index=False)