In [1]:
# 1_datasets/export_raw_samples.py
"""
Export sample data from SatNOGS database for capstone project.
Samples large tables and exports full small tables.
"""

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import time
import os
from datetime import datetime

print(f"üì° SatNOGS Data Export Script")
print(f"Execution started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*70)

# Database connection
DB_USER = "root"
DB_PASSWORD = "123456789"
DB_HOST = "127.0.0.1"
DB_PORT = "3306"
DB_NAME = "satnogs"

engine = create_engine(f"mysql+pymysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}")

# Create directories
os.makedirs("1_datasets/raw", exist_ok=True)
os.makedirs("1_datasets/metadata", exist_ok=True)

def run_query(sql, desc="Query"):
    """Run SQL query with timing."""
    print(f"\nüîç {desc}...")
    start = time.time()
    df = pd.read_sql_query(sql, engine)
    print(f"‚úÖ Retrieved {len(df):,} rows in {time.time()-start:.2f}s")
    return df

üì° SatNOGS Data Export Script
Execution started: 2025-12-11 14:48:32


In [2]:
# ============================================================================
# 2. Export TIMELINE and METADATA first
# ============================================================================

print("\n" + "="*70)
print("STEP 1: EXPORTING METADATA AND TIMELINE")
print("="*70)

# Dataset timeline
timeline = run_query("""
    SELECT 
        MIN(start) AS first_observation,
        MAX(start) AS last_observation,
        COUNT(*) AS total_observations,
        COUNT(DISTINCT ground_station_id) AS unique_stations,
        COUNT(DISTINCT sat_id) AS unique_satellites
    FROM base_observation
    WHERE start IS NOT NULL
""", "Dataset timeline overview")

timeline.to_csv("1_datasets/metadata/dataset_timeline.csv", index=False)
print("üìä Timeline saved to: 1_datasets/metadata/dataset_timeline.csv")

# Table statistics
table_stats = run_query("""
    SELECT 
        TABLE_NAME as table_name,
        TABLE_ROWS as estimated_rows,
        CREATE_TIME as created,
        UPDATE_TIME as last_updated
    FROM information_schema.TABLES 
    WHERE TABLE_SCHEMA = 'newdata'
    ORDER BY TABLE_ROWS DESC
""", "Table statistics")

table_stats.to_csv("1_datasets/metadata/table_statistics.csv", index=False)
print("üìà Table statistics saved")



STEP 1: EXPORTING METADATA AND TIMELINE

üîç Dataset timeline overview...
‚úÖ Retrieved 1 rows in 77.91s
üìä Timeline saved to: 1_datasets/metadata/dataset_timeline.csv

üîç Table statistics...
‚úÖ Retrieved 0 rows in 0.01s
üìà Table statistics saved


In [3]:
# ============================================================================
# 3. Export FULL small tables (< 10K rows)
# ============================================================================

print("\n" + "="*70)
print("STEP 2: EXPORTING SMALL TABLES (FULL)")
print("="*70)

small_tables = [
    'base_antennatype',      # 17 rows
    'base_stationtype',      # 1 row
    'base_operator',         # 6 rows
    'base_mode',             # 56 rows
    'base_telemetry',        # 185 rows
]

for table in small_tables:
    df = run_query(f"SELECT * FROM {table}", f"Exporting {table}")
    df.to_csv(f"1_datasets/raw/{table}_full.csv", index=False)
    print(f"üíæ Saved: 1_datasets/raw/{table}_full.csv")



STEP 2: EXPORTING SMALL TABLES (FULL)

üîç Exporting base_antennatype...
‚úÖ Retrieved 17 rows in 0.01s
üíæ Saved: 1_datasets/raw/base_antennatype_full.csv

üîç Exporting base_stationtype...
‚úÖ Retrieved 1 rows in 0.01s
üíæ Saved: 1_datasets/raw/base_stationtype_full.csv

üîç Exporting base_operator...
‚úÖ Retrieved 6 rows in 0.01s
üíæ Saved: 1_datasets/raw/base_operator_full.csv

üîç Exporting base_mode...
‚úÖ Retrieved 56 rows in 0.01s
üíæ Saved: 1_datasets/raw/base_mode_full.csv

üîç Exporting base_telemetry...
‚úÖ Retrieved 185 rows in 0.01s
üíæ Saved: 1_datasets/raw/base_telemetry_full.csv


In [4]:
# ============================================================================
# 4. Export MEDIUM tables with sampling if needed
# ============================================================================

print("\n" + "="*70)
print("STEP 3: EXPORTING MEDIUM TABLES")
print("="*70)

medium_tables = {
    'base_station': 'full',          # 3,912 rows
    'base_satelliteentry': 5000,     # Sample 5K from 9,759
    'base_transmitterentry': 5000,   # Sample 5K from 9,869
    'base_satellite': 3000,          # Sample 3K from 2,903
    'base_satelliteidentifier': 3000,# Sample 3K from 2,920
}

for table, sample_size in medium_tables.items():
    if sample_size == 'full':
        df = run_query(f"SELECT * FROM {table}", f"Exporting {table} (full)")
    else:
        df = run_query(f"""
            SELECT * FROM {table} 
            ORDER BY RAND() 
            LIMIT {sample_size}
        """, f"Exporting {table} (sample: {sample_size})")
    
    df.to_csv(f"1_datasets/raw/{table}_sample.csv", index=False)
    print(f"üíæ Saved: 1_datasets/raw/{table}_sample.csv")


STEP 3: EXPORTING MEDIUM TABLES

üîç Exporting base_station (full)...
‚úÖ Retrieved 3,912 rows in 0.34s
üíæ Saved: 1_datasets/raw/base_station_sample.csv

üîç Exporting base_satelliteentry (sample: 5000)...
‚úÖ Retrieved 5,000 rows in 0.69s
üíæ Saved: 1_datasets/raw/base_satelliteentry_sample.csv

üîç Exporting base_transmitterentry (sample: 5000)...
‚úÖ Retrieved 5,000 rows in 0.41s
üíæ Saved: 1_datasets/raw/base_transmitterentry_sample.csv

üîç Exporting base_satellite (sample: 3000)...
‚úÖ Retrieved 2,903 rows in 0.08s
üíæ Saved: 1_datasets/raw/base_satellite_sample.csv

üîç Exporting base_satelliteidentifier (sample: 3000)...
‚úÖ Retrieved 2,920 rows in 0.07s
üíæ Saved: 1_datasets/raw/base_satelliteidentifier_sample.csv


In [5]:
# ============================================================================
# 5. Export LARGE tables with strategic sampling
# ============================================================================

print("\n" + "="*70)
print("STEP 4: EXPORTING LARGE TABLES (STRATEGIC SAMPLING)")
print("="*70)

# Strategy: Get balanced sample of observations by status and year
print("\nüìä Getting observation sample with balanced status...")

# First, get status distribution
status_dist = run_query("""
    SELECT status, COUNT(*) as count
    FROM base_observation 
    GROUP BY status 
    ORDER BY count DESC
""", "Status distribution")

print("\nStatus distribution in full dataset:")
print(status_dist.to_string(index=False))

# Sample observations: 100K total, balanced by status where possible
observation_sample = run_query("""
    WITH status_counts AS (
        SELECT status, COUNT(*) as total
        FROM base_observation
        GROUP BY status
    )
    SELECT o.*
    FROM base_observation o
    WHERE (
        -- For common statuses, sample proportionally
        (o.status IN (100, -100, 0, -1000) AND RAND() < 50000.0 / (SELECT total FROM status_counts WHERE status = o.status))
        OR
        -- For rare statuses, take all
        (o.status NOT IN (100, -100, 0, -1000))
    )
    ORDER BY o.start
    LIMIT 100000
""", "Sampling 100K observations (balanced by status)")

observation_sample.to_csv("1_datasets/raw/base_observation_sample_100k.csv", index=False)
print(f"üíæ Saved: 1_datasets/raw/base_observation_sample_100k.csv ({len(observation_sample):,} rows)")

# Also get a time-based sample (latest observations)
latest_observations = run_query("""
    SELECT * 
    FROM base_observation 
    WHERE start >= '2025-01-01'
    ORDER BY start DESC
    LIMIT 50000
""", "Latest observations (2025)")

latest_observations.to_csv("1_datasets/raw/base_observation_latest_50k.csv", index=False)
print(f"üíæ Saved: 1_datasets/raw/base_observation_latest_50k.csv ({len(latest_observations):,} rows)")



STEP 4: EXPORTING LARGE TABLES (STRATEGIC SAMPLING)

üìä Getting observation sample with balanced status...

üîç Status distribution...
‚úÖ Retrieved 4 rows in 70.48s

Status distribution in full dataset:
 status   count
    100 6384729
   -100 2625204
      0 2536168
  -1000 1000140

üîç Sampling 100K observations (balanced by status)...
‚úÖ Retrieved 100,000 rows in 396.84s
üíæ Saved: 1_datasets/raw/base_observation_sample_100k.csv (100,000 rows)

üîç Latest observations (2025)...
‚úÖ Retrieved 50,000 rows in 5.33s
üíæ Saved: 1_datasets/raw/base_observation_latest_50k.csv (50,000 rows)


In [6]:
# ============================================================================
# 6. Export RELATIONSHIP tables
# ============================================================================

print("\n" + "="*70)
print("STEP 5: EXPORTING RELATIONSHIP TABLES")
print("="*70)

relationship_tables = {
    'base_antenna': 2000,           # Sample 2K from 4,437
    'base_frequencyrange': 2000,    # Sample 2K from 5,311
    'base_stationstatuslog': 10000, # Sample 10K from 298,893
}

for table, sample_size in relationship_tables.items():
    df = run_query(f"""
        SELECT * FROM {table} 
        ORDER BY RAND() 
        LIMIT {sample_size}
    """, f"Exporting {table} (sample: {sample_size})")
    
    df.to_csv(f"1_datasets/raw/{table}_sample.csv", index=False)
    print(f"üíæ Saved: 1_datasets/raw/{table}_sample.csv")


STEP 5: EXPORTING RELATIONSHIP TABLES

üîç Exporting base_antenna (sample: 2000)...
‚úÖ Retrieved 2,000 rows in 0.08s
üíæ Saved: 1_datasets/raw/base_antenna_sample.csv

üîç Exporting base_frequencyrange (sample: 2000)...
‚úÖ Retrieved 2,000 rows in 0.05s
üíæ Saved: 1_datasets/raw/base_frequencyrange_sample.csv

üîç Exporting base_stationstatuslog (sample: 10000)...
‚úÖ Retrieved 10,000 rows in 0.71s
üíæ Saved: 1_datasets/raw/base_stationstatuslog_sample.csv


In [7]:
# ============================================================================
# 7. Create DATA DICTIONARY
# ============================================================================

print("\n" + "="*70)
print("STEP 6: CREATING DATA DICTIONARY")
print("="*70)

# Create basic data dictionary from schema
data_dict = []

for table in ['base_observation', 'base_station', 'base_satelliteentry', 'base_transmitterentry']:
    # Get column info
    df_sample = pd.read_sql_query(f"SELECT * FROM {table} LIMIT 100", engine)
    
    for col in df_sample.columns:
        data_dict.append({
            'table': table,
            'column': col,
            'dtype': str(df_sample[col].dtype),
            'non_null_sample': df_sample[col].notnull().sum(),
            'sample_values': str(df_sample[col].dropna().unique()[:3].tolist() if df_sample[col].notnull().sum() > 0 else 'ALL_NULL')
        })

data_dict_df = pd.DataFrame(data_dict)
data_dict_df.to_csv("1_datasets/metadata/data_dictionary.csv", index=False)
print("üìñ Data dictionary saved to: 1_datasets/metadata/data_dictionary.csv")


STEP 6: CREATING DATA DICTIONARY
üìñ Data dictionary saved to: 1_datasets/metadata/data_dictionary.csv


In [12]:
# ============================================================================
# 8. Generate SUMMARY REPORT
# ============================================================================

print("\n" + "="*70)
print("STEP 7: GENERATING SUMMARY REPORT")
print("="*70)

# Calculate total exported data size
total_rows = 0
exported_files = []

for file in os.listdir("1_datasets/raw"):
    if file.endswith('.csv'):
        filepath = os.path.join("1_datasets/raw", file)
        df_temp = pd.read_csv(filepath, nrows=1)  # Just to check
        with open(filepath, 'r', encoding='utf-8') as f:
            row_count = sum(1 for line in f) - 1  # Subtract header
        
        exported_files.append({
            'file': file,
            'rows': row_count,
            'size_mb': os.path.getsize(filepath) / (1024*1024)
        })
        total_rows += row_count

summary_df = pd.DataFrame(exported_files)

# Create summary markdown
summary_md = f"""# üìä SatNOGS Dataset Exports
*Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*

## Dataset Overview
- **Source**: SatNOGS Database (MySQL/MariaDB)
- **Time Range**: {timeline['first_observation'].iloc[0]} to {timeline['last_observation'].iloc[0]}
- **Total Observations in DB**: {timeline['total_observations'].iloc[0]:,}
- **Unique Stations**: {timeline['unique_stations'].iloc[0]:,}
- **Unique Satellites**: {timeline['unique_satellites'].iloc[0]:,}

## Exported Files Summary
Total exported rows: {total_rows:,}

| File | Rows | Size (MB) |
|------|------|-----------|
"""

for _, row in summary_df.iterrows():
    summary_md += f"| {row['file']} | {row['rows']:,} | {row['size_mb']:.2f} |\n"

summary_md += f"""
## File Descriptions

### Raw Data Files (`1_datasets/raw/`)
- `base_observation_sample_100k.csv`: Balanced sample of 100K observations by status
- `base_observation_latest_50k.csv`: Latest observations from 2025
- `base_station_sample.csv`: All ground stations (full export)
- `base_satelliteentry_sample.csv`: Sample of satellite metadata
- `base_transmitterentry_sample.csv`: Sample of transmitter configurations
- `base_antennatype_full.csv`: Complete antenna type catalog
- ... and other relationship tables

### Metadata Files (`1_datasets/metadata/`)
- `dataset_timeline.csv`: Time range and counts
- `table_statistics.csv`: Row counts for all tables
- `data_dictionary.csv`: Column descriptions and sample values

## Usage Notes
1. All CSV files use UTF-8 encoding
2. Files are sampled for memory efficiency while maintaining distributions
3. For full analysis, use database connection with appropriate sampling
4. API endpoints available at:
   - https://network.satnogs.org/api/
   - https://db.satnogs.org/api/

## Known Issues
1. Some observation columns have high NULL percentages (see data dictionary)
2. Station location data needs joining with `base_station` table
3. Status codes need interpretation mapping
"""

with open("1_datasets/README.md", "w", encoding="utf-8") as f:
    f.write(summary_md)

print("üìÑ Summary report saved to: 1_datasets/README.md")


STEP 7: GENERATING SUMMARY REPORT
üìÑ Summary report saved to: 1_datasets/README.md


In [14]:
# ============================================================================
# COMPLETION
# ============================================================================
import time
start_time = time.time()

print("\n" + "="*70)
print("‚úÖ DATA EXPORT COMPLETE")
print("="*70)

print(f"\nüìÅ Files saved in:")
print(f"   - 1_datasets/raw/      (data files)")
print(f"   - 1_datasets/metadata/ (documentation)")

total_size_mb = sum(f['size_mb'] for f in exported_files)
print(f"\nüìä Total exported: {total_rows:,} rows ({total_size_mb:.2f} MB)")

print("\nüéØ Next steps:")
print("   1. Review 1_datasets/README.md")
print("   2. Move to 2_data_preparation/ for feature engineering")
print("   3. Use sampled data for EDA in 3_data_exploration/")

print(f"\n‚è±Ô∏è  Total execution time: {time.time() - start_time:.2f} seconds")


‚úÖ DATA EXPORT COMPLETE

üìÅ Files saved in:
   - 1_datasets/raw/      (data files)
   - 1_datasets/metadata/ (documentation)

üìä Total exported: 187,566 rows (211.31 MB)

üéØ Next steps:
   1. Review 1_datasets/README.md
   2. Move to 2_data_preparation/ for feature engineering
   3. Use sampled data for EDA in 3_data_exploration/

‚è±Ô∏è  Total execution time: 0.00 seconds
