In [3]:
import os
import numpy as np
import pandas as pd
import datetime
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
import geopandas as gpd


class WildlifeTrafficDataPipeline:
    def __init__(self, data_dir="data"):
        self.data_dir = data_dir
        os.makedirs(os.path.join(data_dir, "raw"), exist_ok=True)
        os.makedirs(os.path.join(data_dir, "processed"), exist_ok=True)
        os.makedirs(os.path.join(data_dir, "augmented"), exist_ok=True)
        self.scaler = MinMaxScaler()

    # ===================== 1. DATA GENERATION =====================

    def _generate_sample_gps_data(self, n_animals=10, days=30, readings_per_day=24):
        """Generate sample GPS tracking data for wildlife."""
        np.random.seed(42)
        data = []
        start_date = datetime.datetime(2023, 1, 1)

        # Create base locations for each animal
        animal_base_locations = {
            f"A{i}": {
                "lat": np.random.uniform(30, 45),
                "lon": np.random.uniform(-120, -100)
            } for i in range(1, n_animals + 1)
        }

        for animal_id, base_loc in animal_base_locations.items():
            direction = 1  # 1 for north, -1 for south

            for day in range(days):
                if day == days // 2:
                    direction = -1

                for reading in range(readings_per_day):
                    timestamp = start_date + \
                        datetime.timedelta(days=day, hours=reading)

                    lat_noise = np.random.normal(0, 0.01)
                    lon_noise = np.random.normal(0, 0.01)

                    lat = base_loc["lat"] + lat_noise + \
                        (day * 0.01 * direction)
                    lon = base_loc["lon"] + lon_noise

                    speed = np.random.uniform(0, 5)
                    heading = np.random.uniform(0, 360)

                    data.append({
                        "animal_id": animal_id,
                        "species": np.random.choice(["deer", "elk", "bear", "wolf"]),
                        "timestamp": timestamp,
                        "latitude": lat,
                        "longitude": lon,
                        "speed": speed,
                        "heading": heading,
                        "battery_level": np.random.uniform(60, 100)
                    })

        df = pd.DataFrame(data)
        self.wildlife_data = df
        df.to_csv(os.path.join(self.data_dir, "raw",
                  "wildlife_data_raw.csv"), index=False)
        print(f"Generated wildlife data with shape: {df.shape}")

    def _generate_sample_traffic_data(self, n_segments=20, days=30, readings_per_day=24):
        """Generate sample traffic density and speed data."""
        np.random.seed(43)
        data = []
        start_date = datetime.datetime(2023, 1, 1)

        road_segments = {
            f"R{i}": {
                "start_lat": np.random.uniform(30, 45),
                "start_lon": np.random.uniform(-120, -100),
                "end_lat": np.random.uniform(30, 45),
                "end_lon": np.random.uniform(-120, -100),
                "road_type": np.random.choice(["highway", "primary", "secondary", "local"]),
                "speed_limit": np.random.choice([25, 35, 45, 55, 65, 75])
            } for i in range(1, n_segments + 1)
        }

        for segment_id, segment_info in road_segments.items():
            for day in range(days):
                for reading in range(readings_per_day):
                    timestamp = start_date + \
                        datetime.timedelta(days=day, hours=reading)

                    if 7 <= reading < 9 or 16 <= reading < 18:
                        traffic_factor = np.random.uniform(0.7, 1.0)
                    else:
                        traffic_factor = np.random.uniform(0.1, 0.6)

                    if timestamp.weekday() >= 5:
                        traffic_factor *= 0.7

                    vehicle_count = int(
                        100 * traffic_factor * (1 + np.random.normal(0, 0.1)))
                    speed = max(
                        segment_info['speed_limit'] * (1 - traffic_factor) * (1 + np.random.normal(0, 0.05)), 0)

                    data.append({
                        "road_segment_id": segment_id,
                        "timestamp": timestamp,
                        "start_lat": segment_info['start_lat'],
                        "start_lon": segment_info['start_lon'],
                        "end_lat": segment_info['end_lat'],
                        "end_lon": segment_info['end_lon'],
                        "road_type": segment_info['road_type'],
                        "speed_limit": segment_info['speed_limit'],
                        "vehicle_count": vehicle_count,
                        "speed": speed,
                        "weather_condition": np.random.choice(["clear", "rainy", "foggy", "snowy"])
                    })

        df = pd.DataFrame(data)
        self.traffic_data = df
        df.to_csv(os.path.join(self.data_dir, "raw",
                  "traffic_data_raw.csv"), index=False)
        print(f"Generated traffic data with shape: {df.shape}")

In [None]:
    def clean_wildlife_data(self):
        if not hasattr(self, 'wildlife_data'):
            raise ValueError("Wildlife data must be generated first.")
        
        df = self.wildlife_data.copy()
        df.drop_duplicates(inplace=True)
        df = df[(df['latitude'] >= -90) & (df['latitude'] <= 90) & 
                (df['longitude'] >= -180) & (df['longitude'] <= 180)]
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df.sort_values(['animal_id', 'timestamp'], inplace=True)
        
        self.wildlife_data_cleaned = df
        print(f"Cleaned wildlife data: {df.shape}")

    def clean_traffic_data(self):
        if not hasattr(self, 'traffic_data'):
            raise ValueError("Traffic data must be generated first.")
        
        df = self.traffic_data.copy()
        df.drop_duplicates(inplace=True)
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df.sort_values(['road_segment_id', 'timestamp'], inplace=True)
        
        self.traffic_data_cleaned = df
        print(f"Cleaned traffic data: {df.shape}")


In [5]:
def augment_wildlife_data(self, num_synthetic_tracks=100):
    if not hasattr(self, 'wildlife_data_cleaned'):
        raise ValueError("Wildlife data must be cleaned first.")

    synthetic_data = []
    animal_ids = self.wildlife_data_cleaned['animal_id'].unique()
    for i in range(num_synthetic_tracks):
        base_animal_id = np.random.choice(animal_ids)
        base_animal_data = self.wildlife_data_cleaned[
            self.wildlife_data_cleaned['animal_id'] == base_animal_id
        ].copy()

        synthetic_track = base_animal_data.copy()
        synthetic_track['animal_id'] = f"synthetic_{base_animal_id}_{i}"
        synthetic_data.append(synthetic_track)

    df = pd.concat(synthetic_data, ignore_index=True)
    print(f"Augmented wildlife data: {df.shape}")

In [6]:
def merge_datasets(self):
    if not hasattr(self, 'wildlife_data_cleaned') or not hasattr(self, 'traffic_data_cleaned'):
        raise ValueError(
                "Both wildlife and traffic data must be cleaned first.")

    df = pd.merge(
        self.wildlife_data_cleaned,
        self.traffic_data_cleaned,
        on='timestamp',
        how='inner'
    )
    print(f"Merged dataset: {df.shape}")

In [7]:
pipeline = WildlifeTrafficDataPipeline()
pipeline._generate_sample_gps_data()
pipeline._generate_sample_traffic_data()
pipeline.clean_wildlife_data()
pipeline.clean_traffic_data()
pipeline.augment_wildlife_data()
pipeline.merge_datasets()

Generated wildlife data with shape: (7200, 8)
Generated traffic data with shape: (14400, 11)


AttributeError: 'WildlifeTrafficDataPipeline' object has no attribute 'clean_wildlife_data'