In [42]:
import os
import numpy as np
import pandas as pd
import datetime
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
import geopandas as gpd


class WildlifeTrafficDataPipeline:
    def __init__(self, data_dir="data"):
        self.data_dir = data_dir
        os.makedirs(os.path.join(data_dir, "raw"), exist_ok=True)
        os.makedirs(os.path.join(data_dir, "processed"), exist_ok=True)
        os.makedirs(os.path.join(data_dir, "augmented"), exist_ok=True)
        self.scaler = MinMaxScaler()

    # ===================== 1. DATA GENERATION =====================

    def _generate_sample_gps_data(self, n_animals=10, days=30, readings_per_day=24):
        """Generate sample GPS tracking data for wildlife."""
        np.random.seed(42)
        data = []
        start_date = datetime.datetime(2023, 1, 1)

        # Create base locations for each animal
        animal_base_locations = {
            f"A{i}": {
                "lat": np.random.uniform(6.5, 37.1),
                "lon": np.random.uniform(68.7, 97.25)
            } for i in range(1, n_animals + 1)
        }

        for animal_id, base_loc in animal_base_locations.items():
            direction = 1  # 1 for north, -1 for south

            for day in range(days):
                if day == days // 2:
                    direction = -1

                for reading in range(readings_per_day):
                    timestamp = start_date + \
                        datetime.timedelta(days=day, hours=reading)

                    lat_noise = np.random.normal(0, 0.01)
                    lon_noise = np.random.normal(0, 0.01)

                    lat = base_loc["lat"] + lat_noise + \
                        (day * 0.01 * direction)
                    lon = base_loc["lon"] + lon_noise

                    speed = np.random.uniform(0, 5)
                    heading = np.random.uniform(0, 360)

                    data.append({
                        "animal_id": animal_id,
                        "species": np.random.choice(["deer", "elk", "bear", "wolf"]),
                        "timestamp": timestamp,
                        "latitude": lat,
                        "longitude": lon,
                        "speed": speed,
                        "heading": heading,
                        "battery_level": np.random.uniform(60, 100)
                    })

        df = pd.DataFrame(data)
        self.wildlife_data = df
        df.to_csv(os.path.join(self.data_dir, "raw",
                  "wildlife_data_raw.csv"), index=False)
        print(f"Generated wildlife data with shape: {df.shape}")

    def _generate_sample_traffic_data(self, n_segments=20, days=30, readings_per_day=24):
        """Generate sample traffic density and speed data."""
        np.random.seed(43)
        data = []
        start_date = datetime.datetime(2023, 1, 1)

        road_segments = {
            f"R{i}": {
                "start_lat": np.random.uniform(6.5, 37.1),
                "start_lon": np.random.uniform(68.7, 97.25),
                "end_lat": np.random.uniform(6.5, 37.1),
                "end_lon": np.random.uniform(68.7, 97.25),
                "road_type": np.random.choice(["highway", "primary", "secondary", "local"]),
                "speed_limit": np.random.choice([25, 35, 45, 55, 65, 75])
            } for i in range(1, n_segments + 1)
        }

        for segment_id, segment_info in road_segments.items():
            for day in range(days):
                for reading in range(readings_per_day):
                    timestamp = start_date + \
                        datetime.timedelta(days=day, hours=reading)

                    if 7 <= reading < 9 or 16 <= reading < 18:
                        traffic_factor = np.random.uniform(0.7, 1.0)
                    else:
                        traffic_factor = np.random.uniform(0.1, 0.6)

                    if timestamp.weekday() >= 5:
                        traffic_factor *= 0.7

                    vehicle_count = int(
                        100 * traffic_factor * (1 + np.random.normal(0, 0.1)))
                    speed = max(
                        segment_info['speed_limit'] * (1 - traffic_factor) * (1 + np.random.normal(0, 0.05)), 0)

                    data.append({
                        "road_segment_id": segment_id,
                        "timestamp": timestamp,
                        "start_lat": segment_info['start_lat'],
                        "start_lon": segment_info['start_lon'],
                        "end_lat": segment_info['end_lat'],
                        "end_lon": segment_info['end_lon'],
                        "road_type": segment_info['road_type'],
                        "speed_limit": segment_info['speed_limit'],
                        "vehicle_count": vehicle_count,
                        "speed": speed,
                        "weather_condition": np.random.choice(["clear", "rainy", "foggy", "snowy"])
                    })

        df = pd.DataFrame(data)
        self.traffic_data = df
        df.to_csv(os.path.join(self.data_dir, "raw",
                  "traffic_data_raw.csv"), index=False)
        print(f"Generated traffic data with shape: {df.shape}")

    # ===================== 2. DATA CLEANING =====================

    def clean_wildlife_data(self):
        if not hasattr(self, 'wildlife_data'):
            raise ValueError("Wildlife data must be generated first.")

        df = self.wildlife_data.copy()
        df.drop_duplicates(inplace=True)
        df = df[(df['latitude'] >= -90) & (df['latitude'] <= 90) &
                (df['longitude'] >= -180) & (df['longitude'] <= 180)]
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df.sort_values(['animal_id', 'timestamp'], inplace=True)

        self.wildlife_data_cleaned = df
        print(f"Cleaned wildlife data: {df.shape}")

    def clean_traffic_data(self):
        if not hasattr(self, 'traffic_data'):
            raise ValueError("Traffic data must be generated first.")

        df = self.traffic_data.copy()
        df.drop_duplicates(inplace=True)
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df.sort_values(['road_segment_id', 'timestamp'], inplace=True)

        self.traffic_data_cleaned = df
        print(f"Cleaned traffic data: {df.shape}")

    # ===================== 3. DATA AUGMENTATION =====================

    def augment_wildlife_data(self, num_synthetic_tracks=100):
        if not hasattr(self, 'wildlife_data_cleaned'):
            raise ValueError("Wildlife data must be cleaned first.")

        synthetic_data = []
        animal_ids = self.wildlife_data_cleaned['animal_id'].unique()
        for i in range(num_synthetic_tracks):
            base_animal_id = np.random.choice(animal_ids)
            base_animal_data = self.wildlife_data_cleaned[
                self.wildlife_data_cleaned['animal_id'] == base_animal_id
            ].copy()

            synthetic_track = base_animal_data.copy()
            synthetic_track['animal_id'] = f"synthetic_{base_animal_id}_{i}"
            synthetic_data.append(synthetic_track)

        # Assign to class attribute
        self.wildlife_data_augmented = pd.concat(synthetic_data, ignore_index=True)
        print(f"Augmented wildlife data: {self.wildlife_data_augmented.shape}")

    # ===================== 4. DATA MERGING =====================

    def merge_datasets(self):
        if not hasattr(self, 'wildlife_data_cleaned') or not hasattr(self, 'traffic_data_cleaned'):
            raise ValueError(
                "Both wildlife and traffic data must be cleaned first.")

        df = pd.merge(
            self.wildlife_data_cleaned,
            self.traffic_data_cleaned,
            on='timestamp',
            how='inner'
        )
        self.merged_data = df
        print(f"Merged dataset: {df.shape}")


# ===================== EXECUTION =====================
pipeline = WildlifeTrafficDataPipeline()
pipeline._generate_sample_gps_data()
pipeline._generate_sample_traffic_data()
pipeline.clean_wildlife_data()
pipeline.clean_traffic_data()
pipeline.augment_wildlife_data()
pipeline.merge_datasets()

Generated wildlife data with shape: (7200, 8)
Generated traffic data with shape: (14400, 11)
Cleaned wildlife data: (7200, 8)
Cleaned traffic data: (14400, 11)
Augmented wildlife data: (72000, 8)
Merged dataset: (144000, 18)


In [43]:
pipeline.wildlife_data.head()

Unnamed: 0,animal_id,species,timestamp,latitude,longitude,speed,heading,battery_level
0,A1,elk,2023-01-01 00:00:00,17.950799,95.846036,1.460723,131.890264,63.624257
1,A1,deer,2023-01-01 01:00:00,17.946164,95.857763,4.916154,168.034642,84.301794
2,A1,bear,2023-01-01 02:00:00,17.954925,95.852368,0.488361,246.32389,87.330541
3,A1,elk,2023-01-01 03:00:00,17.97223,95.846625,0.866823,140.781819,86.500891
4,A1,bear,2023-01-01 04:00:00,17.963016,95.823297,2.733551,66.547604,93.691391


In [44]:
pipeline.traffic_data.head()

Unnamed: 0,road_segment_id,timestamp,start_lat,start_lon,end_lat,end_lon,road_type,speed_limit,vehicle_count,speed,weather_condition
0,R1,2023-01-01 00:00:00,10.02067,86.08885,10.581764,75.568834,primary,55,17,47.812753,clear
1,R1,2023-01-01 01:00:00,10.02067,86.08885,10.581764,75.568834,primary,55,14,49.104932,snowy
2,R1,2023-01-01 02:00:00,10.02067,86.08885,10.581764,75.568834,primary,55,34,38.405693,snowy
3,R1,2023-01-01 03:00:00,10.02067,86.08885,10.581764,75.568834,primary,55,32,38.4107,rainy
4,R1,2023-01-01 04:00:00,10.02067,86.08885,10.581764,75.568834,primary,55,20,43.216453,rainy


In [45]:
pipeline.wildlife_data_augmented.head()

Unnamed: 0,animal_id,species,timestamp,latitude,longitude,speed,heading,battery_level
0,synthetic_A5_0,elk,2023-01-01 00:00:00,24.890581,88.916325,2.520742,289.524164,86.579079
1,synthetic_A5_0,bear,2023-01-01 01:00:00,24.872263,88.912811,2.454625,68.526802,66.799852
2,synthetic_A5_0,bear,2023-01-01 02:00:00,24.885974,88.909054,0.069967,263.652444,87.775418
3,synthetic_A5_0,wolf,2023-01-01 03:00:00,24.900889,88.905152,2.981883,131.804273,87.893031
4,synthetic_A5_0,deer,2023-01-01 04:00:00,24.8856,88.918434,3.905216,259.443489,78.011179


In [46]:
pipeline.merged_data.tail()

Unnamed: 0,animal_id,species,timestamp,latitude,longitude,speed_x,heading,battery_level,road_segment_id,start_lat,start_lon,end_lat,end_lon,road_type,speed_limit,vehicle_count,speed_y,weather_condition
143995,A9,bear,2023-01-30 23:00:00,15.510909,83.68537,4.863759,99.008152,97.882369,R5,32.477538,96.435328,18.292534,95.950636,primary,75,44,38.572463,foggy
143996,A9,bear,2023-01-30 23:00:00,15.510909,83.68537,4.863759,99.008152,97.882369,R6,10.774585,75.273332,12.532723,91.496952,local,35,19,29.158058,snowy
143997,A9,bear,2023-01-30 23:00:00,15.510909,83.68537,4.863759,99.008152,97.882369,R7,19.385536,81.399651,31.737891,73.343781,highway,45,16,36.611377,snowy
143998,A9,bear,2023-01-30 23:00:00,15.510909,83.68537,4.863759,99.008152,97.882369,R8,33.253666,96.205768,30.830559,72.081811,secondary,45,24,36.301671,clear
143999,A9,bear,2023-01-30 23:00:00,15.510909,83.68537,4.863759,99.008152,97.882369,R9,19.105761,93.396239,27.068625,86.651834,secondary,75,64,30.995366,foggy


In [47]:
import requests
import pandas as pd

# Base URL for the GBIF Occurrence API
url = "https://api.gbif.org/v1/occurrence/search"

# Query parameters
params = {
    "country": "IN",  # India country code
    "hasCoordinate": "true",  # Ensure it has location data
    "eventDate": "2024-01-01,2025-12-31",  # Fetch data from 2024 to 2025
    "taxonKey": "359",  # Example: Taxon key for mammals
    "limit": 50  # Limit number of records per request
}

# Make the request
response = requests.get(url, params=params)

# Check the response
if response.status_code == 200:
    data = response.json()["results"]
    df = pd.DataFrame(data)
    print(df[["scientificName", "decimalLatitude", "decimalLongitude", "eventDate"]])
else:
    print(f"Error: {response.status_code}, {response.text}")

                                     scientificName  decimalLatitude  \
0            Boselaphus tragocamelus (Pallas, 1766)        25.089458   
1                  Canis lupus pallipes Sykes, 1831        25.127978   
2              Macaca assamensis (McClelland, 1840)        27.468769   
3         Funambulus tristriatus (Waterhouse, 1837)        19.193394   
4         Funambulus tristriatus (Waterhouse, 1837)        10.527642   
5                        Felis catus Linnaeus, 1758        10.527642   
6                    Felis chaus affinis Gray, 1830        25.090233   
7               Pteropus giganteus (Brünnich, 1782)         9.988740   
8               Pteropus giganteus (Brünnich, 1782)        10.527642   
9               Pteropus giganteus (Brünnich, 1782)        10.527642   
10                 Panthera tigris (Linnaeus, 1758)        12.131064   
11             Funambulus palmarum (Linnaeus, 1766)        12.947871   
12                      Canis aureus Linnaeus, 1758        13.16

In [2]:
import requests
import pandas as pd

# GBIF API URL
GBIF_URL = "https://api.gbif.org/v1/occurrence/search"


def fetch_gbif_data(limit=100000):
    params = {
        # Example: Mammals (you can modify this based on the species)
        'taxonKey': 359,
        'hasCoordinate': True,  # Only records with coordinates
        'country': 'IN',  # India
        'limit': limit
    }

    response = requests.get(GBIF_URL, params=params)
    if response.status_code == 200:
        data = response.json().get('results', [])
        df = pd.json_normalize(data)
        if not df.empty:
            return df[['scientificName', 'decimalLatitude', 'decimalLongitude', 'eventDate', 'country', 'basisOfRecord']]
        else:
            print("No data found.")
            return pd.DataFrame()
    else:
        print(f"Error fetching GBIF data: {response.status_code}")
        print(response.text)
        return pd.DataFrame()


# Fetch GBIF data
gbif_data = fetch_gbif_data(limit=100000)
print(gbif_data.head())
# Save GBIF data to a CSV file
if not gbif_data.empty:
    gbif_data.to_csv('gbif_data.csv', index=False)
    print("Data saved to 'gbif_data.csv'")


                              scientificName  decimalLatitude  \
0     Boselaphus tragocamelus (Pallas, 1766)        25.089458   
1           Canis lupus pallipes Sykes, 1831        25.127978   
2       Macaca assamensis (McClelland, 1840)        27.468769   
3  Funambulus tristriatus (Waterhouse, 1837)        19.193394   
4  Funambulus tristriatus (Waterhouse, 1837)        10.527642   

   decimalLongitude         eventDate country      basisOfRecord  
0         81.702629  2025-01-01T18:15   India  HUMAN_OBSERVATION  
1         81.798611  2025-01-01T18:31   India  HUMAN_OBSERVATION  
2         96.419196  2025-01-01T11:23   India  HUMAN_OBSERVATION  
3         73.018777  2025-01-01T08:40   India  HUMAN_OBSERVATION  
4         76.214435  2025-01-01T15:47   India  HUMAN_OBSERVATION  
Data saved to 'gbif_data.csv'


In [51]:
import requests
import pandas as pd
import time
import os

# GBIF API URL
GBIF_URL = "https://api.gbif.org/v1/occurrence/search"
CSV_FILE = 'gbif_data.csv'
POLL_INTERVAL = 30  # Poll every 30 seconds (adjust as needed)


def fetch_gbif_data(limit=100, last_event_date=None):
    params = {
        'taxonKey': 359,  # Mammals
        'hasCoordinate': True,
        'country': 'IN',
        'limit': limit
    }
    if last_event_date:
        params['eventDate'] = f'>{last_event_date}'

    response = requests.get(GBIF_URL, params=params)
    if response.status_code == 200:
        data = response.json().get('results', [])
        if data:
            df = pd.json_normalize(data)
            new_records = df[['scientificName', 'decimalLatitude',
                              'decimalLongitude', 'eventDate', 'country', 'basisOfRecord']]
            return new_records
    else:
        print(f"Error fetching GBIF data: {response.status_code}")
        print(response.text)
    return pd.DataFrame()


def get_last_event_date():
    if os.path.exists(CSV_FILE):
        df = pd.read_csv(CSV_FILE)
        if not df.empty:
            return df['eventDate'].max()
    return None


def update_csv():
    last_event_date = get_last_event_date()
    new_data = fetch_gbif_data(limit=100, last_event_date=last_event_date)

    if not new_data.empty:
        if os.path.exists(CSV_FILE):
            new_data.to_csv(CSV_FILE, mode='a', index=False,
                            header=False)  # Append without header
        else:
            new_data.to_csv(CSV_FILE, index=False)
        print(f"Added {len(new_data)} new records to {CSV_FILE}")
    else:
        print("No new data found.")


# Continuous polling loop
try:
    while True:
        update_csv()
        time.sleep(POLL_INTERVAL)  # Wait before fetching again
except KeyboardInterrupt:
    print("\nStopped fetching data.")

Added 100 new records to gbif_data.csv
Error fetching GBIF data: 400
>2025-01-12T15:07:58 is not a valid date
No new data found.
Error fetching GBIF data: 400
>2025-01-12T15:07:58 is not a valid date
No new data found.
Error fetching GBIF data: 400
>2025-01-12T15:07:58 is not a valid date
No new data found.
Error fetching GBIF data: 400
>2025-01-12T15:07:58 is not a valid date
No new data found.
Error fetching GBIF data: 400
>2025-01-12T15:07:58 is not a valid date
No new data found.
Error fetching GBIF data: 400
>2025-01-12T15:07:58 is not a valid date
No new data found.
Error fetching GBIF data: 400
>2025-01-12T15:07:58 is not a valid date
No new data found.
Error fetching GBIF data: 400
>2025-01-12T15:07:58 is not a valid date
No new data found.
Error fetching GBIF data: 400
>2025-01-12T15:07:58 is not a valid date
No new data found.
Error fetching GBIF data: 400
>2025-01-12T15:07:58 is not a valid date
No new data found.
Error fetching GBIF data: 400
>2025-01-12T15:07:58 is not a v

ConnectionError: HTTPSConnectionPool(host='api.gbif.org', port=443): Max retries exceeded with url: /v1/occurrence/search?taxonKey=359&hasCoordinate=True&country=IN&limit=100&eventDate=%3E2025-01-12T15%3A07%3A58 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000025FD6262D50>: Failed to resolve 'api.gbif.org' ([Errno 11001] getaddrinfo failed)"))

In [4]:
import requests
import pandas as pd
import time

# GBIF API URL
GBIF_URL = "https://api.gbif.org/v1/occurrence/search"


def fetch_gbif_data(limit=100000, batch_size=300):
    all_data = []
    offset = 0

    while len(all_data) < limit:
        params = {
            'taxonKey': 359,  # Example: Mammals
            'hasCoordinate': True,
            'country': 'IN',  # India
            'limit': batch_size,
            'offset': offset
        }

        response = requests.get(GBIF_URL, params=params)
        if response.status_code == 200:
            data = response.json().get('results', [])
            if not data:
                break
            all_data.extend(data)
            offset += batch_size
            print(f"Fetched {len(all_data)} records...")
            time.sleep(1)  # To avoid hitting the API rate limit
        else:
            print(f"Error fetching GBIF data: {response.status_code}")
            print(response.text)
            break

    df = pd.json_normalize(all_data)
    if not df.empty:
        return df[['scientificName', 'decimalLatitude', 'decimalLongitude', 'eventDate', 'country', 'basisOfRecord']]
    else:
        return pd.DataFrame()


# Fetch GBIF data
gbif_data = fetch_gbif_data(limit=100000)

# Save GBIF data to a CSV file
if not gbif_data.empty:
    gbif_data.to_csv('gbif_data.csv', index=False)
    print(f"Data saved to 'gbif_data.csv' with {len(gbif_data)} records.")
else:
    print("No data to save.")

Fetched 300 records...
Fetched 600 records...
Fetched 900 records...
Fetched 1200 records...
Fetched 1500 records...
Fetched 1800 records...
Fetched 2100 records...
Fetched 2400 records...
Fetched 2700 records...
Fetched 3000 records...
Fetched 3300 records...
Fetched 3600 records...
Fetched 3900 records...
Fetched 4200 records...
Fetched 4500 records...
Fetched 4800 records...
Fetched 5100 records...
Fetched 5400 records...
Fetched 5700 records...
Fetched 6000 records...
Fetched 6300 records...
Fetched 6600 records...
Fetched 6900 records...
Fetched 7200 records...
Fetched 7500 records...
Fetched 7800 records...
Fetched 8100 records...
Fetched 8400 records...
Fetched 8700 records...
Fetched 9000 records...
Fetched 9300 records...
Fetched 9600 records...
Fetched 9900 records...
Fetched 10200 records...
Fetched 10500 records...
Fetched 10800 records...
Fetched 11100 records...
Fetched 11400 records...
Fetched 11700 records...
Fetched 12000 records...
Fetched 12300 records...
Fetched 126