In [None]:
import sys
import os
import numpy as np
from pathlib import Path
# Resolve the project layout relative to the directory the kernel runs in.
notebook_path = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_path, os.pardir))

# Put the project root at the front of the module search path.
sys.path[:0] = [project_root]

# <project_root>/data and <project_root>/data/raw; the raw folder (and any
# missing parent) is created when absent and left alone when it already exists.
data_dir, raw_dir = (
    os.path.join(project_root, *parts) for parts in (('data',), ('data', 'raw'))
)
Path(raw_dir).mkdir(parents=True, exist_ok=True)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import cartopy.crs as ccrs
import cartopy.feature as cfeature
from helper import get_dataset_with_copy

### File reading and copy variable declaration
We create two versions of each dataframe: a main one and a copy.
The copy preserves our original, untouched data as a backup reference, while the main dataframe is used for active analysis and transformations.
This is particularly useful in Jupyter notebooks, where we can always refer back to the original state of our data without reloading the file or restarting the notebook and running everything again.

In [None]:
# Locations of the two raw CSV files inside the raw-data folder.
movement_csv = os.path.join(raw_dir, 'jaguar_movement_data.csv')
info_csv = os.path.join(raw_dir, 'jaguar_additional_information.csv')

# Each helper call is unpacked into an "_original" frame and a working frame,
# in that order.
jaguar_data_original, jaguar_data = get_dataset_with_copy(movement_csv)
jaguar_info_original, jaguar_info = get_dataset_with_copy(info_csv)

#### Initial Data Exploration
Display basic information about both datasets including their structure and dimensions

In [None]:
# First rows followed by the (rows, columns) shape: movement data first,
# then the additional-information table.
for original_frame in (jaguar_data_original, jaguar_info_original):
    print(original_frame.head())
    print(original_frame.shape)

Check detailed information about data types and null values in both datasets

In [None]:
# DataFrame.info() writes its report (dtypes, non-null counts, memory usage)
# to stdout itself and returns None, so wrapping it in print() only added a
# stray "None" line after each report.
jaguar_data_original.info()
jaguar_info_original.info()

In [None]:
# Summary statistics of the untouched movement data, transposed so that each
# described column becomes one row.
jaguar_data_original.describe().transpose()

#### Data Quality Check
Check for missing values in both datasets

In [None]:
# Per-column null count and number of distinct values, movement data.
df_summary_data = pd.concat(
    [
        jaguar_data_original.isnull().sum().rename("nulls"),
        jaguar_data_original.nunique().rename("uniques"),
    ],
    axis=1,
)
print(df_summary_data)

# Same two measures for the additional-information table.
df_summary_info = pd.concat(
    [
        jaguar_info_original.isnull().sum().rename("nulls"),
        jaguar_info_original.nunique().rename("uniques"),
    ],
    axis=1,
)
print(df_summary_info)

We can already see that:
- In the jaguar movement dataset, the column `individual.taxon.canonical.name` has a single unique value, so it carries no information and we will remove it from the dataset.
- The only columns with null values we can find are the columns Estimated Age and Weight in the jaguar info dataset.

In [None]:
# Remove the species-name column. errors='ignore' keeps the cell re-runnable:
# on a second execution the column is already gone and a plain drop() would
# raise KeyError.
jaguar_data = jaguar_data.drop(
    columns=['individual.taxon.canonical.name'], errors='ignore'
)

# Replace missing ages and weights with 0.
# NOTE(review): 0 is a placeholder rather than a measurement, so it will pull
# down any mean/histogram computed on these columns - confirm this is intended.
for column in ("Estimated Age", "Weight"):
    jaguar_info[column] = jaguar_info[column].fillna(value=0)

Let's start by showing a histogram of every numeric column and then a pie chart for every non-numeric column.
For this we will ignore identifiers and timestamps.

In [None]:
# Histograms of the movement data, used to spot single-valued columns that
# could be removed.
jaguar_data.hist(figsize=(20, 20))
plt.show()

# Pie chart of the value distribution of each object-dtype column, leaving
# out the timestamp and the tag identifier.
excluded_columns = ('timestamp', 'tag.local.identifier')
for column_name in jaguar_data.select_dtypes(include='object').columns:
    if column_name in excluded_columns:
        continue
    category_counts = jaguar_data[column_name].value_counts()
    print(column_name, category_counts)
    plt.figure(figsize=(8, 6), facecolor='white')
    plt.pie(
        category_counts,
        labels=category_counts.index,
        autopct='%1.1f%%',
        startangle=90,
    )
    plt.title(f"{column_name} Distribution")
    plt.show()

In [None]:
# Summary statistics of the untouched additional-information table,
# transposed so that each described column becomes one row.
jaguar_info_original.describe().transpose()

In [None]:
# Histograms of the jaguar information table, used to spot single-valued
# columns that could be removed.
jaguar_info.hist(figsize=(20, 20))
plt.show()

# Pie chart of the value distribution of every object-dtype column.
for column_name in jaguar_info.select_dtypes(include='object').columns:
    category_counts = jaguar_info[column_name].value_counts()
    plt.figure(figsize=(8, 6), facecolor='white')
    plt.pie(
        category_counts,
        labels=category_counts.index,
        autopct='%1.1f%%',
        startangle=90,
    )
    plt.title(f"{column_name} Distribution")
    plt.show()

In [None]:
# Feature frame for the correlation study: everything except the timestamp
# and the tag identifier.
X = jaguar_data.drop(columns=['timestamp', 'tag.local.identifier'])
categorical_cols = X.select_dtypes(include=[object]).columns.tolist()
print(categorical_cols)

# One-hot encode the object columns. dtype=int makes only the generated dummy
# columns integer; the previous blanket `astype(int)` also truncated every
# non-integer numeric feature to a whole number before correlating, which
# distorts the resulting coefficients.
corr = pd.get_dummies(X, columns=categorical_cols, drop_first=True, dtype=int)
# A short preview is enough to check the encoding; the full frame is not dumped.
print(corr.head())

correlation_matrix = corr.corr()

fig, ax = plt.subplots(figsize=(30, 20))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    cbar=True,
    square=True,
    cbar_kws={"shrink": .75},
    linewidths=.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of Features', fontsize=16)
fig.tight_layout()
plt.show()

TEMPORAL ANALYSIS

In [None]:
jaguar_id_col = 'individual.local.identifier (ID)'

# Parse timestamps; values that cannot be parsed become NaT instead of raising.
jaguar_data['timestamp'] = pd.to_datetime(jaguar_data['timestamp'], errors='coerce')

# Check if conversion was successful
print(jaguar_data['timestamp'].dtype)

# Report how many rows are about to be discarded so the coercion is not silent.
n_invalid_timestamps = int(jaguar_data['timestamp'].isna().sum())
if n_invalid_timestamps:
    print(f"Dropping {n_invalid_timestamps} rows with unparseable timestamps")

# Drop invalid rows and add the calendar components in one chained step.
# assign() returns a new frame, so no column is ever written onto the result
# of dropna() (the pattern that triggers SettingWithCopyWarning).
jaguar_data = jaguar_data.dropna(subset=['timestamp']).assign(
    year=lambda frame: frame['timestamp'].dt.year,
    month=lambda frame: frame['timestamp'].dt.month,
    day=lambda frame: frame['timestamp'].dt.day,
    hour=lambda frame: frame['timestamp'].dt.hour,
    day_of_week=lambda frame: frame['timestamp'].dt.dayofweek,
)

# Visualization Subplots
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
fig.suptitle('Temporal Distribution of Jaguar Movements', fontsize=16)

# Observation counts per calendar component.
yearly_counts = jaguar_data.groupby('year').size()
monthly_counts = jaguar_data.groupby('month').size()
day_of_week_counts = jaguar_data.groupby('day_of_week').size()
hourly_counts = jaguar_data.groupby('hour').size()

# Top row: one bar chart per count series.
bar_panels = (
    (yearly_counts, axes[0, 0], 'Observations per Year', 'Year'),
    (monthly_counts, axes[0, 1], 'Observations per Month', 'Month'),
    (day_of_week_counts, axes[0, 2], 'Observations by Day of Week',
     'Day of Week (0=Monday, 6=Sunday)'),
)
for panel_counts, panel_ax, panel_title, panel_xlabel in bar_panels:
    panel_counts.plot(kind='bar', ax=panel_ax, title=panel_title)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel('Number of Observations')

# Hourly Movement Distribution
hourly_counts.plot(kind='line', marker='o', ax=axes[1, 0], title='Hourly Movement Distribution')
axes[1, 0].set_xlabel('Hour of Day')
axes[1, 0].set_ylabel('Number of Observations')


def time_between_observations(group):
    """Return the mean gap between consecutive observations in ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single jaguar; must contain a datetime 'timestamp' column.

    Returns
    -------
    pandas.Timedelta
        Mean difference between chronologically adjacent timestamps, or NaT
        when the group holds fewer than two rows.

    The timestamps are sorted first: the mean of diff() telescopes to
    (last row - first row) / (n - 1), so it is only meaningful when the rows
    are in chronological order.
    """
    return group['timestamp'].sort_values().diff().mean()


# Selecting [['timestamp']] hands the function only the column it needs and
# keeps the grouping column out of apply(), which newer pandas versions
# deprecate.
observation_intervals = (
    jaguar_data.groupby(jaguar_id_col)[['timestamp']].apply(time_between_observations)
)

# Plot the intervals in hours so the y-axis carries a readable unit instead
# of raw Timedelta values.
interval_hours = pd.to_timedelta(observation_intervals).dt.total_seconds() / 3600
interval_hours.plot(kind='bar', ax=axes[1, 1], title='Average Time Between Observations')
axes[1, 1].set_xlabel('Jaguar ID')
axes[1, 1].set_ylabel('Average Time Interval (hours)')

# Cumulative observations over time
cumulative_obs = jaguar_data.groupby('timestamp').size().cumsum()
cumulative_obs.plot(ax=axes[1, 2], title='Cumulative Observations Over Time')
axes[1, 2].set_xlabel('Date')
axes[1, 2].set_ylabel('Cumulative Observations')

plt.tight_layout()
plt.show()

# Additional Temporal Insights
print("\nTemporal Analysis Insights:")
print(f"Total Observation Period: {jaguar_data['timestamp'].min()} to {jaguar_data['timestamp'].max()}")
print(f"Total Observations: {len(jaguar_data)}")
print(f"Number of Individual Jaguars: {jaguar_data[jaguar_id_col].nunique()}")
    

GEOSPATIAL ANALYSIS

In [None]:
jaguar_id_col = 'individual.local.identifier (ID)'
unique_jaguars = sorted(jaguar_data[jaguar_id_col].unique())

print("\nGeospatial Analysis Insights:")

# Descriptive statistics of the coordinates over all jaguars together.
longitude_stats = jaguar_data['location.long'].describe()
latitude_stats = jaguar_data['location.lat'].describe()

print("\nOverall Longitude Statistics:")
print(longitude_stats)
print("\nOverall Latitude Statistics:")
print(latitude_stats)

# Hexagonal-bin density of all observations on the longitude/latitude plane.
fig, ax = plt.subplots(figsize=(15, 10))
density = ax.hexbin(
    jaguar_data['location.long'],
    jaguar_data['location.lat'],
    gridsize=30,
    cmap='YlOrRd',
)
fig.colorbar(density, ax=ax, label='Observation Density')
ax.set_title('Jaguar Movement Density Heatmap')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Coordinate bounding box per jaguar. A single grouped aggregation replaces
# the earlier boolean-mask scan of the whole frame for every ID; groupby
# sorts its keys, so the jaguars are still listed in ascending ID order.
coordinate_bounds = (
    jaguar_data
    .groupby(jaguar_id_col)[['location.long', 'location.lat']]
    .agg(['min', 'max'])
)

print("\nIndividual Jaguar Movement Areas:")
for jaguar_id, bounds in coordinate_bounds.iterrows():
    min_long = bounds[('location.long', 'min')]
    max_long = bounds[('location.long', 'max')]
    min_lat = bounds[('location.lat', 'min')]
    max_lat = bounds[('location.lat', 'max')]

    print(f"\nJaguar {jaguar_id}:")
    print(f"Longitude Range: {min_long:.4f}° to {max_long:.4f}°")
    print(f"Latitude Range: {min_lat:.4f}° to {max_lat:.4f}°")

Check the number of unique (date, latitude, longitude) groups.

In [None]:
# Work on a separate copy of the untouched movement data so the extra
# columns added here stay local to this analysis.
group_copy = jaguar_data_original.copy()
group_copy['timestamp'] = pd.to_datetime(group_copy['timestamp'])

# Calendar day of each observation (time of day removed).
group_copy['date'] = group_copy['timestamp'].dt.date

# One row per distinct (date, latitude, longitude) triple plus its row count.
grouping_keys = ['date', 'location.lat', 'location.long']
location_groups = group_copy.groupby(grouping_keys).size().reset_index(name='count')

n_date_location_groups = len(location_groups)
n_dates = group_copy['date'].nunique()

# Summary figures; 'locations_by_date' describes how many distinct
# coordinate pairs were recorded on each date.
stats = dict(
    total_observations=len(group_copy),
    unique_locations=n_date_location_groups,
    unique_dates=n_dates,
    avg_locations_per_day=n_date_location_groups / n_dates,
    locations_by_date=location_groups.groupby('date').size().describe(),
)

# Report: (label, stats key, format spec) for each single-value line.
print("\nLocation Analysis Summary:")
report_lines = (
    ("Total observations", 'total_observations', ','),
    ("Unique date-location combinations", 'unique_locations', ','),
    ("Number of unique dates", 'unique_dates', ','),
    ("Average unique locations per day", 'avg_locations_per_day', '.2f'),
)
for label, key, spec in report_lines:
    print(f"{label}: {stats[key]:{spec}}")
print("\nDaily unique locations statistics:")
print(stats['locations_by_date'])