In [None]:
# Install the Kaggle API library
pip install kaggle

In [None]:
# Upload your Kaggle API token (kaggle.json) through the Colab file picker.
from google.colab import files

# Bind the result instead of leaving the call as the cell's last expression:
# files.upload() returns a dict of {file name: raw file bytes}, so echoing it
# would print the secret API key into the cell output saved with the notebook.
uploaded = files.upload()

In [None]:
# Create a hidden .kaggle directory if it doesn't exist
!mkdir ~/.kaggle

# Move the uploaded kaggle.json file to the .kaggle directory
!cp kaggle.json ~/.kaggle/

# Set permissions
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the US Accidents dataset from Kaggle.
# -d selects the dataset by its <owner>/<dataset> identifier. The archive is
# written to the current working directory (the next cell unzips
# us-accidents.zip from there). Needs the API token set up under ~/.kaggle.
!kaggle datasets download -d sobhanmoosavi/us-accidents

In [None]:
# Unzip the downloaded dataset
!unzip us-accidents.zip

In [None]:
# cuDF provides the GPU-backed DataFrame used for the rest of the notebook
import cudf

# Read the extracted accidents CSV into a cuDF DataFrame
df = cudf.read_csv(
    "/content/US_Accidents_March23.csv",
)

# Print the column / dtype / memory summary of the loaded frame
df.info()

In [None]:
# Columns holding the accident start position
coord_cols = ['Start_Lat', 'Start_Lng']

# Discard rows where either coordinate is missing
df = df.dropna(subset=coord_cols)

# Retain only rows whose latitude is within [-90, 90] and longitude within [-180, 180]
lat_in_range = df['Start_Lat'].between(-90, 90)
lng_in_range = df['Start_Lng'].between(-180, 180)
df = df[lat_in_range & lng_in_range]

# Summary statistics of the cleaned coordinates (shown as the cell output)
df[coord_cols].describe()

In [None]:
# Centre (mean) and spread (standard deviation) of each coordinate
lat_mean, lat_std = df['Start_Lat'].mean(), df['Start_Lat'].std()
lng_mean, lng_std = df['Start_Lng'].mean(), df['Start_Lng'].std()

# A row is an outlier when either coordinate lies more than three standard
# deviations away from that coordinate's mean
lat_is_far = (df['Start_Lat'] - lat_mean).abs() > 3 * lat_std
lng_is_far = (df['Start_Lng'] - lng_mean).abs() > 3 * lng_std
outliers = df[lat_is_far | lng_is_far]

# Preview the coordinates of the first few outliers
outliers[['Start_Lat', 'Start_Lng']].head()

In [None]:
import matplotlib.pyplot as plt


def _column_for_plot(frame, column):
    """Return frame[column], converted with to_pandas() when frame is a cuDF DataFrame.

    matplotlib is handed pandas Series; a frame that is not a cuDF DataFrame
    has its column returned unchanged.
    """
    if isinstance(frame, cudf.DataFrame):
        return frame[column].to_pandas()
    return frame[column]


# Host-side copies of the coordinates for all rows and for the outliers
df_lat_pandas = _column_for_plot(df, 'Start_Lat')
df_lng_pandas = _column_for_plot(df, 'Start_Lng')
outliers_lat_pandas = _column_for_plot(outliers, 'Start_Lat')
outliers_lng_pandas = _column_for_plot(outliers, 'Start_Lng')

fig, ax = plt.subplots(figsize=(8, 6))

# Every row as a small, faint dot
ax.scatter(df_lng_pandas, df_lat_pandas, s=1, alpha=0.1, label="Normal Points")

# Outliers drawn on top as larger red dots
ax.scatter(outliers_lng_pandas, outliers_lat_pandas, s=10, color='red', label="Outliers")

ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.set_title("US Accidents Outlier Detection (Std Dev Method)")
ax.legend()
plt.show()