<a href="https://colab.research.google.com/github/PETEROA/Anomaly/blob/main/Spatial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install geopandas pysal


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import geopandas as gpd
from pysal.explore import esda
from pysal.lib import weights
import matplotlib.pyplot as plt

In [None]:
# Read the air-pollution table from the working directory; the cells below
# use its 'latitude' / 'longitude' columns and several AQI columns.
df = pd.read_csv(filepath_or_buffer='air_pollution_final.csv')

In [None]:
pip install splot

In [None]:
from splot.esda import moran_scatterplot, lisa_cluster

# Turn the coordinate columns of `df` into point geometries.
# NOTE(review): no CRS is assigned; the columns look like WGS84 degrees — confirm
# before relying on any distance-based interpretation.
geometry = gpd.points_from_xy(df['longitude'], df['latitude'])
gdf = gpd.GeoDataFrame(df, geometry=geometry)

# Spatial weights matrix: every observation is linked to its 5 nearest neighbours.
w = weights.KNN.from_dataframe(gdf, k=5)

# List of variables to analyze
variables_to_analyze = ['AQI Value', 'Ozone AQI Value', 'PM2.5 AQI Value']  # Add more variables as needed

# Moran / Moran_Local derive their pseudo p-values from random permutations.
# Seeding NumPy's global generator makes the statistics and the significance
# masks on the cluster maps reproducible across kernel restarts.
MORAN_SEED = 42

# Global and local Moran's I for each variable
for variable in variables_to_analyze:
    # Re-seed per variable so results do not depend on the order of the list.
    np.random.seed(MORAN_SEED)

    moran = esda.Moran(gdf[variable], w)
    # Report the statistic itself; the scatterplot alone does not show it numerically.
    print(f"{variable}: Moran's I = {moran.I:.4f} (pseudo p-value = {moran.p_sim:.4f})")

    # Plot Moran Scatterplot
    moran_scatterplot(moran, aspect_equal=True)
    plt.title(f'Moran Scatterplot for {variable}')
    plt.show()

    # Plot LISA cluster map (observations significant at p < 0.05 are coloured by cluster type)
    lisa = esda.Moran_Local(gdf[variable], w)
    lisa_cluster(lisa, gdf, p=0.05, figsize=(9, 9))
    plt.title(f'LISA Cluster Map for {variable}')
    plt.show()


In [None]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Cluster the observations on position only: take the two coordinate columns of `gdf`.
coordinates = gdf.loc[:, ['latitude', 'longitude']]

# Put latitude and longitude on a common (zero-mean, unit-variance) scale before DBSCAN.
scaler = StandardScaler().fit(coordinates)
coordinates_scaled = scaler.transform(coordinates)

# Density-based clustering on the scaled coordinates; eps is therefore expressed in
# standardized units. scikit-learn labels noise points as -1.
dbscan = DBSCAN(eps=0.5, min_samples=5).fit(coordinates_scaled)
gdf['cluster'] = dbscan.labels_

# Map of the points, one colour per cluster label.
gdf.plot(
    column='cluster',
    categorical=True,
    legend=True,
    figsize=(12, 8),
    cmap='viridis',
    markersize=10,
)
plt.title('Spatial Clustering with DBSCAN')
plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

# Compute the neighbour distances inside this cell so it runs on a fresh kernel
# (Restart & Run All) instead of relying on a `distances` array left over from
# a cell executed out of order.
knn_coords = gdf[['latitude', 'longitude']]
distances, indices = NearestNeighbors(n_neighbors=10).fit(knn_coords).kneighbors(knn_coords)

# Plot a histogram of each point's largest distance among its 10 returned neighbours.
# The query points are the fitted points, so each point's own zero distance is one of the 10.
plt.hist(distances.max(axis=1), bins=50, edgecolor='black')
plt.xlabel('Distance to Nearest Neighbors')
plt.ylabel('Frequency')
plt.show()


In [None]:
# Distance rule for outliers: measure how far each point is from its nearest
# neighbours and flag the points whose neighbourhood is unusually wide.
from sklearn.neighbors import NearestNeighbors

# Cut-off applied to the neighbour distance (same units as the coordinate columns).
threshold_distance = 0.02 # Set your threshold value here

# Fit on the coordinates and query the same points for their 10 nearest neighbours.
neighbors = NearestNeighbors(n_neighbors=10).fit(gdf[['latitude', 'longitude']])
distances, indices = neighbors.kneighbors(gdf[['latitude', 'longitude']])

# Largest of the 10 neighbour distances per point, then the boolean outlier flag.
gdf['max_distance_to_neighbors'] = np.max(distances, axis=1)
gdf['is_outlier_distance'] = gdf['max_distance_to_neighbors'].gt(threshold_distance)


In [None]:
from sklearn.ensemble import IsolationForest
import pandas as pd
import numpy as np

# Requires `gdf` to already carry 'latitude', 'longitude' and
# 'max_distance_to_neighbors' (added by the nearest-neighbour cell).

# Select relevant columns
features = gdf[['latitude', 'longitude', 'max_distance_to_neighbors']]

# Isolation Forest grows randomized trees; a fixed random_state makes the set of
# flagged points reproducible across runs.
model = IsolationForest(contamination=0.01, random_state=42)  # Adjust the contamination parameter

# Fit the model to the features
model.fit(features)

# predict() returns -1 for outliers and 1 for inliers;
# recode to a 0/1 flag where 1 marks an outlier.
gdf['is_outlier_isolation_forest'] = np.where(model.predict(features) == -1, 1, 0)

# Display or use the results as needed
print(gdf[['latitude', 'longitude', 'is_outlier_isolation_forest']])


In [None]:
from sklearn.neighbors import LocalOutlierFactor
import pandas as pd

# Expects `gdf` to hold 'latitude', 'longitude' and 'max_distance_to_neighbors'.

# Score cut-off: rows whose LOF score falls below it are flagged.
threshold_lof = -1.5  # Adjust based on your data and experimentation

# Feature matrix: position plus the neighbour-distance column.
features = gdf[['latitude', 'longitude', 'max_distance_to_neighbors']]

# Fit Local Outlier Factor on the features (fit() returns the fitted estimator).
model = LocalOutlierFactor(contamination=0.05).fit(features)  # Adjust the contamination parameter

# Keep the raw scores (more negative = more abnormal) and derive a 0/1 flag from
# the manual cut-off above.
gdf['lof_scores'] = model.negative_outlier_factor_
gdf['is_outlier_lof'] = gdf['lof_scores'].lt(threshold_lof).astype(int)

# Display or use the results as needed
print(gdf[['latitude', 'longitude', 'is_outlier_lof']])


In [None]:
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

# Requires `gdf` to already carry 'latitude', 'longitude' and 'max_distance_to_neighbors'.

# Select relevant columns
features = gdf[['latitude', 'longitude', 'max_distance_to_neighbors']]

# fit_predict() returns -1 for outliers and 1 for inliers. Negating it turns each
# result into a vote: +1 = "outlier", -1 = "inlier".
lof_model = LocalOutlierFactor(contamination=0.05)
lof_scores = -lof_model.fit_predict(features)

# Fixed random_state so the Isolation Forest votes are reproducible.
if_model = IsolationForest(contamination=0.02, random_state=42)
if_scores = -if_model.fit_predict(features)

# Weighted average of the two votes. Weights sum to 1, so the result stays in
# [-1, 1]: 1 = both detectors say outlier, 0 = they disagree, -1 = both say inlier.
lof_weight, if_weight = 0.5, 0.5  # You can adjust the weights (keep them summing to 1)
ensemble_scores = lof_weight * lof_scores + if_weight * if_scores

# Higher score = more detectors voted "outlier". The previous rule
# (`ensemble_scores < -1.5` on scores limited to [-0.2, 0.2]) could never be
# true, so nothing was ever flagged.
# With equal weights: 0.0 flags only points both detectors agree on;
# lower it (e.g. -0.5) to flag points that either detector marks.
threshold_ensemble = 0.0  # Adjust based on your data and experimentation
gdf['is_outlier_ensemble'] = (ensemble_scores > threshold_ensemble).astype(int)

# Display or use the results as needed
print(gdf[['latitude', 'longitude', 'is_outlier_ensemble']])


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Precision/recall/F1 need a binary ground truth. The raw 'AQI Value' column is a
# multi-valued index, so scoring it directly against 0/1 predictions is meaningless.
# NOTE(review): no true outlier labels are visible in this notebook. As a stand-in,
# an observation counts as a reference positive when its AQI exceeds the threshold
# below. Confirm this proxy (or replace `y_true` with real labels) before quoting
# these numbers.
AQI_REFERENCE_THRESHOLD = 150  # assumes the US EPA AQI scale ("Unhealthy" starts above 150) — TODO confirm
y_true = (gdf['AQI Value'] > AQI_REFERENCE_THRESHOLD).astype(int)
y_pred = gdf['is_outlier_ensemble']

# Score the positive (outlier) class only; 'weighted' averaging would be dominated
# by the much larger inlier class. zero_division=0 returns 0 instead of warning
# when no positives are predicted or present.
precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
