In [None]:
import time

import folium
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim

# Load the two input tables from the working directory: taxon names and
# per-area distribution records. low_memory=False is passed for the names file
# only, exactly as before.
names = pd.read_csv("wcvp_names.csv", low_memory=False)
distribution = pd.read_csv("wcvp_distribution.csv")

In [None]:
# Missing-value count for every column of the raw distribution table.
distribution.isna().sum()

In [None]:
# Keep only the rows in which all four geographic identifier columns are
# populated, then show the remaining per-column missing-value counts.
columns = ["region_code_l2", "region", "area_code_l3", "area"]
has_all_geo_fields = distribution[columns].notna().all(axis=1)
distribution_cleaned = distribution[has_all_geo_fields]
print(distribution_cleaned.isna().sum())

In [None]:
# Distinct region names, in order of first appearance; these are the strings
# that get geocoded below.
unique_region = pd.unique(distribution_cleaned["region"])

In [None]:
# Region names for which geocoding returned nothing are collected here.
failed_locations = []

# Empty result table; one row per successfully geocoded region is added later.
region_df = pd.DataFrame(columns=["location", "latitude", "longitude"])

# Nominatim client used by do_geocode; the user agent identifies this project.
geolocator = Nominatim(user_agent="geoproject")


def do_geocode(location, max_retries=3, retry_delay=1.0):
    """Geocode ``location`` through the module-level ``geolocator``.

    A ``GeocoderTimedOut`` is retried a bounded number of times with a growing
    pause, instead of recursing without limit (which ends in ``RecursionError``
    during a persistent outage and hammers the service with back-to-back calls).

    Parameters
    ----------
    location : str
        Place name handed to ``geolocator.geocode``.
    max_retries : int, default 3
        Number of additional attempts made after the first timeout.
    retry_delay : float, default 1.0
        Base pause in seconds; attempt ``i`` waits ``retry_delay * (i + 1)``.

    Returns
    -------
    Whatever ``geolocator.geocode`` returns for the first attempt that does not
    time out, or ``None`` when every attempt timed out. The calling loop treats
    a falsy result as a failed location.
    """
    for attempt in range(max_retries + 1):
        try:
            return geolocator.geocode(location, timeout=10)
        except GeocoderTimedOut:
            # Back off before the next try; skip the pause after the last one.
            if attempt < max_retries:
                time.sleep(retry_delay * (attempt + 1))
    return None


# Geocode every distinct region once. Rows are gathered in a plain list and
# turned into a DataFrame in a single step: calling pd.concat inside the loop
# re-copies the whole frame on every iteration and starts from an empty
# object-dtype frame, which can leave latitude/longitude as object columns.
geocoded_rows = []
for location in unique_region:
    loc = do_geocode(location)
    if loc:
        geocoded_rows.append(
            {'location': location, 'latitude': loc.latitude, 'longitude': loc.longitude}
        )
    else:
        # Nothing usable came back for this name; keep it for manual handling.
        failed_locations.append(location)

region_df = pd.DataFrame(geocoded_rows, columns=['location', 'latitude', 'longitude'])

print(region_df)


print("Failed to geocode the following locations:")
print(failed_locations)

In [None]:
# Hand-picked (latitude, longitude) pairs for region names that are supplied
# manually instead of relying on the geocoder.
# NOTE(review): (60.0, 105.0) for "Russian Far East" looks like a point in
# central Siberia rather than the Far East - please confirm the coordinates.
manual_geocodes = {
    "Northern South America": (4.5709, -74.2973),
    "Papuasia": (-6.314993, 143.95555),
    "Southwestern Pacific": (-18.1425999, 178.4419499),
    "West Tropical Africa": (7.3697, -5.5353),
    "West-Central Tropical Africa": (-0.2280, 15.8277),
    "Northeast Tropical Africa": (12.6500, 39.6333),
    "East Tropical Africa": (-6.3690, 34.8888),
    "Middle Atlantic Ocean": (-14.5994, -28.6731),
    "Western Indian Ocean": (-12.5, 43.4),
    "Russian Far East": (60.0000, 105.0000),
    "Subarctic America": (58.3019, -134.4197),
    "Subantarctic Islands": (-54.4296, -36.5879),
    "Antarctic Continent": (-82.8628, 135.0000)
}

# Appending manually geocoded locations to the DataFrame with one concat
# instead of one concat per entry.
manual_df = pd.DataFrame(
    [
        {'location': location, 'latitude': lat, 'longitude': lon}
        for location, (lat, lon) in manual_geocodes.items()
    ],
    columns=['location', 'latitude', 'longitude'],
)

# 'location' is the join key of the later left merge, so it must be unique:
# a duplicated key would duplicate every distribution row of that region.
# keep='last' lets the manual entry override an automatic result and makes
# re-running this cell harmless.
region_df = (
    pd.concat([region_df, manual_df], ignore_index=True)
    .drop_duplicates(subset='location', keep='last')
    .reset_index(drop=True)
)

print("Final DataFrame with manually geocoded locations:")
print(region_df)

In [None]:
# Attach coordinates to every distribution record by matching its 'region'
# against region_df['location']. The left join keeps all distribution rows;
# the join key copied in from region_df is dropped again afterwards.
new_distribution = distribution_cleaned.merge(
    region_df,
    how="left",
    left_on="region",
    right_on="location",
).drop(columns=["location"])


In [None]:
# Columns from the names table that are not used in this analysis.
unwanted_columns = [
    "infraspecific_rank",
    "infraspecies",
    "nomenclatural_remarks",
    "replaced_synonym_author",
    "homotypic_synonym",
    "hybrid_formula",
    "basionym_plant_name_id",
    "parenthetical_author",
    "parent_plant_name_id",
]

# Left-join the taxon names onto the geocoded distribution records via
# 'plant_name_id', then discard the unwanted columns.
final_dataset = new_distribution.merge(
    names,
    how="left",
    on="plant_name_id",
).drop(columns=unwanted_columns)



In [None]:
# Missing-value count per column after both merges.
final_dataset.isna().sum()

In [None]:
# Summary statistics restricted to the object-dtype columns.
final_dataset.describe(include=["object"])

In [None]:
# A record is only usable when its family, genus and species are all present;
# rows missing any of the three are removed.
to_dropna = ["family", "genus", "species"]
final_dataset_cleaned = final_dataset.dropna(axis=0, how="any", subset=to_dropna)

In [None]:
# Remaining missing values per column in the cleaned dataset.
final_dataset_cleaned.isna().sum()

## Visualizing Species density with Heat map

In [None]:
# One row per (species, latitude, longitude) combination; the first occurrence
# of each combination is the one that is kept.
# NOTE(review): 'genus' is a separate column, so if 'species' holds only the
# epithet, identical epithets from different genera collapse here - confirm.
dedupe_keys = ["species", "latitude", "longitude"]
unique_species_data = final_dataset_cleaned.drop_duplicates(subset=dedupe_keys, keep="first")
print(unique_species_data.shape)

In [None]:
from folium.plugins import HeatMap

# World map centred on (0, 0).
m = folium.Map(location=[0, 0], zoom_start=2)

# Build the [lat, lon] pairs in one vectorised step instead of a Python-level
# iterrows() loop. Rows without coordinates (regions that were never geocoded
# come out of the left merge as NaN) are dropped because folium rejects NaN
# locations.
heat_data = (
    unique_species_data[['latitude', 'longitude']]
    .dropna()
    .astype(float)
    .to_numpy()
    .tolist()
)
HeatMap(heat_data).add_to(m)

# Saving the heatmap to an HTML file
m.save('unique_species_density_heatmap.html')


m

## Plotting Unique Species Count per Continent

In [None]:
# Number of distinct 'species' values recorded on each continent.
continent_species_counts = final_dataset_cleaned.groupby("continent")["species"].nunique()
print(continent_species_counts)

# Bar chart of the per-continent counts, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=continent_species_counts.index,
    y=continent_species_counts.values,
    palette="viridis",
    ax=ax,
)
ax.set_title("Species Count per Continent")
ax.set_xlabel("Continent")
ax.set_ylabel("Species Count")
ax.tick_params(axis="x", labelrotation=45)
plt.show()

In [None]:
# Number of distinct 'species' values recorded in each region.
region_species_counts = final_dataset_cleaned.groupby("region")["species"].nunique()
print(region_species_counts)

# Bar chart of the per-region counts; labels are rotated vertically because
# there are many regions.
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(
    x=region_species_counts.index,
    y=region_species_counts.values,
    palette="viridis",
    ax=ax,
)
ax.set_title("Species Count per Region")
ax.set_xlabel("Region")
ax.set_ylabel("Species Count")
ax.tick_params(axis="x", labelrotation=90)
plt.show()

## Interactive Geospatial Plots for Species Distribution

In [None]:
# Creating a base map
m = folium.Map(location=[0, 0], zoom_start=2)

# Adding one marker per unique species-location pair. Only the three needed
# columns are iterated, as plain tuples, which is much cheaper than iterrows()
# building a Series per row. Rows without coordinates are skipped because
# folium raises on NaN locations.
# NOTE(review): one Marker per row can produce a very large HTML file; a
# clustering plugin may be worth considering.
marker_rows = unique_species_data[['latitude', 'longitude', 'species']].dropna(
    subset=['latitude', 'longitude']
)
for lat, lon, species in marker_rows.itertuples(index=False, name=None):
    folium.Marker(
        location=[lat, lon],
        popup=species
    ).add_to(m)

# Saving the map to an HTML file
m.save('unique_species_distribution_map.html')


m

## Exploring Introduced and Extinct species

In [None]:
# Records whose 'introduced' flag equals 1.
is_introduced = final_dataset_cleaned["introduced"].eq(1)
introduced_species = final_dataset_cleaned.loc[is_introduced]

# Count each species at most once per continent.
unique_introduced_species = introduced_species.drop_duplicates(subset=["species", "continent"])

# Rows per continent, largest first.
introduced_per_continent = (
    unique_introduced_species
    .groupby("continent")
    .size()
    .sort_values(ascending=False)
)

print(introduced_per_continent)

# Bar chart of introduced species per continent.
fig, ax = plt.subplots(figsize=(14, 6))
introduced_per_continent.plot(kind="bar", ax=ax)
ax.set_title("Number of Unique Plant Species Introduced in Various Continents")
ax.set_xlabel("Continent")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Records whose 'extinct' flag equals 1.
is_extinct = final_dataset_cleaned["extinct"].eq(1)
extinct_species = final_dataset_cleaned.loc[is_extinct]

# Count each species at most once per continent.
unique_extinct_species = extinct_species.drop_duplicates(subset=["species", "continent"])

# Rows per continent, largest first.
extinct_per_continent = (
    unique_extinct_species
    .groupby("continent")
    .size()
    .sort_values(ascending=False)
)

print(extinct_per_continent)

# Bar chart of extinct species per continent.
fig, ax = plt.subplots(figsize=(14, 6))
extinct_per_continent.plot(kind="bar", ax=ax)
ax.set_title("Number of Unique Plant Species extinct in Various Continents")
ax.set_xlabel("Continent")
ax.set_ylabel("Count")
plt.show()

# Clustering analysis on the geographical data

### By Location - Latitude and Longitude

In [None]:
# Rebuild the de-duplicated table used for clustering: one row per
# (species, latitude, longitude) combination, first occurrence kept.
cluster_dedupe_keys = ["species", "latitude", "longitude"]
unique_species_data = final_dataset_cleaned.drop_duplicates(subset=cluster_dedupe_keys, keep="first")
print(unique_species_data.shape)

In [None]:
from sklearn.cluster import KMeans

# Feature matrix: the coordinate pair of every species-location row.
X = unique_species_data.loc[:, ['latitude', 'longitude']]

# Elbow method: fit one KMeans model for each k in 1..14 and record its inertia.
k_range = range(1, 15)
inertia = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=0, n_init=10)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)

# Inertia against k; the bend in the curve suggests a sensible cluster count.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_range, inertia, 'bo-')
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method For Optimal k')
plt.show()

In [None]:
# KMeans clustering on the coordinate matrix X with the chosen cluster count.
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, random_state=0, n_init=10).fit(X)

# Attach the labels via assign(), which returns a new frame. Writing a column
# straight into unique_species_data (itself derived from another frame by
# drop_duplicates) can raise SettingWithCopyWarning and makes the cell depend
# on in-place mutation. X was taken from the same frame, so the labels line up
# with the rows positionally.
unique_species_data = unique_species_data.assign(Cluster=kmeans.labels_)

# Visualizing the clusters: longitude on x, latitude on y, colour = cluster id.
fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(
    unique_species_data['longitude'],
    unique_species_data['latitude'],
    c=unique_species_data['Cluster'],
    cmap='viridis',
    marker='o',
)
fig.colorbar(points, ax=ax, label='Cluster')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Species Clustering Based on Geographical Data')
plt.show()


In [None]:
# Creating a base map
m = folium.Map(location=[0, 0], zoom_start=2)

# One colour per cluster id. The lookup below wraps around, so raising the
# number of clusters above len(colors) reuses colours instead of raising
# IndexError.
colors = ['red', 'blue', 'green', 'purple']

# Iterate plain tuples of just the needed columns; iterrows() would build a
# full Series for every row.
cluster_rows = unique_species_data[['latitude', 'longitude', 'species', 'Cluster']]
for lat, lon, species, cluster in cluster_rows.itertuples(index=False, name=None):
    cluster = int(cluster)
    marker_color = colors[cluster % len(colors)]
    folium.CircleMarker(
        location=[lat, lon],
        radius=5,
        # Popups are rendered as HTML, where "\n" does not break the line.
        popup=f"Species: {species}<br>Cluster: {cluster}",
        color=marker_color,
        fill=True,
        fill_color=marker_color
    ).add_to(m)

# Saving the map to an HTML file
m.save('Species_clusters_map.html')


### By Region

In [None]:
# Two-column table: each region and its number of distinct 'species' values.
species_count_per_region = (
    final_dataset_cleaned
    .groupby("region")["species"]
    .nunique()
    .reset_index(name="speciescount")
)


In [None]:
# Add the per-region species count to every record of that region.
final_df = pd.merge(
    final_dataset_cleaned,
    species_count_per_region,
    how="left",
    on="region",
)


In [None]:
# Missing values per column after adding 'speciescount'.
final_df.isna().sum()

In [None]:
from sklearn.cluster import KMeans

# Cluster on position plus the per-region species count.
# n_init is pinned to 10, matching the other KMeans calls in this notebook:
# the library default differs between scikit-learn releases, so leaving it out
# makes the result depend on the installed version.
# NOTE(review): 'speciescount' is on a much larger numeric scale than the
# coordinates and will dominate the Euclidean distance - consider
# standardising the features before clustering.
cluster_features = final_df[['latitude', 'longitude', 'speciescount']]
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
final_df['KMeans_Cluster'] = kmeans.fit_predict(cluster_features)


In [None]:
# Scatter of all records by position, coloured by their KMeans cluster.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=final_df,
    x="longitude",
    y="latitude",
    hue="KMeans_Cluster",
    palette="viridis",
    ax=ax,
)
ax.set_title("K-Means Clustering of Species Distribution")
plt.show()


# Kernel Density Estimation

In [None]:
# One row per (species, latitude, longitude) combination.
unique_species_locations = final_dataset_cleaned.drop_duplicates(
    subset=["species", "latitude", "longitude"]
)

# Coordinate columns fed to the density estimate.
longitudes = unique_species_locations["longitude"]
latitudes = unique_species_locations["latitude"]

# Filled 2-D kernel density estimate; bw_adjust=0.5 halves the default bandwidth.
fig, ax = plt.subplots(figsize=(10, 6))
kde = sns.kdeplot(
    x=longitudes,
    y=latitudes,
    cmap="viridis",
    fill=True,
    bw_adjust=0.5,
    ax=ax,
)
ax.set_title("Kernel Density Estimation of Unique Species Distribution Across Multiple Locations")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
plt.show()

# Spatial Autocorrelation

In [None]:
from esda.moran import Moran
from libpysal.weights import Queen
from shapely.geometry import Point

RANDOM_SEED = 42

# Uses the 'latitude', 'longitude' and 'species' columns of final_dataset_cleaned.

# Dropping duplicates to ensure unique species per geographical point
unique_species_per_location = final_dataset_cleaned.drop_duplicates(subset=['species', 'longitude', 'latitude'])

# Sampling the data to reduce size (10% of the unique species per location data)
sample_size = int(len(unique_species_per_location) * 0.1)
sampled_species = unique_species_per_location.sample(n=sample_size, random_state=RANDOM_SEED)

# Creating a GeoDataFrame. The CRS is declared up front (the coordinates are
# longitude/latitude) so that later cells calling gdf.crs.to_string() do not
# fail on a missing CRS.
gdf = gpd.GeoDataFrame(
    sampled_species,
    geometry=gpd.points_from_xy(sampled_species.longitude, sampled_species.latitude),
    crs="EPSG:4326",
)

# Calculating spatial weights (using Queen contiguity as an example)
# NOTE(review): Queen contiguity is defined for polygons, while gdf holds
# points - confirm this weights scheme is intended; distance- or
# nearest-neighbour-based weights are the usual choice for point data.
weights = Queen.from_dataframe(gdf)

# Number of sampled rows that carry the same 'species' value.
gdf['species_count'] = gdf.groupby('species')['species'].transform('count')

# Computing Moran's I. The global NumPy RNG is seeded first so that the
# simulated reference distribution, and therefore p_sim, is reproducible.
np.random.seed(RANDOM_SEED)
y = gdf['species_count'].values
moran = Moran(y, weights)

# Printing Moran's I and p-value
print(f"Moran's I: {moran.I}")
print(f"P-value: {moran.p_sim}")

# Plotting the simulated values with the observed statistic as a red line
fig, ax = plt.subplots(1, 1, figsize=(8, 6))
ax.hist(moran.sim, bins=30)
ax.axvline(moran.I, color='r')
ax.set_title("Moran's I simulation distribution")
plt.show()

In [None]:
# Wider, finer-binned histogram of the simulated Moran's I values; the dashed
# red line marks the observed statistic.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 8))
ax.hist(moran.sim, bins=50)
ax.axvline(moran.I, color="r", linestyle="dashed", linewidth=2)
ax.set_title("Moran's I Simulation Distribution")
ax.set_xlabel("Simulated Moran's I")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
import contextily as ctx  # For basemap

# Plotting the GeoDataFrame, coloured by the 'species_count' column
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
gdf.plot(column='species_count', cmap='OrRd', legend=True, ax=ax)
ax.set_title('Species Count per Region')

# Adding basemap.
# gdf may have been built without a CRS, in which case gdf.crs is None and
# .to_string() would raise; its points are longitude/latitude, so fall back to
# EPSG:4326. The tile source is CartoDB Positron, the same provider the final
# clustering cell uses, because the Stamen entries are no longer available in
# current provider catalogues.
basemap_crs = gdf.crs.to_string() if gdf.crs is not None else "EPSG:4326"
ctx.add_basemap(ax, crs=basemap_crs, source=ctx.providers.CartoDB.Positron)
plt.show()

In [None]:
from sklearn.cluster import KMeans

# Dropping duplicates based on species, latitude, and longitude
unique_species_location = final_dataset_cleaned.drop_duplicates(subset=['species', 'latitude', 'longitude'])

# Creating a GeoDataFrame with unique species-location pairs; the CRS
# (WGS84, EPSG:4326) is set at construction instead of mutating in place.
gdf = gpd.GeoDataFrame(
    unique_species_location,
    geometry=gpd.points_from_xy(unique_species_location.longitude, unique_species_location.latitude),
    crs="EPSG:4326",
)

# Extracting coordinates for clustering as an (n, 2) array of [x, y]
# without going through a Python-level zip.
coords = np.column_stack([gdf.geometry.x, gdf.geometry.y])

# Performing KMeans clustering. n_init is pinned to 10 like the earlier
# KMeans calls so the result does not depend on the scikit-learn version.
kmeans = KMeans(n_clusters=4, random_state=0, n_init=10)  # Adjust the number of clusters (n_clusters) as needed
gdf['cluster_labels'] = kmeans.fit_predict(coords)

# Plotting the GeoDataFrame with cluster labels. Cluster ids are nominal,
# so they are drawn as categories rather than on a continuous colour scale.
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
gdf.plot(column='cluster_labels', cmap='viridis', categorical=True, legend=True, ax=ax)
ax.set_title('Clusters of Species Distribution')


# Adding basemap. Fetching tiles needs network access, so a failure is
# reported and the plot is still shown without a background.
try:
    ctx.add_basemap(ax, crs=gdf.crs.to_string(), source=ctx.providers.CartoDB.Positron)
except Exception as e:
    print(f"Error adding basemap: {e}")

plt.show()