Import the necessary libraries

In [1]:
import folium
import pandas as pd
from ipywidgets import interact, widgets
from IPython.display import display, clear_output
from sklearn.cluster import DBSCAN

### This code defines several functions for preprocessing and visualizing data on a map.

The "preprocess_data" function reads a CSV file located at the given file path, extracts the
longitude and latitude from the 'Lat Long' column, and drops the 'Lat Long' column from
the dataframe. It returns the preprocessed dataframe.

The "add_points_to_initial_map" function adds points to the initial map based on the given dataframe and the number of points. It iterates over the first "num_points" rows of
the dataframe, creates a marker for each row using the latitude and longitude values, and adds it to the initial map.

The "cluster_and_add_to_new_map" function generates a new map with clustered markers based on the given dataframe. It uses the DBSCAN clustering algorithm to cluster the
latitude and longitude coordinates. It then iterates over the first "num_points" rows of the dataframe, assigns a cluster color to each row based on its cluster label, and adds a marker to the new map.

The "update_map_and_cluster" function is an interactive function that updates the map
and performs clustering based on the given parameters. It calls the "add_points_to_initial_map" and "cluster_and_add_to_new_map" functions with the specified parameters.

Overall, this code snippet preprocesses data from a CSV file, creates an initial map with
markers, and generates a new map with clustered markers based on the data.

In [2]:
def preprocess_data(file_path):
    """
    Preprocesses data by reading a CSV file located at the given file path,
    extracting the longitude and latitude from the 'Lat Long' column,
    and dropping the 'Lat Long' column from the dataframe.

    The 'Lat Long' values are expected to look like ``POINT(<longitude> <latitude>)``.
    Optional whitespace after ``POINT`` and just inside the parentheses is
    tolerated, so the common WKT spelling ``POINT (<longitude> <latitude>)``
    is parsed as well. Rows whose text does not match the pattern get NaN in
    both new columns.

    Parameters:
    - file_path (str): The path to the CSV file.

    Returns:
    - df (pandas.DataFrame): The preprocessed dataframe, with float
      'Longitude' and 'Latitude' columns added and 'Lat Long' removed.
    """
    df = pd.read_csv(file_path)

    # First capture group is the longitude, second the latitude. The groups
    # exclude whitespace and parentheses so stray padding never ends up in
    # the captured number.
    point_pattern = r'POINT\s*\(\s*([^\s()]+)\s+([^\s()]+)\s*\)'
    df[['Longitude', 'Latitude']] = df['Lat Long'].str.extract(point_pattern).astype(float)

    df = df.drop('Lat Long', axis=1)
    return df

# Read the order data and turn the 'Lat Long' text into numeric coordinates
df = preprocess_data("DC-DATA_2.csv")

# Base map centred on the mean latitude/longitude of all rows
initial_map = folium.Map(location=df[['Latitude', 'Longitude']].mean().tolist(), zoom_start=10)

# Function to add points to the initial map
def add_points_to_initial_map(df, num_points):
    """
    Display the first ``num_points`` rows of the dataframe as markers on a map.

    A fresh map is built on every call. Adding the markers to one shared,
    long-lived map made them pile up across calls: each change of the
    interactive slider re-added the same markers, and lowering ``num_points``
    never removed any.

    Parameters:
        df (pandas.DataFrame): The dataframe containing the 'Latitude',
            'Longitude' and 'order_number' columns.
        num_points (int): The number of leading rows to show on the map.

    Returns:
        None
    """
    points_map = folium.Map(location=[df['Latitude'].mean(), df['Longitude'].mean()], zoom_start=10)

    # Iterate column-wise rather than with iterrows(): iterrows() packs each
    # row into a single-dtype Series, which can turn an integer order number
    # into a float and render it as "123.0" in the popup.
    shown = df.head(num_points)
    for lat, lon, order_number in zip(shown['Latitude'], shown['Longitude'], shown['order_number']):
        folium.Marker([lat, lon],
                      popup=f"Order Number: {order_number}").add_to(points_map)
    display(points_map)

# Function to cluster points using DBSCAN and add to a new map
def cluster_and_add_to_new_map(df, num_points, eps, min_samples):
    """
    Generates a new folium map with clustered markers based on the given data frame.

    All rows of ``df`` are clustered with DBSCAN on their raw
    latitude/longitude values; only the first ``num_points`` rows are drawn.
    Each marker is coloured by its cluster label. Points labelled -1 (DBSCAN
    noise) get a dedicated colour instead of sharing one with a real cluster.

    Side effect: the cluster labels are written to ``df['cluster']``.

    Parameters:
    - df (pandas.DataFrame): The data frame containing the 'Latitude', 'Longitude' and 'order_number' columns.
    - num_points (int): The number of points to display on the map.
    - eps (float): The maximum distance between two samples for them to be considered as in the same neighborhood.
    - min_samples (int): The number of samples in a neighborhood for a point to be considered as a core point.

    Returns:
    - None
    """
    new_map = folium.Map(location=[df['Latitude'].mean(), df['Longitude'].mean()], zoom_start=10)

    coords = df[['Latitude', 'Longitude']].values
    clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(coords)
    df['cluster'] = clustering.labels_

    colors = ['red', 'blue', 'green', 'purple', 'orange', 'darkred', 'lightred', 'beige', 'darkblue', 'darkgreen']
    # -1 % len(colors) would wrap to the last palette entry and make noise
    # points indistinguishable from cluster 9, so noise gets its own colour.
    noise_color = 'lightgray'

    # Column-wise iteration keeps each value's own dtype (iterrows() can
    # upcast integer labels/order numbers to float) and does not depend on
    # the index being unique, unlike a df.at[index, ...] lookup.
    shown = df.head(num_points)
    for lat, lon, order_number, label in zip(shown['Latitude'], shown['Longitude'],
                                             shown['order_number'], shown['cluster']):
        cluster_color = noise_color if label == -1 else colors[label % len(colors)]
        folium.Marker([lat, lon],
                      popup=f"Order Number: {order_number}, Cluster: {label}",
                      icon=folium.Icon(color=cluster_color)).add_to(new_map)
    display(new_map)

# Create the interactive function to update the map and cluster points
@interact(
    num_points=widgets.IntSlider(value=100, min=1, max=len(df), step=10),
    eps=widgets.FloatSlider(value=0.01, min=0.001, max=0.1, step=0.001),
    min_samples=widgets.IntSlider(value=5, min=1, max=20, step=1),
)
def update_map_and_cluster(num_points, eps, min_samples):
    """
    Redraw both maps for the current slider values.

    First shows the plain marker map, then the DBSCAN-coloured map, both
    built from the module-level dataframe.

    Parameters:
        num_points (int): How many rows to draw (slider from 1 to len(df)).
        eps (float): DBSCAN neighbourhood radius (slider from 0.001 to 0.1).
        min_samples (int): DBSCAN core-point threshold (slider from 1 to 20).

    Returns:
        None
    """
    add_points_to_initial_map(df, num_points)
    cluster_and_add_to_new_map(df, num_points, eps, min_samples)

interactive(children=(IntSlider(value=100, description='num_points', max=23339, min=1, step=10), FloatSlider(v…

### Clustering points with the DBSCAN algorithm and displaying the clusters on an interactive Folium map

The "cluster_points" function takes four parameters: "df" (a pandas DataFrame containing latitude and longitude coordinates), "num_points" (the number of points to display for each cluster on the map), "eps" (the maximum distance between two samples for them to be considered as in the same neighborhood), and "min_samples" (the number of samples in a neighborhood for a point to be considered as a core point).

The function uses the DBSCAN algorithm from the sklearn.cluster module to cluster the points and assigns the cluster labels to the DataFrame. It then uses the Folium library to create a map and display the clusters using markers with different colors.

The function also provides *interactive* functionality to display specific clusters or all clusters on the map, with a dynamic legend showing the cluster colors. 

Finally, it includes an example usage of the "cluster_points" function with some parameter values.

**Note:** a cluster label of -1 marks outliers (noise points) that DBSCAN did not assign to any cluster.

In [3]:
# Function to cluster points using DBSCAN and add to a new map
def cluster_points(df, num_points, eps, min_samples):
    """
    Clusters the given data points using the DBSCAN algorithm and displays the clusters on a Folium map.

    A dropdown lets the user show every cluster at once or a single cluster;
    the map and a small legend are redrawn on each selection. Points labelled
    -1 (DBSCAN noise) are drawn in a dedicated colour.

    Side effect: the cluster labels are written to ``df['cluster']``. The
    interactive callbacks work on a private snapshot taken right after
    clustering, so later changes to ``df`` (including its 'cluster' column)
    cannot desynchronise the dropdown from the data being drawn.

    Parameters:
    - df: pandas DataFrame, the input data containing the 'Latitude', 'Longitude' and 'order_number' columns.
    - num_points: int, the number of points to display for each cluster on the map.
    - eps: float, the maximum distance between two samples for them to be considered as in the same neighborhood.
    - min_samples: int, the number of samples in a neighborhood for a point to be considered as a core point.

    Returns:
    - None
    """
    coords = df[['Latitude', 'Longitude']].values
    clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(coords)
    # Kept so callers can still read the labels off the dataframe they passed in.
    df['cluster'] = clustering.labels_

    # Snapshot used by every callback below.
    clustered = df.copy()
    map_center = [clustered['Latitude'].mean(), clustered['Longitude'].mean()]

    # Distinct marker colours that folium.Icon accepts. The first 18 entries
    # keep the order of the previous palette; names folium does not support
    # and repeated entries were dropped.
    cluster_palette = [
        'red', 'blue', 'green', 'purple', 'orange', 'darkred', 'lightred', 'beige', 'darkblue', 'darkgreen',
        'cadetblue', 'darkpurple', 'pink', 'lightgreen', 'gray', 'black', 'lightblue', 'white',
    ]
    # Reserved for label -1 so noise never shares a colour with a real cluster.
    noise_color = 'lightgray'

    def color_for(label):
        """Return the marker colour for a cluster label (-1 is noise)."""
        if label == -1:
            return noise_color
        return cluster_palette[label % len(cluster_palette)]

    # Function to display the map for a specific cluster number or all clusters
    def display_cluster(cluster_number):
        """
        Build the Folium map for one cluster, or for all clusters.

        Args:
            cluster_number: A cluster label, or the string 'Show All'.

        Returns:
            folium.Map: The map with up to ``num_points`` markers per cluster and a legend.
        """
        show_all = isinstance(cluster_number, str) and cluster_number == 'Show All'
        if show_all:
            selected = clustered
        else:
            selected = clustered[clustered['cluster'] == cluster_number]

        filtered_clusters = folium.Map(location=map_center, zoom_start=10)

        for label, group in selected.groupby('cluster'):
            marker_color = color_for(label)
            # Column-wise iteration keeps each value's own dtype; iterrows()
            # can upcast integers to float and print them as "3.0".
            shown = group.head(num_points)
            for lat, lon, order_number in zip(shown['Latitude'], shown['Longitude'], shown['order_number']):
                folium.Marker([lat, lon],
                              popup=f"Order Number: {order_number}, Cluster: {label}",
                              icon=folium.Icon(color=marker_color)).add_to(filtered_clusters)

        # Create legend dynamically
        legend_html = '''
                     <div style="position: fixed;
                                 top: 10px; right: 10px; width: 120px; height: 120px;
                                 border:2px solid grey; z-index:9999; font-size:12px;
                                 ">&nbsp; Cluster Legend <br>'''
        if show_all:
            legend_html += '&nbsp; <i class="fa fa-map-marker fa-1x" style="color:gray"></i> Show All<br>'
        else:
            # Derived from the selection itself rather than from a variable
            # left over from the marker loop, which would be undefined if the
            # selection matched no rows.
            cluster_color = color_for(cluster_number)
            legend_html += f'&nbsp; <i class="fa fa-map-marker fa-1x" style="color:{cluster_color}"></i> Cluster {cluster_number}<br>'
        legend_html += '</div>'
        filtered_clusters.get_root().html.add_child(folium.Element(legend_html))

        return filtered_clusters

    # Function to update the map based on dropdown value
    def update_map(change):
        """
        Updates the map based on the change in the current cluster.

        Parameters:
            change (object): The change object containing the new cluster.

        Returns:
            None
        """
        display_map(change.new)

    # Function to display the map
    def display_map(cluster_number):
        """
        Display the selector and the map for a given cluster number.

        Args:
            cluster_number: A cluster label, or the string 'Show All'.

        Returns:
            None
        """
        # Replace the previous output, then show the dropdown again above the
        # new map so it stays available.
        clear_output(wait=True)
        display(widgets.HBox([widgets.Label('Select Cluster:', layout=widgets.Layout(width='100px')), cluster_selector]))
        display(display_cluster(cluster_number))

    unique_clusters = clustered['cluster'].unique()
    cluster_selector = widgets.Dropdown(
        options=['Show All'] + sorted(unique_clusters),
        disabled=False,
    )
    cluster_selector.observe(update_map, names='value')

    display_map('Show All')




# Usage of the clustering function: runs DBSCAN on the preprocessed dataframe
# with eps=0.01 and min_samples=5, and shows at most 100 markers per cluster.
cluster_points(df, num_points=100, eps=0.01, min_samples=5)  # Adjust parameters as needed


HBox(children=(Label(value='Select Cluster:', layout=Layout(width='100px')), Dropdown(options=('Show All', -1,…