In [1]:
import pandas as pd
from geopy.distance import geodesic

# Load the dataset
df = pd.read_csv(r'/u01/jupyter-scripts/TonyS/VCIS/device_history_imsi_30000rows.csv', index_col=0)

print(df.columns)
df

Index(['service_provider_id', 'imsi_id', 'usage_timeframe', 'cgi_id',
       'data_source_id', 'data_type', 'imei_id', 'location_azimuth',
       'location_latitude', 'location_longitude', 'phone_number', 'type_id'],
      dtype='object')


Unnamed: 0,service_provider_id,imsi_id,usage_timeframe,cgi_id,data_source_id,data_type,imei_id,location_azimuth,location_latitude,location_longitude,phone_number,type_id
0,10,151967466595180,1704797013785,10521857,,,509593926863172,60,33.8901,35.5566,96103311097,
1,10,151967466595180,1704797058783,10521857,,,509593926863172,60,33.8901,35.5566,96103311097,
2,10,151967466595180,1704797130988,10521857,,,509593926863172,60,33.8901,35.5566,96103311097,
3,10,151967466595180,1704797204502,10521857,,,509593926863172,60,33.8901,35.5566,96103311097,
4,10,151967466595180,1704797204520,10521857,,,509593926863172,60,33.8901,35.5566,96103311097,
...,...,...,...,...,...,...,...,...,...,...,...,...
29995,10,151967466595180,1705563474833,10556674,,,509593926863172,170,33.8737,35.5149,96103311097,
29996,10,151967466595180,1705563478949,10570753,,,509593926863172,110,33.8693,35.5124,96103311097,
29997,10,151967466595180,1705563517836,10570753,,,509593926863172,110,33.8693,35.5124,96103311097,
29998,10,151967466595180,1705563517854,10570753,,,509593926863172,110,33.8693,35.5124,96103311097,


In [None]:
# Method 1: Complex 
# import pandas as pd
import numpy as np
from geopy.distance import geodesic
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, LabelEncoder
# from statsmodels.tsa.arima.model import ARIMA
import folium


# Function to calculate haversine distance between two points
def haversine_distance(lat1, lon1, lat2, lon2):
    point1 = (lat1, lon1)
    point2 = (lat2, lon2)
    return geodesic(point1, point2).kilometers


# Calculate distance between successive points (prior and subsequent)
df['Distance_Prior'] = df.apply(
    lambda row: haversine_distance(
        row['location_latitude'], row['location_longitude'],
        df.loc[row.name - 1, 'location_latitude'], df.loc[row.name - 1, 'location_longitude']
    ) if row.name > 0 else None,  # Handle first row (no prior row)
    axis=1
)

df['Distance_Subsequent'] = df.apply(
    lambda row: haversine_distance(
        row['location_latitude'], row['location_longitude'],
        df.loc[row.name + 1, 'location_latitude'], df.loc[row.name + 1, 'location_longitude']
    ) if row.name < len(df) - 1 else None,  # Handle last row (no next row)
    axis=1
)


df['START_DATE'] = pd.to_datetime(df['START_DATE'])

# Calculate time difference in hours (prior and subsequent)
df['TimeDiff_Prior'] = df['START_DATE'].apply(lambda x: int(x.timestamp())).diff() / 3600  # convert to hours
df['TimeDiff_Subsequent'] = df['START_DATE'].apply(lambda x: int(x.timestamp())).diff().shift(-1) / 3600  # convert to hours

# Calculate speed based on prior and subsequent distances and time differences
df['Speed_Prior'] = df['Distance_Prior'] / df['TimeDiff_Prior']
df['Speed_Subsequent'] = df['Distance_Subsequent'] / df['TimeDiff_Subsequent']

df.replace([np.inf, -np.inf], np.nan, inplace=True)
# Feature selection
features = df[['CELL_ID_START','location_latitude', 'location_longitude', 'Distance_Prior', 'Speed_Prior', 'Distance_Subsequent', 'Speed_Subsequent']].fillna(0)

# Encode categorical string features
label_encoder = LabelEncoder()
features["CELL_ID_START"] = label_encoder.fit_transform(features['CELL_ID_START'])


# Step 1: Distance-based anomaly detection
distance_threshold = 10  # in kilometers
df['DistanceAnomaly'] = (df['Distance_Prior'] > distance_threshold) | (df['Distance_Subsequent'] > distance_threshold)

# Step 2: Speed-based anomaly detection
speed_threshold = 120  # in km/h
df['SpeedAnomaly'] = (df['Speed_Prior'] > speed_threshold) | (df['Speed_Subsequent'] > speed_threshold)


# Step 3: Clustering-based anomaly detection (DBSCAN)
clustering_data = StandardScaler().fit_transform(features)
db = DBSCAN(eps=0.1, min_samples=5).fit(clustering_data)
df['ClusterAnomaly'] = db.labels_ == -1

# Step 4: Statistical anomaly detection (z-score)
# Separate zero and non-zero values for Distance and Speed
non_zero_distance_prior = df[df['Distance_Prior'] != 0]['Distance_Prior']
non_zero_distance_subsequent = df[df['Distance_Subsequent'] != 0]['Distance_Subsequent']
non_zero_speed_prior = df[df['Speed_Prior'] != 0]['Speed_Prior']
non_zero_speed_subsequent = df[df['Speed_Subsequent'] != 0]['Speed_Subsequent']

df['Distance_zscore_Prior'] = (df['Distance_Prior'] - non_zero_distance_prior.mean()) / non_zero_distance_prior.std()
df['Distance_zscore_Subsequent'] = (df['Distance_Subsequent'] - non_zero_distance_subsequent.mean()) / non_zero_distance_subsequent.std()
df['Speed_zscore_Prior'] = (df['Speed_Prior'] - non_zero_speed_prior.mean()) / non_zero_speed_prior.std()
df['Speed_zscore_Subsequent'] = (df['Speed_Subsequent'] - non_zero_speed_subsequent.mean()) / non_zero_speed_subsequent.std()

zscore_threshold = 3
df['DistanceStatAnomaly'] = ((df['Distance_zscore_Prior'].abs() > zscore_threshold) | (df['Distance_zscore_Subsequent'].abs() > zscore_threshold))
df['SpeedStatAnomaly'] = ((df['Speed_zscore_Prior'].abs() > zscore_threshold) | (df['Speed_zscore_Subsequent'].abs() > zscore_threshold))


# Step 5: Machine learning-based anomaly detection (Isolation Forest)
isolation_forest = IsolationForest(contamination='auto', random_state=42)
isolation_forest.fit(features)
df['IsolationForestAnomaly'] = isolation_forest.predict(features) == -1

# 6: Adding Cell ID changes to the anomaly score
df['Previous_CellID'] = df['CELL_ID_START'].shift(1)
df['Next_CellID'] = df['CELL_ID_START'].shift(-1)

df['CellChangePrior'] = (df['CELL_ID_START'] != df['Previous_CellID']).astype(int)
df['CellChangeSubsequent'] = (df['CELL_ID_START'] != df['Next_CellID']).astype(int)


# # Step 7: Time Series Analysis
# # Prepare time series data (assuming 'Timestamp' is the index)
# time_analysis = df.set_index('usage_timeframe', inplace=True)
# # Fit the ARIMA model
# model = ARIMA(time_analysis['Speed'], order=(5, 1, 0))
# model_fit = model.fit(disp=0)
# # Calculate residuals
# residuals = model_fit.resid
# # Define an anomaly as a point where the residual is outside 3 standard deviations
# df['TimeSeriesAnomaly'] = np.abs(residuals) > 3 * np.std(residuals)

# Step 8: Combine all anomaly flags
df['AnomalyScore'] = (
    df['DistanceAnomaly'].astype(int) +
    df['SpeedAnomaly'].astype(int) +
    df['ClusterAnomaly'].astype(int) +
    df['DistanceStatAnomaly'].astype(int) +
    df['SpeedStatAnomaly'].astype(int) +
    df['IsolationForestAnomaly'].astype(int) + (df['CellChangePrior'] * 0.5) +(df['CellChangeSubsequent'] * 0.5)
    # + df['TimeSeriesAnomaly'].astype(int)
)

# Define an anomaly as a point where the anomaly score exceeds a threshold
anomaly_threshold = 2  # Adjust based on desired sensitivity
df['Overshoot'] = df['AnomalyScore'] >= anomaly_threshold


# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)  # Adjust the number of rows as needed
pd.set_option('display.max_columns', None)  # Adjust the number of rows as needed
# Display the DataFrame with the final anomaly column
print(df["Overshoot"].value_counts())
# Display the DataFrame with overshoot flags
# df.tail(100)


In [9]:
# # # # # import pandas as pd
# # # # # import plotly.express as px
# # # # # import plotly.graph_objs as go




# # # # # def plot_trace(df, hover_cols):
# # # # #     # #df['START_DATE'] = pd.to_datetime(df['usage_timeframe '], unit='ms').astype("str")
# # # # #     #  Sort by START_DATE (ensure data is sorted for proper animation)
# # # # #     df["START_DATE"] = df["START_DATE"].astype("str")
# # # # #     df.sort_values('START_DATE', inplace=True)


# # # # #     # # Create Plotly Express animated scatter mapbox
# # # # #     # fig_px = px.scatter_mapbox(df[:100], lat="location_latitude ", lon="location_longitude ", color="Overshoot", animation_frame="START_DATE",
# # # # #     #                            color_discrete_map={False: 'green', True: 'red'},
# # # # #     #                            zoom=12, center={"lat": df['location_latitude '].mean(), "lon": df['location_longitude '].mean()},
# # # # #     #                            title="Points Over Time")
# # # # #     # fig_px.update_layout(mapbox_style="carto-positron")
# # # # #     # fig_px.update_layout(margin={"r": 0, "t": 30, "l": 0, "b": 0})

# # # # #     # # Extract data and frames from Plotly Express figure
# # # # #     # data = fig_px.data
# # # # #     # frames = fig_px.frames

# # # # #     # # Create the new figure with extracted data and frames
# # # # #     # fig = go.Figure(data=data, frames=frames)

# # # # #     # # Set up the layout
# # # # #     # fig.update_layout(
# # # # #     #     mapbox=dict(
# # # # #     #         style="carto-positron",
# # # # #     #         zoom=12,
# # # # #     #         center={"lat": df['location_latitude '].mean(), "lon": df['location_longitude '].mean()}
# # # # #     #     ),
# # # # #     #     margin={"r": 0, "t": 30, "l": 0, "b": 0},
# # # # #     #     title="Points Over Time",
# # # # #     # )

# # # # #     # # Create slider steps for animation frames
# # # # #     # steps = []
# # # # #     # for i, frame in enumerate(frames):
# # # # #     #     step = dict(
# # # # #     #         method='animate',
# # # # #     #         args=[[frame.name], 
# # # # #     #               {"frame": {"duration": 500, "redraw": True}, 
# # # # #     #                "mode": "immediate",
# # # # #     #                "transition": {"duration": 300}}],
# # # # #     #         label=frame.name
# # # # #     #     )
# # # # #     #     steps.append(step)

# # # # #     # # Add the frame slider
# # # # #     # frame_slider = dict(
# # # # #     #     active=0,
# # # # #     #     pad={"t": 50},
# # # # #     #     steps=steps
# # # # #     # )

# # # # #     # # Create speed control slider
# # # # #     # speed_slider = dict(
# # # # #     #     x=0.1,
# # # # #     #     y=-0.1,
# # # # #     #     currentvalue={"prefix": "Speed: "},
# # # # #     #     pad={"b": 10},
# # # # #     #     steps=[
# # # # #     #         {"label": "Slow", "method": "animate", "args": [None, {"frame": {"duration": 2000, "redraw": True}, "mode": "immediate", "transition": {"duration": 500}}]},
# # # # #     #         {"label": "Medium", "method": "animate", "args": [None, {"frame": {"duration": 1000, "redraw": True}, "mode": "immediate", "transition": {"duration": 300}}]},
# # # # #     #         {"label": "Fast", "method": "animate", "args": [None, {"frame": {"duration": 500, "redraw": True}, "mode": "immediate", "transition": {"duration": 100}}]},
# # # # #     #     ]
# # # # #     # )

# # # # #     # # Update layout with sliders
# # # # #     # fig.update_layout(
# # # # #     #     sliders=[frame_slider, speed_slider]
# # # # #     # )

# # # # #     # # Display the interactive plot
# # # # #     # fig.show()
# # # # #     # Specify the columns to display in the hover data

# # # # #     # Plotly express animated scatter mapbox
# # # # #     #################################################
# # # # #     fig = px.scatter_mapbox(df, lat="location_latitude", lon="location_longitude", color= df.apply(assign_color, axis=1) , animation_frame="START_DATE",
# # # # #                             # color_discrete_map={False: 'green', True: 'red'},
# # # # #                             zoom=12, center={"lat": df['location_latitude'].mean(), "lon": df['location_longitude'].mean()},
# # # # #                             title="Points Over Time"
# # # # #                             ,hover_data= hover_cols)
# # # # #     fig.update_layout(mapbox_style="carto-positron")
# # # # #     fig.update_layout(margin={"r":0,"t":30,"l":0,"b":0})
# # # # #     fig.show()
# # # # # #################################################

# # # # #     # Save the figure as HTML file with CDN for Plotly.js
# # # # #     fig.write_html("animated_map.html")
    
    
# # # # # def assign_color(row):
# # # # #     if row['potential_overshoot'] and row['potential_session_end'] and row['potential_newly_active_location']:
# # # # #         return 'red'
# # # # #     elif row['potential_overshoot'] and row['potential_session_end'] and not row['potential_newly_active_location']:
# # # # #         return 'orange'
# # # # #     elif row['potential_overshoot'] and not row['potential_session_end'] and row['potential_newly_active_location']:
# # # # #         return 'purple'
# # # # #     elif row['potential_overshoot'] and not row['potential_session_end'] and not row['potential_newly_active_location']:
# # # # #         return 'brown'
# # # # #     elif not row['potential_overshoot'] and row['potential_session_end'] and row['potential_newly_active_location']:
# # # # #         return 'yellow'
# # # # #     elif not row['potential_overshoot'] and row['potential_session_end'] and not row['potential_newly_active_location']:
# # # # #         return 'blue'
# # # # #     elif not row['potential_overshoot'] and not row['potential_session_end'] and row['potential_newly_active_location']:
# # # # #         return 'pink'
# # # # #     else:
# # # # #         return 'green'


# import pandas as pd
# import plotly.graph_objs as go

# def plot_trace(df, hover_cols):
#     df["START_DATE"] = pd.to_datetime(df["START_DATE"], unit='ms').astype("str")
#     df.sort_values('START_DATE', inplace=True)
    
#     # Initialize the figure
#     fig = go.Figure()

#     # Create a list to hold the frames
#     frames = []

#     # Iterate through the DataFrame to create frames
#     for i, row in df.iterrows():
#         lat = row['location_latitude']
#         lon = row['location_longitude']
        
#         # Initialize traces for this frame
#         traces = []
        
#         # Add potential overshoot trace if condition is met
#         if row['potential_overshoot']:
#             trace1 = go.Scattermapbox(
#                 lat=[lat],
#                 lon=[lon],
#                 mode='markers',
#                 marker=go.scattermapbox.Marker(
#                     size=25,
#                     color='red',
#                     opacity=0.6
#                 ),
#                 text=f"potential_overshoot: {row['potential_overshoot']}",
#                 name='potential_overshoot'
#             )
#             traces.append(trace1)
#         else:
#             # Add a dummy trace if condition not met to maintain consistent count
#             traces.append(go.Scattermapbox(
#                 lat=[lat],
#                 lon=[lon],
#                 mode='markers',
#                 marker=go.scattermapbox.Marker(
#                     size=0,
#                     color='rgba(0,0,0,0)'
#                 ),
#                 # text="dummy trace",
#                 # name='dummy_trace'
#             ))
        
#         # Add potential session end trace if condition is met
#         if row['potential_session_end']:
#             trace2 = go.Scattermapbox(
#                 lat=[lat],
#                 lon=[lon],
#                 mode='markers',
#                 marker=go.scattermapbox.Marker(
#                     size=15,
#                     color='green',
#                     opacity=0.6
#                 ),
#                 text=f"potential_session_end", #{row['potential_session_end']}",
#                 name='potential_session_end'
#             )
#             traces.append(trace2)
#         else:
#             # Add a dummy trace if condition not met to maintain consistent count
#             traces.append(go.Scattermapbox(
#                 lat=[lat],
#                 lon=[lon],
#                 mode='markers',
#                 marker=go.scattermapbox.Marker(
#                     size=0,
#                     color='rgba(0,0,0,0)'
#                 ),
#                 # text="dummy trace",
#                 # name='dummy_trace'
#             ))

#         # Add potential newly active location trace if condition is met
#         if row['potential_newly_active_location']:
#             trace3 = go.Scattermapbox(
#                 lat=[lat],
#                 lon=[lon],
#                 mode='markers',
#                 marker=go.scattermapbox.Marker(
#                     size=10,
#                     color='blue',
#                     opacity=0.6
#                 ),
#                 text=f"potential_newly_active_location: {row['potential_newly_active_location']}",
#                 name='potential_newly_active_location'
#             )
#             traces.append(trace3)
#         else:
#             # Add a dummy trace if condition not met to maintain consistent count
#             traces.append(go.Scattermapbox(
#                 lat=[lat],
#                 lon=[lon],
#                 mode='markers',
#                 marker=go.scattermapbox.Marker(
#                     size=0,
#                     color='rgba(0,0,0,0)'
#                 ),
#                 # text="dummy trace",
#                 # name='dummy_trace'
#             ))
#         if not row['potential_overshoot'] and not row["potential_session_end"] and not row["potential_newly_active_location"]:
#             trace1 = go.Scattermapbox(
#                 lat=[lat],
#                 lon=[lon],
#                 mode='markers',
#                 marker=go.scattermapbox.Marker(
#                     size=20,
#                     color='black',
#                     opacity=0.6
#                 ),
#                 text=f"All False: ",
#                 name='No conditions met'
#             )
#             traces.append(trace1)
#         else:
#             # Add a dummy trace if condition not met to maintain consistent count
#             traces.append(go.Scattermapbox(
#                 lat=[lat],
#                 lon=[lon],
#                 mode='markers',
#                 marker=go.scattermapbox.Marker(
#                     size=0,
#                     color='rgba(0,0,0,0)'
#                 ),
#                 # text="dummy trace",
#                 # name='dummy_trace'
#             ))    
#         # Create a frame for this time step with all traces
#         frames.append(go.Frame(data=traces, name=row['START_DATE']))

#     # Add the first frame to the figure
#     if frames:
#         for trace in frames[0].data:
#             fig.add_trace(trace)

#     fig.update_layout(
#         mapbox_style="carto-positron",
#         mapbox_zoom=12,
#         mapbox_center={"lat": df['location_latitude'].mean(), "lon": df['location_longitude'].mean()},
#         title="Points Over Time"
#     )

#     # Add frames to the figure
#     fig.update(frames=frames)

#     # Create slider steps
#     slider_steps = []
#     for i, frame in enumerate(frames):
#         step = dict(
#             method="animate",
#             args=[[frame.name], dict(frame=dict(duration=0, redraw=True), mode='immediate')],
#             label=str(frame.name)  # Use START_DATE as label
#         )
#         slider_steps.append(step)

#     # Add slider
#     fig.update_layout(
#         sliders=[dict(
#             steps=slider_steps,
#             active=len(frames)-1,
#             currentvalue={"prefix": "Time: "},
#         )]
#     )

#     # Add animation controls
#     fig.update_layout(
#         updatemenus=[dict(
#             type="buttons",
#             showactive=False,
#             buttons=[
#                 dict(label="Play",
#                      method="animate",
#                      args=[None, dict(frame=dict(duration=500, redraw=True), fromcurrent=True, mode='immediate')]),
#                 dict(label="Pause",
#                      method="animate",
#                      args=[[None], dict(frame=dict(duration=0, redraw=False), mode='immediate')]),
#                 dict(label="Slow",
#                      method="animate",
#                      args=[None, dict(frame=dict(duration=2000, redraw=True), fromcurrent=True, mode='immediate')]),
#                 dict(label="Medium",
#                      method="animate",
#                      args=[None, dict(frame=dict(duration=1000, redraw=True), fromcurrent=True, mode='immediate')]),
#                 dict(label="Fast",
#                      method="animate",
#                      args=[None, dict(frame=dict(duration=500, redraw=True), fromcurrent=True, mode='immediate')])
#             ]
#         )]
#     )

#     # Display the interactive plot
#     fig.show()

#     # Save the figure as HTML file with CDN for Plotly.js
#     fig.write_html("animated_map.html")


import pandas as pd
import plotly.graph_objs as go

def plot_trace(df, hover_cols):
    df["START_DATE"] = pd.to_datetime(df["START_DATE"], unit='ms').astype("str")
    df.sort_values('START_DATE', inplace=True)
    
    # Initialize the figure
    fig = go.Figure()

    # Create a list to hold the frames
    frames = []

    # Iterate through the DataFrame to create frames
    for i, row in df.iterrows():
        lat = row['location_latitude']
        lon = row['location_longitude']
        # Generate hover text
        hover_text = f"Latitude: {lat}, Longitude: {lon}<br>"
        for col in hover_cols:
            hover_text += f"{col}: {row[col]}<br>"
            
            
        hoverinfo = 'text'
        # Initialize traces for this frame
        traces = []

        # Add potential overshoot trace if condition is met
        if row['potential_overshoot']:
            trace1 = go.Scattermapbox(
                lat=[lat],
                lon=[lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=25,
                    color='red',
                    opacity=1
                ),
                text=hover_text,
                name='Current: potential_overshoot'
            )
            traces.append(trace1)
        else:
            # Add a dummy trace if condition not met to maintain consistent count
            traces.append(go.Scattermapbox(
                lat=[lat],
                lon=[lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=0,
                    color='rgba(0,0,0,0)'
                ),
            ))    
        
        # Add potential session end trace if condition is met
        if row['potential_session_end']:
            trace2 = go.Scattermapbox(
                lat=[lat],
                lon=[lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=15,
                    color='green',
                    opacity=1
                ),
                text= hover_text,
                name='Current: potential_session_end'
            )
            traces.append(trace2)
        else:
            # Add a dummy trace if condition not met to maintain consistent count
            traces.append(go.Scattermapbox(
                lat=[lat],
                lon=[lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=0,
                    color='rgba(0,0,0,0)'
                ),
            ))   

        # Add potential newly active location trace if condition is met
        if not row['potential_newly_active_location']:
            trace3 = go.Scattermapbox(
                lat=[lat],
                lon=[lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=10,
                    color='blue',
                    opacity=1
                ),
                text=hover_text,
                name='Current: potential_newly_active_loc'
            )
            traces.append(trace3)
        else:
            # Add a dummy trace if condition not met to maintain consistent count
            traces.append(go.Scattermapbox(
                lat=[lat],
                lon=[lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=0,
                    color='rgba(0,0,0,0)'
                ),
            ))   
        
        if row['switching_overshoot']:
            trace3 = go.Scattermapbox(
                lat=[lat],
                lon=[lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=8.5,
                    color='purple',
                    opacity=1
                ),
                text=hover_text,
                name='Current: switching overshoot'
            )
            traces.append(trace3)
        else:
            # Add a dummy trace if condition not met to maintain consistent count
            traces.append(go.Scattermapbox(
                lat=[lat],
                lon=[lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=0,
                    color='rgba(0,0,0,0)'
                ),
            ))   
        if not row['potential_overshoot'] and not row["potential_session_end"]: #and  row["potential_newly_active_location"] :
            trace4 = go.Scattermapbox(
                lat=[lat],
                lon=[lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=20,
                    color='yellow',
                    opacity=1
                ),
                text=hover_text,
                # hoverinfo=
                name='Current: No conditions met'
            )
            traces.append(trace4)
        else:
            # Add a dummy trace if condition not met to maintain consistent count
            traces.append(go.Scattermapbox(
                lat=[lat],
                lon=[lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=0,
                    color='rgba(0,0,0,0)'
                ),
            ))   

        # If not the first point, add traces from the previous point
        if i > 0: #pd.to_datetime(df['START_DATE']).idxmin():
            prev_row = df.iloc[i - 1]
            prev_lat = prev_row['location_latitude']
            prev_lon = prev_row['location_longitude']
            prev_hover_text = f"Latitude: {prev_lat}, Longitude: {prev_lon}<br>"
            for col in hover_cols:
                prev_hover_text += f"{col}: {prev_row[col]}<br>"
        else:
            prev_row = df.iloc[0]
            prev_lat = prev_row['location_latitude']
            prev_lon = prev_row['location_longitude']
            prev_hover_text =None
            # Add potential overshoot trace from the previous point
        if prev_row['potential_overshoot']:
            prev_trace1 = go.Scattermapbox(
                lat=[prev_lat],
                lon=[prev_lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=22,
                    color='red',
                    opacity=0.6
                ),
                text=prev_hover_text,
                name='Previous: potential_overshoot'
            )
            traces.append(prev_trace1)
        else:
            # Add a dummy trace if condition not met to maintain consistent count
            traces.append(go.Scattermapbox(
                lat=[prev_lat],
                lon=[prev_lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=0,
                    color='rgba(0,0,0,0)'
                ),
            ))   
        
        # Add potential session end trace from the previous point
        if prev_row['potential_session_end']:
            prev_trace2 = go.Scattermapbox(
                lat=[prev_lat],
                lon=[prev_lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=13,
                    color='green',
                    opacity=0.6
                ),
                text=prev_hover_text,
                name='Previous: potential_session_end'
            )
            traces.append(prev_trace2)
        else:
            # Add a dummy trace if condition not met to maintain consistent count
            traces.append(go.Scattermapbox(
                lat=[prev_lat],
                lon=[prev_lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=0,
                    color='rgba(0,0,0,0)'
                ),
            ))               
        # Add potential newly active location trace from the previous point
        if not prev_row['potential_newly_active_location']:
            prev_trace3 = go.Scattermapbox(
                lat=[prev_lat],
                lon=[prev_lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=9,
                    color='blue',
                    opacity=0.6
                ),
                text=prev_hover_text,
                name='Previous: potential_newly_active_loc'
            )
            traces.append(prev_trace3)
        else:
            # Add a dummy trace if condition not met to maintain consistent count
            traces.append(go.Scattermapbox(
                lat=[prev_lat],
                lon=[prev_lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=0,
                    color='rgba(0,0,0,0)'
                ),
            ))   
        if prev_row['switching_overshoot']:
            prev_trace3 = go.Scattermapbox(
                lat=[prev_lat],
                lon=[prev_lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=7,
                    color='purple',
                    opacity=0.6
                ),
                text=prev_hover_text,
                name='Previous: switching_overshoot'
            )
            traces.append(prev_trace3)
        else:
            # Add a dummy trace if condition not met to maintain consistent count
            traces.append(go.Scattermapbox(
                lat=[prev_lat],
                lon=[prev_lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=0,
                    color='rgba(0,0,0,0)'
                ),
            ))               
        # Add "All False" trace from the previous point
        if not prev_row['potential_overshoot'] and not prev_row["potential_session_end"]: #and  prev_row["potential_newly_active_location"]:
            prev_trace4 = go.Scattermapbox(
                lat=[prev_lat],
                lon=[prev_lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=18,
                    color='yellow',
                    opacity=0.6
                ),
                text=prev_hover_text,
                name='Previous: No conditions met'
            )
            traces.append(prev_trace4)
        else:
            # Add a dummy trace if condition not met to maintain consistent count
            traces.append(go.Scattermapbox(
                lat=[prev_lat],
                lon=[prev_lon],
                mode='markers',
                marker=go.scattermapbox.Marker(
                    size=0,
                    color='rgba(0,0,0,0)'
                ),
            ))
    # Create a frame for this time step with all traces
        frames.append(go.Frame(data=traces, name=row['START_DATE']))
       
    # Add the first frame to the figure
    if frames:
        for trace in frames[0].data:
            fig.add_trace(trace)

    fig.update_layout(
        mapbox_style="carto-positron",
        mapbox_zoom=12,
        mapbox_center={"lat": df['location_latitude'].mean(), "lon": df['location_longitude'].mean()},
        title="Points Over Time"
    )

    # Add frames to the figure
    fig.update(frames=frames)

    # Create slider steps
    slider_steps = []
    for i, frame in enumerate(frames):
        step = dict(
            method="animate",
            args=[[frame.name], dict(frame=dict(duration=0, redraw=True), mode='immediate')],
            label=str(frame.name)  # Use START_DATE as label
        )
        slider_steps.append(step)

    # Add slider
    fig.update_layout(
        sliders=[dict(
            steps=slider_steps,
            active=len(frames)-1,
            currentvalue={"prefix": "Time: "},
        )]
    )

    # Add animation controls
    fig.update_layout(
        updatemenus=[dict(
            type="buttons",
            showactive=False,
            buttons=[
                dict(label="Play",
                     method="animate",
                     args=[None, dict(frame=dict(duration=500, redraw=True), fromcurrent=True, mode='immediate')]),
                dict(label="Pause",
                     method="animate",
                     args=[[None], dict(frame=dict(duration=0, redraw=False), mode='immediate')]),
                dict(label="Slow",
                     method="animate",
                     args=[None, dict(frame=dict(duration=2000, redraw=True), fromcurrent=True, mode='immediate')]),
                dict(label="Medium",
                     method="animate",
                     args=[None, dict(frame=dict(duration=1000, redraw=True), fromcurrent=True, mode='immediate')]),
                dict(label="Fast",
                     method="animate",
                     args=[None, dict(frame=dict(duration=500, redraw=True), fromcurrent=True, mode='immediate')])
            ]
        )]
    )

    # Display the interactive plot
    fig.show()

    # Save the figure as HTML file with CDN for Plotly.js
    fig.write_html("animated_map.html")




In [2]:
# Method 2: Simpler and more generalized
import pandas as pd
import numpy as np
from geopy.distance import geodesic, great_circle
from vcis.utils.utils import CDR_Utils
utils = CDR_Utils()
def get_common_ids(df, size=2000):
    def get_ids_within_threshold(df, point_value, threshold, axis='location_latitude'):
        # point_value = df.loc[df['cgi_id'] == point_id, axis].values[0]
        lower_bound = point_value - threshold
        upper_bound = point_value + threshold
        ids_within_threshold = df[(df[axis] >= lower_bound) & (df[axis] <= upper_bound)]['cgi_id'].tolist()
        return ids_within_threshold

    step_lat, step_lon = utils.get_step_size(size)
    # df_sorted_x = df.sort_values(by='location_latitude').copy()
    # df_sorted_y = df.sort_values(by='location_longitude').copy()
    unique_coords = df[['location_latitude', 'location_longitude']].drop_duplicates()
    neighbor_data = []

    for _, row in unique_coords.iterrows():
        x, y = row['location_latitude'], row['location_longitude']
        ids_within_threshold_x = get_ids_within_threshold(df, x, step_lat, axis='location_latitude')
        ids_within_threshold_y = get_ids_within_threshold(df, y, step_lon, axis='location_longitude')
        
        common_ids = list(set(ids_within_threshold_x) & set(ids_within_threshold_y))
        cgi_id = df[(df['location_latitude'] == x) & (df['location_longitude'] == y)]['cgi_id'].unique().tolist()
        difference_ids = list(set(common_ids) - set(cgi_id))
        neighbor_data.append({'location_latitude':x,'location_longitude':y,'neighbor_ids': difference_ids,'number_of_sectors':len(difference_ids)})
        # print(f"Remaining coordinates: {len(unique_coords) - len(neighbor_data)}")

    neighbor_df = pd.DataFrame(neighbor_data)
    return neighbor_df


def calculate_sectors_within_radius_single_point(latitude, longitude, radius, df):
    """
    Calculate the number of sectors within a given radius around a single point.
    
    Parameters:
    - latitude: float, latitude of the point
    - longitude: float, longitude of the point
    - radius: float, the radius in kilometers
    - df: pandas DataFrame containing 'location_latitude' and 'location_longitude' columns
    
    Returns:
    - int, number of sectors within the specified radius around the given point
    """
    sector_coordinates = np.column_stack((df['location_latitude'], df['location_longitude']))
    distances = np.array([geodesic((latitude, longitude), coord).kilometers for coord in sector_coordinates])
    is_within_radius = distances <= radius  # Boolean array indicating if each sector is within radius
    num_sectors_within_radius = np.sum(is_within_radius) - 1  # Exclude the sector itself
    return num_sectors_within_radius

def calculate_sectors_within_radius(df, radius):
    """
    Calculate the number of sectors within a given radius around each sector in the DataFrame.
    
    Parameters:
    - df: pandas DataFrame containing 'location_latitude' and 'location_longitude' columns
    - radius: float, the radius in kilometers
    
    Returns:
    - pandas Series, number of sectors within the specified radius for each row in df
    """
    # num_sectors_within_radius = []

    # for index, row in df.iterrows():
    #     num_sectors = calculate_sectors_within_radius_single_point(row['location_latitude'], row['location_longitude'], radius, df)
    #     num_sectors_within_radius.append(num_sectors)

    # df["number_of_sectors_in_{}km_radius".format(radius)] = pd.Series(num_sectors_within_radius, index=df.index)
    # return df["number_of_sectors_in_{}km_radius".format(radius)]
    
    # Define a function to calculate sectors within radius for each row
    def calculate_sectors(row):
        return calculate_sectors_within_radius_single_point(row['location_latitude'], row['location_longitude'], radius, df)
    
    # Apply the function to each row in the DataFrame
    num_sectors_within_radius = df.apply(calculate_sectors, axis=1)
    # Assign the result to a new column in the DataFrame
    df["number_of_sectors"] = num_sectors_within_radius
    
    # Return the calculated Series
    return num_sectors_within_radius


import pandas as pd
from geopy.distance import geodesic

def determine_correct_cgi_subset(df, start_idx=0, end_idx=30):
    # Convert START_DATE to datetime and sort by it
    df['START_DATE'] = pd.to_datetime(df['START_DATE'])
    df.sort_values('START_DATE', inplace=True)
    
    # Initialize variables
    switching_pairs = []
    last_cgi_id = None
    current_switching = []
    result = []
    switch_start_idx = None

    for idx, row in df[start_idx:end_idx].iterrows():
        current_cgi_id = row['cgi_id']

        # Check for switching pattern
        if last_cgi_id is not None and current_cgi_id != last_cgi_id:
            if len(current_switching) >= 2 and current_switching[-2] == current_cgi_id:
                current_switching.append(current_cgi_id)
                switching_pairs.append((current_switching, switch_start_idx, idx))
                current_switching = []
                switch_start_idx = None
            else:
                current_switching = [last_cgi_id, current_cgi_id]
                if switch_start_idx is None:
                    switch_start_idx = idx - 1  # Start from the previous index where the switch began
        
        last_cgi_id = current_cgi_id

    # Iterate through detected switching pairs
    for switch_pair, start_idx, stop_idx in switching_pairs:
        switch_cgi_ids = list(set(switch_pair))
        next_unique_cgi_ids = df.iloc[stop_idx + 1:]['cgi_id'].unique()[:4]
        print(switch_pair,'==========', next_unique_cgi_ids)
        if len(next_unique_cgi_ids) < 2:
            continue

        # Calculate distances
        distances = {cgi_id: 0 for cgi_id in switch_cgi_ids}
        print(distances)
        for cgi_id in switch_cgi_ids:
            cgi_coords = df[df['cgi_id'] == cgi_id][['location_latitude', 'location_longitude']].iloc[0]
            cgi_point = (cgi_coords['location_latitude'], cgi_coords['location_longitude'])

            total_distance = 0
            for next_cgi_id in next_unique_cgi_ids:
                next_cgi_coords = df[df['cgi_id'] == next_cgi_id][['location_latitude', 'location_longitude']].iloc[0]
                next_cgi_point = (next_cgi_coords['location_latitude'], next_cgi_coords['location_longitude'])
                total_distance += geodesic(cgi_point, next_cgi_point).meters
            
            distances[cgi_id] = total_distance / len(next_unique_cgi_ids)
            # print(distances)
        print(distances)
        # Determine the correct CGI ID
        correct_cgi_id = min(distances, key=distances.get)
        print(correct_cgi_id)
        overshoot_cgi_id = [cgi_id for cgi_id in switch_cgi_ids if cgi_id != correct_cgi_id][0]
        
        # df["potential_overshoot"] = False
        
        # Update the overshoot_flag column for relevant rows only
        switch_rows = df.index[start_idx:stop_idx + 1]
        df.loc[switch_rows[df.loc[switch_rows, 'cgi_id'] == correct_cgi_id], 'potential_overshoot'] = False
        df.loc[switch_rows[df.loc[switch_rows, 'cgi_id'] == overshoot_cgi_id], 'potential_overshoot'] = True
        # print(start_idx, "___________", stop_idx + 1)
        result.append((stop_idx, correct_cgi_id))

    return df




# def flag_overshoots_based_on_previous_sectors(df, radius, sector_threshold, time_threshold, num_sectors_column=None,  start_idx=0):
#     """
#     Flags current points as overshoots based on the number of sectors surrounding the previous point that was not an Overshoot.

#     Parameters:
#     - df: pandas DataFrame
#     - radius: float, the radius in kilometers to use for calculating surrounding sectors
#     - threshold: int or float, minimum threshold for flagging
#     - num_sectors_column: str, name of the column representing the number of sectors within radius (optional)

#     Returns:
#     - pandas Series of boolean values indicating overshoots
#     """
#     if num_sectors_column is None:
#         num_sectors_within_radius = calculate_sectors_within_radius(df, radius)
#     else:
#         num_sectors_within_radius = df[num_sectors_column]

    
#     df["potential_overshoot"] = False
    
#     end_idx = 12
#     df = determine_correct_cgi_subset(df, start_idx=0, end_idx=end_idx)
#     # print(df.head(10))
#     start_idx = end_idx+1
    
    
#     ## using apply instead of a for loop
#     # def calculate_flag(row):
#     #     i = row.name
#     #     j = row.name - 1
#     #     while j >= 0 and overshoot_flags.iloc[j] == True:
#     #         j -= 1
#     #     if j >= 0:
#     #     # distance = haversine_numpy(df.at[i, 'location_latitude'], df.at[i, 'location_longitude'],
#     #     #                     df.at[j, 'location_latitude'], df.at[j, 'location_longitude'])
#     #         distance = geodesic((df.at[i, 'location_latitude'], df.at[i, 'location_longitude']),
#     #                             (df.at[j, 'location_latitude'], df.at[j, 'location_longitude'])).kilometers
#     #         time_diff = (pd.to_datetime(df.at[i, 'START_DATE']) - pd.to_datetime(df.at[j, 'START_DATE'])).total_seconds() / 3600.0  # Calculate time difference in hours
#     #         print("time: ",time_diff,"      distance:  ",distance )
#     #         if distance < radius or time_diff>time_threshold:
#     #             print("False:",i, "    <--i, j--> ", j)
#     #             return False
#     #         else:
#     #             print("Depends on number of sectors",i, "    <--i, j--> ", j)
#     #             return num_sectors_within_radius.iloc[j] >= sector_threshold
#     #     else:
#     #         return False
#     # overshoot_flags.iloc[1:] = df.iloc[1:].apply(calculate_flag, axis=1)
#     # print(overshoot_flags)
#     # return overshoot_flags
#     overshoot_flags = pd.Series(index=df.index, dtype=bool)
#     overshoot_flags[:]= False  # Initialize the first point's flag as False
#     overshoot_flags.iloc[end_idx] = df.loc[end_idx,"potential_overshoot"]
#     if "START_DATE" in df.columns:  
#         for i in range(start_idx, len(df)):
#             j = i - 1
#             while j >= 0 and overshoot_flags.iloc[j]:
#                 j -= 1
#             distance = geodesic((df.at[i, 'location_latitude'], df.at[i, 'location_longitude']),
#                                     (df.at[j, 'location_latitude'], df.at[j, 'location_longitude'])).kilometers
#             time_diff = (pd.to_datetime(df.at[i, 'START_DATE']) - pd.to_datetime(df.at[j, 'START_DATE'])).total_seconds()/ 3600.0  # Calculate time difference in hours
#             # print("time: ",time_diff,"      distance:  ",distance )
#             if distance < radius or time_diff>time_threshold:
#                 # print("False:",i, "    <--i, j--> ", j)
#                 overshoot_flags.iloc[i] =False
#             else:
#                 # print("Depends on number of sectors",i, "    <--i, j--> ", j)
#                 overshoot_flags.iloc[i] = num_sectors_within_radius.iloc[j] >= sector_threshold
        
#         # Assign overshoot_flags back to the DataFrame
#         df.loc[start_idx:, "potential_overshoot"] = overshoot_flags[start_idx:]
#         # return overshoot_flags
    
#     elif "usage_timeframe" in df.columns: 
#         for i in range(start_idx, len(df)):
#             j = i - 1
#             while j >= 0 and overshoot_flags.iloc[j]:
#                 j -= 1
#             distance = geodesic((df.at[i, 'location_latitude'], df.at[i, 'location_longitude']),
#                                     (df.at[j, 'location_latitude'], df.at[j, 'location_longitude'])).kilometers
#             time_diff = (pd.to_datetime(df.at[i, 'usage_timeframe']) - pd.to_datetime(df.at[j, 'usage_timeframe'])).total_seconds() / 3600.0  # Calculate time difference in hours
#             # print("time: ",time_diff,"      distance:  ",distance )
            
#             if distance < radius:
#                 # print("False:",i, "    <--i, j--> ", j)
#                 overshoot_flags.iloc[i] =False
#             elif time_diff>time_threshold:
#                 next_idx = determine_correct_cgi_subset(df,i)
#             else:
#                 # print("Depends on number of sectors",i, "    <--i, j--> ", j)
#                 overshoot_flags.iloc[i] = num_sectors_within_radius.iloc[j] >= sector_threshold
        
#     # Assign overshoot_flags back to the DataFrame
#     df.loc[start_idx:, "potential_overshoot"] = overshoot_flags[start_idx:]
        
    
from math import radians, sin, cos, sqrt, atan2



In [3]:
import math
import pandas as pd
from geopy.distance import geodesic

def determine_correct_cgi_subset(df, start_idx=0, end_idx=len(df)-1):

    if start_idx<len(df)-1 and df.at[start_idx, 'cgi_id'] != df.at[start_idx + 1, 'cgi_id']:
        switching_pattern = {df.at[start_idx ,'cgi_id']:start_idx, df.at[start_idx+1, 'cgi_id']:start_idx+1 }
    else:
        return df ,start_idx
    # Identify the switching pattern
    switch_end_idx= start_idx+1
    for i in range(start_idx+2, end_idx-1):
        if i + 1 < len(df) and df.at[i, 'cgi_id'] != df.at[i + 1, 'cgi_id']:
            # print(df.at[i, 'cgi_id'],"---", df.at[i + 1, 'cgi_id'])
            if (df.at[i,"cgi_id"] in switching_pattern.keys()) and (df.at[i+1,"cgi_id"] in switching_pattern.keys()):
                
                switching_pattern[df.at[i ,'cgi_id']] = i
                switching_pattern[df.at[i+1 ,'cgi_id']] = i+1
                switch_end_idx = i+1
            else:
            
                break
        else:
            break
    
    if not switching_pattern or switch_end_idx == start_idx+1:
            return df ,start_idx
        
    print(switching_pattern)

    cgi_set = switching_pattern.keys()

    
    # if len(cgi_set) != 2:
    #     return df, end_idx
    
    cgi1, cgi2 = list(cgi_set)
    print(cgi1,"_____________",cgi2)
    next_unique_cgis = set()
    for i in range(switch_end_idx + 1, len(df)):
        if df.at[i, 'cgi_id'] not in cgi_set:
            next_unique_cgis.add(df.at[i, 'cgi_id'])
        else:
            continue
        if len(next_unique_cgis) == 4:
            break
    
    # if len(next_unique_cgis) < 2:
    #     return df, end_idx
    
    next_unique_cgis = list(next_unique_cgis)
    
    def calculate_distance(cgi_id, next_cgi_id):
        row_cgi = df[df['cgi_id'] == cgi_id].iloc[0]
        row_next_cgi = df[df['cgi_id'] == next_cgi_id].iloc[0]
        return geodesic(
            (row_cgi['location_latitude'], row_cgi['location_longitude']),
            (row_next_cgi['location_latitude'], row_next_cgi['location_longitude'])
        ).kilometers
    
    # Calculate average distance for cgi1
    distances_cgi1 = [calculate_distance(cgi1, next_cgi) for next_cgi in next_unique_cgis]
    if distances_cgi1:
        avg_distance_cgi1 = sum(distances_cgi1) / len(distances_cgi1)
    else:
        avg_distance_cgi1 = 0  # handle case where no distances are calculated

    # Calculate average distance for cgi2
    distances_cgi2 = [calculate_distance(cgi2, next_cgi) for next_cgi in next_unique_cgis]
    if distances_cgi2:
        avg_distance_cgi2 = sum(distances_cgi2) / len(distances_cgi2)
    else:
        avg_distance_cgi2 = 0  # handle case where no distances are calculated
    correct_cgi = cgi1 if avg_distance_cgi1 < avg_distance_cgi2 else cgi2
    overshoot_cgi = cgi2 if correct_cgi == cgi1 else cgi1
    
    df.loc[start_idx:switch_end_idx, 'switching_overshoot'] = (df.loc[start_idx:switch_end_idx, 'cgi_id'] == overshoot_cgi)
    
    return df, switch_end_idx

def flag_overshoots_based_on_previous_sectors(df,  sector_threshold, time_threshold, num_sectors_column=None,radius=2000, start_idx=1):
    
    if num_sectors_column is not None:
       num_sectors_within_radius = df[num_sectors_column]
    elif 'number_of_sectors'  in df.columns:
        num_sectors_within_radius = df["number_of_sectors"]
    else:
         # num_sectors_within_radius = calculate_sectors_within_radius(df, radius)
        num__of_sectors = get_common_ids(df,radius)
        temp = df.merge(num__of_sectors, on=['location_latitude', 'location_longitude'], how='inner').sort_values("START_DATE").reset_index(drop=True)
        num_sectors_within_radius= temp["number_of_sectors"]
        df["number_of_sectors"]= num_sectors_within_radius  
    
    df.loc[0,"potential_overshoot"] = False
    df.loc[0,"switching_overshoot"] = False

    # print(num_sectors_within_radius)


    
# while start_idx < len(df):
    print("loop 1")
    for i in range(1, len(df)):
        # start_idx = i
        j = i - 1
        while j >= 0 and df.at[j, 'potential_overshoot']:
            j -= 1
        distance = geodesic(
            (df.at[i, 'location_latitude'], df.at[i, 'location_longitude']),
            (df.at[j, 'location_latitude'], df.at[j, 'location_longitude'])
        ).meters
        time_diff = (pd.to_datetime(df.at[i, 'START_DATE']) - pd.to_datetime(df.at[j, 'START_DATE'])).total_seconds() / 3600.0
      
        if distance < radius:
            df.at[i, 'potential_overshoot'] = False    
        # elif time_diff > time_threshold:
        #     df.at[i, 'potential_overshoot'] = False
            ## the statements below are related to switching 
            # df, switch_end_idx = determine_correct_cgi_subset(df, start_idx=i)
            # if switch_end_idx == i:
            #     df.at[i, 'switching_overshoot'] = False
            # else:
            #     start_idx = switch_end_idx 
            #     break
        else:
            df.at[i, 'potential_overshoot'] = num_sectors_within_radius.iloc[j] >= sector_threshold
    # df.loc[start_idx:, "potential_overshoot"] = overshoot_flags[start_idx:]
    # start_idx += 1

    
    print("loop2 ") 
    while start_idx < len(df):
        print("start index: ",start_idx)
        for i in range(start_idx, len(df)):
            start_idx = i
            j = i - 1
            while j >= 0 and df.at[j, 'potential_overshoot']:
                j -= 1
            print("i: ",i,"-----------","j:",j)
            
            distance = geodesic(
                (df.at[i, 'location_latitude'], df.at[i, 'location_longitude']),
                (df.at[j, 'location_latitude'], df.at[j, 'location_longitude'])
            ).meters
        # if i!=1:
        #     time_diff = (pd.to_datetime(df.at[i, 'START_DATE']) - pd.to_datetime(df.at[j, 'START_DATE'])).total_seconds() / 3600.0
                
        # else: 
        #     time_diff = math.inf  
        
            if distance < radius:
                break
        # if time_diff > time_threshold:
            else:
                df, switch_end_idx = determine_correct_cgi_subset(df, start_idx=i)
                if switch_end_idx == i:
                    df.at[i, 'switching_overshoot'] = False
                else:
                    start_idx = switch_end_idx 
                    break     
        start_idx += 1
        df.loc[df['potential_overshoot'] == False, 'switching_overshoot'] = False

        
    return df



In [204]:
## Method 2:
radius = 2  # 1 kilometer  
sector_threshold = 5 # sectors
time_threshold =1 #hour


# intermediary = pd.merge(CDR, bts_towers, left_on="CELL_ID_START",right_on='cgi_id' )

# intermediary= intermediary[['cgi_id','location_latitude','location_longitude','location_azimuth']]
# intermediary = intermediary.drop_duplicates(subset=["cgi_id",'location_latitude','location_longitude','location_azimuth'])
# calculate_sectors_within_radius(intermediary,radius)
# intermediary

intermediary = df.drop_duplicates(subset=['cgi_id','location_latitude','location_longitude','location_azimuth'])
calculate_sectors_within_radius(intermediary,radius)
# intermediary

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["number_of_sectors"] = num_sectors_within_radius


0         14
48       169
324       25
326       14
328       32
        ... 
25501    145
25532    145
25951    171
29458    169
29849    185
Length: 336, dtype: int64

In [4]:
import pandas as pd
from datetime import timedelta

def calculate_time_spent_so_far(df):
    # df = df.sort_values(by='START_DATE').reset_index(drop=True)
    df['time_spent_so_far'] = 0
    if "START_DATE" in df.columns:
        df['START_DATE'] = pd.to_datetime(df["START_DATE"])
        previous_cgi_id =df.loc[0, 'cgi_id']
        accumulated_time = 0

        for i in range(1, len(df)):
            current_cgi_id = df.loc[i, 'cgi_id']
            previous_start_date = df.loc[i-1, 'START_DATE']
            current_start_date = df.loc[i, 'START_DATE']
            
            # print(f"Processing row {i}:")
            # print(f"  Previous CGI ID: {previous_cgi_id}, Current CGI ID: {current_cgi_id}")
            # print(f"  Previous Start Date: {previous_start_date}, Current Start Date: {current_start_date}")
            
            if current_cgi_id == previous_cgi_id:
                time_diff = current_start_date - previous_start_date
                accumulated_time += time_diff
                # print(f"  Accumulated Time: {accumulated_time}")
            else:
                accumulated_time = 0
                # print("  Resetting accumulated time.")

            df.loc[i, 'time_spent_so_far'] = accumulated_time
            # print(f"  Time Spent So Far (seconds): {df.loc[i, 'time_spent_so_far']}")
            
            previous_cgi_id = current_cgi_id
            

        return df
    else:    
        previous_cgi_id =df.loc[0, 'cgi_id']
        accumulated_time = 0

        for i in range(1, len(df)):
            current_cgi_id = df.loc[i, 'cgi_id']
            previous_start_date = df.loc[i-1, 'usage_timeframe']
            current_start_date = df.loc[i, 'usage_timeframe']
            
            # print(f"Processing row {i}:")
            # print(f"  Previous CGI ID: {previous_cgi_id}, Current CGI ID: {current_cgi_id}")
            # print(f"  Previous Start Date: {previous_start_date}, Current Start Date: {current_start_date}")
            
            if current_cgi_id == previous_cgi_id:
                time_diff = current_start_date - previous_start_date
                accumulated_time += time_diff
                # print(f"  Accumulated Time: {accumulated_time}")
            else:
                accumulated_time = 0
                # print("  Resetting accumulated time.")

            df.loc[i, 'time_spent_so_far'] = accumulated_time
            # print(f"  Time Spent So Far (seconds): {df.loc[i, 'time_spent_so_far']}")
            
            previous_cgi_id = current_cgi_id
            

        return df
    
def generate_time_spent_flag(df, threshold_minutes):
    df = calculate_time_spent_so_far(df)
    threshold_seconds = threshold_minutes * 60  # Convert minutes to seconds
    df['time_spent_flag'] = df['time_spent_so_far'] > threshold_seconds
    return df


def generate_visited_flag(df, time_window):
    df['potential_session_end'] = False
    
    # Dictionary to keep track of the last visit time for each cgi_id
    last_visit_time = {}
    
    for index, row in df.iterrows():
        current_time = row['END_DATE']
        
        if row['potential_overshoot'] == False:
            df.at[index, 'potential_session_end'] = None
        else:
            cgi_id = row['cgi_id']
            
            if cgi_id in last_visit_time:
                last_time = last_visit_time[cgi_id]
                time_diff = current_time - last_time
                
                if time_diff >= timedelta(minutes=time_window):
                    df.at[index, 'potential_session_end'] = True
                # else:
                #     df.at[index, 'potential_session_end'] = False
            else:
                df.at[index, 'potential_session_end'] = False
            
            # Update the last visit time for this cgi_id
            last_visit_time[cgi_id] = current_time
    
    return df
def generate_duration_at_bts_flag(df, threshold_minutes):
    
    df['potential_newly_active_location'] = ((df['Duration(s)'] > (threshold_minutes * 60)) & (df['Duration(s)'] < (24*3600))& df["potential_overshoot"])
    return df   


In [6]:
import pandas as pd

# Assuming df is your initial DataFrame with the provided data
# data = {
#     'imsi_id': [151967466595180, 151967466595180, 151967466595180, 151967466595180, 151967466595180, 
#                 151967466595180, 151967466595180, 151967466595180, 151967466595180, 151967466595180, 
#                 151967466595180, 151967466595180, 151967466595180, 151967466595180],
#     'usage_timeframe': [1704810537714, 1704810537715, 1704810537946, 1704810540812, 1704810540813, 
#                         1704810540880, 1704810546534, 1704810546535, 1704810546618, 1704810580753, 
#                         1704810606973, 1704810606974, 1704810620192, 1704810647775],
#     'cgi_id': [10570761, 10570761, 10570761, 10540807, 10540807, 10540807, 10570761, 10570761, 
#                10570761, 10570761, 10517510, 10517510, 10517510, 10517510],
#     'imei_id': [509593926863172, 509593926863172, 509593926863172, 509593926863172, 509593926863172, 
#                 509593926863172, 509593926863172, 509593926863172, 509593926863172, 509593926863172, 
#                 509593926863172, 509593926863172, 509593926863172, 509593926863172],
#     'location_azimuth': [10, 10, 10, 140, 140, 140, 10, 10, 10, 10, 30, 30, 30, 30],
#     'location_latitude': [33.8693, 33.8693, 33.8693, 33.8748, 33.8748, 33.8748, 33.8693, 33.8693, 
#                           33.8693, 33.8693, 33.8634, 33.8634, 33.8634, 33.8634],
#     'location_longitude': [35.5124, 35.5124, 35.5124, 35.5055, 35.5055, 35.5055, 35.5124, 35.5124, 
#                            35.5124, 35.5124, 35.5047, 35.5047, 35.5047, 35.5047]
# }

# df = pd.DataFrame(data)

# Convert usage_timeframe to datetime format
df['usage_timeframe'] = pd.to_datetime(df['usage_timeframe'], unit='ms')

# Find consecutive sessions based on changes in cgi_id
df['session_id'] = (df['cgi_id'] != df['cgi_id'].shift(1)).cumsum()

# Group by session_id and aggregate
result_df = df.groupby('session_id').agg({
    # 'session_id':'first',
    'imsi_id': 'first',
    'cgi_id': 'first',
    'imei_id': 'first',
    'location_azimuth': 'first',
    'location_latitude': 'first',
    'location_longitude': 'first',
    'usage_timeframe': ['min', 'max', lambda x: (x.max() - x.min()).total_seconds()]
}).reset_index(drop=True)

# Flatten multi-index columns
result_df.columns = ['imsi_id', 'cgi_id', 'imei_id', 'location_azimuth', 'location_latitude', 
                     'location_longitude', 'START_DATE', 'END_DATE', 'Duration(s)']
print
result_df= result_df.sort_values("START_DATE")

In [2]:
radius = 2000  # 2000 meters 
sector_threshold = 5 # sectors
time_threshold =1 #hour
# df.drop_duplicates(subset=["location_latitude","location_longitude"])
result_df = flag_overshoots_based_on_previous_sectors(result_df,sector_threshold=sector_threshold,time_threshold=time_threshold,radius=radius)

NameError: name 'flag_overshoots_based_on_previous_sectors' is not defined

In [8]:
result_df

Unnamed: 0,imsi_id,cgi_id,imei_id,location_azimuth,location_latitude,location_longitude,START_DATE,END_DATE,Duration(s),number_of_sectors,potential_overshoot,switching_overshoot
0,151967466595180,10521857,509593926863172,60,33.8901,35.5566,2024-01-09 10:43:33.785,2024-01-09 11:00:27.564,1013.779,10,False,False
1,151967466595180,10615559,509593926863172,210,33.8815,35.4886,2024-01-09 11:00:46.581,2024-01-09 11:00:46.581,0.000,174,True,True
2,151967466595180,10521857,509593926863172,60,33.8901,35.5566,2024-01-09 11:00:48.013,2024-01-09 11:25:41.748,1493.735,10,False,False
3,151967466595180,10615559,509593926863172,210,33.8815,35.4886,2024-01-09 11:26:00.856,2024-01-09 11:26:00.856,0.000,174,True,True
4,151967466595180,10521857,509593926863172,60,33.8901,35.5566,2024-01-09 11:26:02.840,2024-01-09 11:30:15.899,253.059,10,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
4190,151967466595180,10517505,509593926863172,30,33.8634,35.5047,2024-01-18 07:36:24.810,2024-01-18 07:36:37.781,12.971,120,False,False
4191,151967466595180,10570756,509593926863172,10,33.8693,35.5124,2024-01-18 07:36:54.156,2024-01-18 07:37:18.598,24.442,167,False,False
4192,151967466595180,10556674,509593926863172,170,33.8737,35.5149,2024-01-18 07:37:54.833,2024-01-18 07:37:54.833,0.000,151,False,False
4193,151967466595180,10570753,509593926863172,110,33.8693,35.5124,2024-01-18 07:37:58.949,2024-01-18 07:38:37.854,38.905,167,False,False


In [196]:
pd.set_option("display.max_rows",100)


In [None]:

# sector_threshold = 5 # sectors
# time_threshold =1 #hour

# joined_results = pd.merge(result_df, intermediary , on="cgi_id")
# joined_results.drop([col for col in joined_results.columns if '_y' in col], axis=1, inplace=True)
# # Rename columns to remove suffixes
# joined_results.columns = joined_results.columns.str.replace('_x', '')

# joined_results= joined_results.sort_values("START_DATE").reset_index(drop=True)
# #  df = df[500:900].reset_index()
# flag_overshoots_based_on_previous_sectors(joined_results, sector_threshold, time_threshold,num_sectors_column="number_of_sectors",radius=2000)
# # joined_results.head(40)


In [243]:
joined_results = generate_visited_flag(result_df,time_window=30)
# joined_results

In [244]:

time_spent_threshold = 30 #minutes

# Call the function
joined_results = generate_duration_at_bts_flag(joined_results, threshold_minutes=30)
  


In [1]:
from vcis.databases.cassandra.cassandra_tools import CassandraTools
from vcis.databases.cassandra_spark.cassandra_spark_tools import CassandraSparkTools
from vcis.session_detection.session_detection_main import SessionDetectionMain

detector = SessionDetectionMain()
radius = 2000  # 2000 meters  
sector_threshold = 5 # sectors
time_difference_threshold =600 # 10 hours
new_active_location_threshold = 180 #minutes
time_window = 180 #minutes
start_date = '2022-05-01'
end_date = '2025-05-01'
default_fetch_size = 30000
df = detector.generate_overshoot_flags("151967466595180","10.1.10.110",start_date,end_date,default_fetch_size,radius,sector_threshold,time_difference_threshold,time_window,new_active_location_threshold)
df

Unnamed: 0,imsi_id,cgi_id,imei_id,location_azimuth,location_latitude,location_longitude,START_DATE,END_DATE,Duration(s),number_of_sectors,potential_overshoot,switching_overshoot,potential_session_end,potential_newly_active_location,Overshoot
0,151967466595180,10521857,509593926863172,60,33.8901,35.5566,2024-01-09 10:43:33.785,2024-01-09 11:00:27.564,1013.779,10,False,False,,False,False
1,151967466595180,10615559,509593926863172,210,33.8815,35.4886,2024-01-09 11:00:46.581,2024-01-09 11:00:46.581,0.000,174,True,True,False,False,True
2,151967466595180,10521857,509593926863172,60,33.8901,35.5566,2024-01-09 11:00:48.013,2024-01-09 11:25:41.748,1493.735,10,False,False,,False,False
3,151967466595180,10615559,509593926863172,210,33.8815,35.4886,2024-01-09 11:26:00.856,2024-01-09 11:26:00.856,0.000,174,True,True,False,False,True
4,151967466595180,10521857,509593926863172,60,33.8901,35.5566,2024-01-09 11:26:02.840,2024-01-09 11:30:15.899,253.059,10,False,False,,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4190,151967466595180,10517505,509593926863172,30,33.8634,35.5047,2024-01-18 07:36:24.810,2024-01-18 07:36:37.781,12.971,120,False,False,,False,False
4191,151967466595180,10570756,509593926863172,10,33.8693,35.5124,2024-01-18 07:36:54.156,2024-01-18 07:37:18.598,24.442,167,False,False,,False,False
4192,151967466595180,10556674,509593926863172,170,33.8737,35.5149,2024-01-18 07:37:54.833,2024-01-18 07:37:54.833,0.000,151,False,False,,False,False
4193,151967466595180,10570753,509593926863172,110,33.8693,35.5124,2024-01-18 07:37:58.949,2024-01-18 07:38:37.854,38.905,167,False,False,,False,False


In [2]:
df

In [2]:
print(df["potential_session_end"].value_counts())
print(df["potential_overshoot"].value_counts())
print(df["potential_newly_active_location"].value_counts())
print(df["switching_overshoot"].value_counts())
print(df["Overshoot"].value_counts())
# #Before:
# False    124
# True      31
# Name: potential_session_end, dtype: int64
# False    4040
# True      155
# Name: potential_overshoot, dtype: int64
# False    4194
# True        1
# Name: potential_newly_active_location, dtype: int64
# False    4108
# True       87
# Name: switching_overshoot, dtype: int64
# False    4041
# True      154
# Name: Overshoot, dtype: int64
# #After:


False    120
True      18
Name: potential_session_end, dtype: int64
False    4057
True      138
Name: potential_overshoot, dtype: int64
False    4195
Name: potential_newly_active_location, dtype: int64
False    4104
True       91
Name: switching_overshoot, dtype: int64
False    4057
True      138
Name: Overshoot, dtype: int64


In [4]:
df

Unnamed: 0,imsi_id,cgi_id,imei_id,location_azimuth,location_latitude,location_longitude,START_DATE,END_DATE,Duration(s),number_of_sectors,potential_overshoot,switching_overshoot
0,151967466595180,10521857,509593926863172,60,33.8901,35.5566,2024-01-09 10:43:33.785,2024-01-09 11:00:27.564,1013.779,10,False,False
1,151967466595180,10615559,509593926863172,210,33.8815,35.4886,2024-01-09 11:00:46.581,2024-01-09 11:00:46.581,0.000,174,True,True
2,151967466595180,10521857,509593926863172,60,33.8901,35.5566,2024-01-09 11:00:48.013,2024-01-09 11:25:41.748,1493.735,10,False,False
3,151967466595180,10615559,509593926863172,210,33.8815,35.4886,2024-01-09 11:26:00.856,2024-01-09 11:26:00.856,0.000,174,True,True
4,151967466595180,10521857,509593926863172,60,33.8901,35.5566,2024-01-09 11:26:02.840,2024-01-09 11:30:15.899,253.059,10,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
4190,151967466595180,10517505,509593926863172,30,33.8634,35.5047,2024-01-18 07:36:24.810,2024-01-18 07:36:37.781,12.971,120,False,False
4191,151967466595180,10570756,509593926863172,10,33.8693,35.5124,2024-01-18 07:36:54.156,2024-01-18 07:37:18.598,24.442,167,False,False
4192,151967466595180,10556674,509593926863172,170,33.8737,35.5149,2024-01-18 07:37:54.833,2024-01-18 07:37:54.833,0.000,151,False,False
4193,151967466595180,10570753,509593926863172,110,33.8693,35.5124,2024-01-18 07:37:58.949,2024-01-18 07:38:37.854,38.905,167,False,False


In [273]:
joined_results[ joined_results["potential_overshoot"]&  ((joined_results["potential_session_end"]==False)& (joined_results["potential_newly_active_location"]==True) & (joined_results["switching_overshoot"]==False))]

Unnamed: 0,imsi_id,cgi_id,imei_id,location_azimuth,location_latitude,location_longitude,START_DATE,END_DATE,Duration(s),number_of_sectors,potential_overshoot,switching_overshoot,potential_session_end,potential_newly_active_location,Overshoot


In [274]:

joined_results["Overshoot"]= "undetermined"
  
# Default to 'undetermined'
joined_results.loc[joined_results['potential_overshoot'] == False, 'Overshoot'] = False  
# If 'a' is False, set 'new_column' to False
joined_results.loc[joined_results['potential_overshoot'] & (joined_results['potential_session_end'] | joined_results['switching_overshoot'] | (joined_results['potential_newly_active_location'] ==False)), 'Overshoot'] = True
joined_results.loc[joined_results['potential_overshoot'] & (joined_results['potential_newly_active_location']), 'Overshoot'] = False  # If 'a' is True and 'd' is False, set 'Overshoot' to False
 

In [63]:
hover_cols = ['cgi_id','potential_session_end','potential_newly_active_location',"START_DATE"]

plot_trace(df[4050:4070].reset_index(drop=True),hover_cols=hover_cols)


In [62]:
df[df["potential_overshoot"] | df["switching_overshoot"]][120:].head(50)

Unnamed: 0,imsi_id,cgi_id,imei_id,location_azimuth,location_latitude,location_longitude,START_DATE,END_DATE,Duration(s),number_of_sectors,potential_overshoot,switching_overshoot,potential_session_end,potential_newly_active_location,Overshoot
3726,151967466595180,10532614,509593926863172,50,33.8838,35.492,2024-01-17 13:18:44.611,2024-01-17 13:18:44.611,0.0,175,True,True,False,False,True
3773,151967466595180,10584071,509593926863172,155,33.8948,35.505,2024-01-17 14:32:37.860,2024-01-17 14:32:37.860,0.0,170,True,False,False,False,True
3866,151967466595180,10532614,509593926863172,50,33.8838,35.492,2024-01-17 15:15:16.721,2024-01-17 15:15:16.721,0.0,175,True,True,False,False,True
3868,151967466595180,10532614,509593926863172,50,33.8838,35.492,2024-01-17 15:17:19.581,2024-01-17 15:17:19.581,0.0,175,True,True,False,False,True
3870,151967466595180,10532614,509593926863172,50,33.8838,35.492,2024-01-17 15:21:42.631,2024-01-17 15:21:42.631,0.0,175,True,True,False,False,True
3872,151967466595180,10532614,509593926863172,50,33.8838,35.492,2024-01-17 15:22:08.874,2024-01-17 15:22:08.874,0.0,175,True,True,False,False,True
3907,151967466595180,10532614,509593926863172,50,33.8838,35.492,2024-01-17 15:47:18.159,2024-01-17 15:47:18.159,0.0,175,True,False,False,False,True
3931,151967466595180,10532614,509593926863172,50,33.8838,35.492,2024-01-17 16:02:43.769,2024-01-17 16:02:43.769,0.0,175,True,False,False,False,True
3936,151967466595180,10532614,509593926863172,50,33.8838,35.492,2024-01-17 16:11:52.533,2024-01-17 16:11:52.533,0.0,175,True,True,False,False,True
3938,151967466595180,10532614,509593926863172,50,33.8838,35.492,2024-01-17 16:26:39.582,2024-01-17 16:26:39.582,0.0,175,True,True,False,False,True


In [None]:
import math
import folium
import haversine

def get_correlation_plot_folium(df):
        # df_geo = self.cassandra_tools.get_device_history_geo(device_id= geo_id ,start_date= start_date ,end_date= end_date,server = server)
        # df_imsi = self.cassandra_tools.get_device_history_imsi(imsi_id= imsi_id , start_date= start_date , end_date= end_date,server= server)
        m = folium.Map(location=[df['location_latitude'][0],df['location_longitude'][0]], zoom_start=15)
        
        # df_geo = self.utils.convert_ms_to_datetime(df_geo)
        
        df = df.dropna(subset=['location_latitude','location_longitude','location_azimuth'])
        for index , row in df.iterrows():
            latitude = row['location_latitude']
            longitude = row['location_longitude']
            if not row['location_azimuth'].isdigit():
                continue
            azimuth=int(row['location_azimuth'])
            popup = row['START_DATE']
            triangle_coordinates = calculate_sector_triangle(latitude, longitude, azimuth)
            folium.Polygon(locations=triangle_coordinates, color= "red" if row["OvershootFlag"] else "green", fill=True,  fill_opacity=0.1).add_to(m)
            folium.CircleMarker([latitude, longitude],radius=1,color='red',popup=popup).add_to(m)
            
        # for index , row in df_geo.iterrows():
        #     latitude = row['location_latitude']
        #     longitude = row['location_longitude']
        #     popup = row['usage_timeframe']
        #     folium.CircleMarker([latitude, longitude],radius=1,color='blue',popup=popup).add_to(m)
        # time = datetime.now()
        m.save( 'Correlation_map.html')
        return m
        
def calculate_coordinates( latitude, longitude, azimuth_deg , distance_km = 0.15 ,radius = 6371):
        earth_radius = radius

        azimuth_rad = math.radians(azimuth_deg)

        new_lat = math.degrees(math.asin(math.sin(math.radians(latitude)) * math.cos(distance_km / earth_radius) +
                                         math.cos(math.radians(latitude)) * math.sin(distance_km / earth_radius) *
                                         math.cos(azimuth_rad)))
        new_lon = longitude + math.degrees(math.atan2(math.sin(azimuth_rad) * math.sin(distance_km / earth_radius) * math.cos(math.radians(latitude)),
                                                           math.cos(distance_km / earth_radius) - math.sin(math.radians(latitude)) * math.sin(math.radians(new_lat))))
        return    new_lat,new_lon
    
def calculate_sector_triangle(latitude, longitude, azimuth, distance_km:float = 0.15):
    azimuth1 = (azimuth - 30) % 360
    azimuth2 = (azimuth + 30) % 360

    vertex1 = calculate_coordinates(latitude, longitude, azimuth1 )
    vertex2 = calculate_coordinates(latitude, longitude, azimuth2 )

    return [
        list([latitude,longitude]),
        list(vertex1),
        list(vertex2)
    ]

def find_circumcenter_radius(latitude, longitude, azimuth):
    vertices = calculate_sector_triangle(latitude, longitude, azimuth)
    if len(vertices) != 3:
        raise ValueError("The input list must contain exactly three vertices.")

    circumcenter_x = (vertices[0][0] + vertices[1][0] + vertices[2][0]) / 3
    circumcenter_y = (vertices[0][1] + vertices[1][1] + vertices[2][1]) / 3
    
    circumcenter = (circumcenter_x, circumcenter_y)
    radius = haversine(circumcenter[0], circumcenter[1], vertices[0][0], vertices[0][1])

    return circumcenter, radius
get_correlation_plot_folium(df)
