# Load CSV file containing duplicates

Enter your API key in the cell below before running anything else. Only a single API key is supported here.

In [None]:
# Gemini API key, later passed to genai.configure(). Replace the placeholder
# with a real key before running the cells that call the API.
API_KEY = "Enter your API key here"

In [None]:
import pandas as pd
# Load the raw news records (duplicates included). The path is relative to the
# working directory the notebook is run from.
duplicates_df = pd.read_csv("Duplicate_removal/Duplicates.csv")

In [None]:
# Preview the last rows to confirm the CSV loaded as expected.
duplicates_df.tail()

Separate the individual fields from the combined LLM response

In [None]:
# Names of the fields packed into "LLM Response", in the order they appear.
# NOTE(review): 'Vehicles Involced' looks like a typo for 'Vehicles Involved';
# it is kept as-is because renaming it would change the output schema.
new_columns = [
    'Report Type',
    'Publish Date',
    'Accident Date',
    'Time of Accident',
    'Killed',
    'Injured',
    'Location',
    'Road Type',
    'Pedestrian Involved',
    'Vehicles Involced',
    'District',
]

# Split each "LLM Response" on the "<sep>" marker into positional columns,
# then force exactly len(new_columns) columns: surplus columns are dropped and
# columns that are absent for every row are created and filled with "ERROR".
# NOTE(review): a single row with too few fields keeps missing values (not
# "ERROR") in its trailing cells, because fill_value only applies to columns
# that reindex has to create.
split_df = (
    duplicates_df["LLM Response"]
    .str.split("<sep>", expand=True)
    .reindex(columns=range(len(new_columns)), fill_value="ERROR")
)

# Attach the positional columns to the main frame under their field names.
duplicates_df[new_columns] = split_df

In [None]:
# Preview the last rows to check that the new field columns were filled in.
duplicates_df.tail()

Remove unwanted whitespace (newlines and leading/trailing spaces) from the 'District' column

In [None]:
# Replace embedded newlines with spaces, then trim leading/trailing whitespace,
# so the same district is spelled identically when it is later used as a
# grouping key and as a merge key.
duplicates_df['District'] = duplicates_df['District'].str.replace('\n', ' ', regex=True).str.strip()
# (Disabled) Optional pre-filter keeping only rows that share Publish Date,
# Accident Date and District with at least one other row:
# duplicates_df = duplicates_df[duplicates_df.duplicated(subset=['Publish Date', 'Accident Date', 'District'], keep=False)]
# duplicates_df.head()

# Deduplication Algorithm #

In [None]:
import pandas as pd
import google.generativeai as genai

# Do NOT filter out unique news. Use the complete dataframe.
# duplicates_df = duplicates_df[duplicates_df.duplicated(subset=['Publish Date', 'District'], keep=False)]
# duplicates_df.head()

# Initialize the Gemini API with the API_KEY defined in the first code cell
genai.configure(api_key=API_KEY)

def call_gemini_api(base_title, base_description, candidate_texts):
    """
    Ask Gemini whether each candidate article reports the same accident as the base news.

    Args:
        base_title: Title of the base news article.
        base_description: Description/body of the base news article.
        candidate_texts: Iterable of candidate article texts (title and
            description already combined by the caller).

    Returns:
        A list of booleans, one per candidate and in the same order: True when
        the model judged the candidate to be a duplicate of the base news.
        If the model's reply cannot be read, or does not contain exactly one
        verdict per candidate, every candidate is reported as False
        (non-duplicate) so that no record is discarded on a bad reply.
    """
    import re  # local import: keeps this cell independent of later cells' imports

    candidate_texts = list(candidate_texts)
    if not candidate_texts:
        # Nothing to compare against; skip the API round trip.
        return []

    base_news_text = f"News Title: {base_title}\nNews Description: {base_description}"

    # Build the prompt: base article, instructions, then the numbered candidates.
    numbered_candidates = "".join(
        f"{idx}. {candidate}\n\n"
        for idx, candidate in enumerate(candidate_texts, start=1)
    )
    prompt = (
        f"Base news:\n{base_news_text}\n\n"
        "For each of the following news articles, determine if it reports the same accident "
        "as the base news. Answer 'True' if yes, and 'False' if not.\n\n"
        f"{numbered_candidates}"
    )

    # Create model instance and generate content
    model = genai.GenerativeModel('gemini-2.0-flash-exp')
    response = model.generate_content(prompt)

    # `response.text` raises ValueError when the reply carries no text part
    # (for example a blocked response); treat that like an unparsable answer.
    try:
        answer_text = response.text
    except ValueError as exc:
        print(f"Warning: Gemini returned no usable text ({exc}). Assuming non-duplicates.")
        return [False] * len(candidate_texts)
    print(answer_text)

    # Take the first standalone true/false word on each line, ignoring case.
    # Word boundaries prevent matches inside longer words, and using the first
    # verdict avoids misreading lines that mention both words.
    verdict_pattern = re.compile(r"\b(true|false)\b", re.IGNORECASE)
    answers = []
    for line in answer_text.splitlines():
        verdict = verdict_pattern.search(line)
        if verdict:
            answers.append(verdict.group(1).lower() == "true")

    # Handle mismatched answer counts by assuming non-duplicates if the count is off
    if len(answers) != len(candidate_texts):
        print(f"Warning: Expected {len(candidate_texts)} answers, got {len(answers)}. Assuming non-duplicates.")
        answers = [False] * len(candidate_texts)

    return answers

def process_group(group_df):
    """
    Reduce one group of potentially duplicated news rows to its unique accidents.

    The first remaining row is kept as the "base" article; the LLM is asked
    which of the other remaining rows report the same accident, and those are
    discarded. The process repeats with the next surviving row as the new base
    until no candidates are left.

    Args:
        group_df: DataFrame with at least the 'News Title' and 'Description'
            columns. It is not modified.

    Returns:
        DataFrame with the kept rows in their original relative order, the
        same columns as ``group_df``, and a fresh 0..n-1 index. Rows are taken
        by slicing the input, so the column dtypes of ``group_df`` are
        preserved.

    Raises:
        ValueError: if ``call_gemini_api`` returns a number of flags different
            from the number of candidates it was given.
    """
    candidates = group_df.reset_index(drop=True)

    kept_positions = []                       # positions of rows judged unique
    remaining = list(range(len(candidates)))  # positions not yet classified

    while remaining:
        # The first unclassified row is unique by construction: it already
        # survived the comparison against every previously kept row.
        base_position, remaining = remaining[0], remaining[1:]
        kept_positions.append(base_position)

        if not remaining:
            break  # nothing left to compare, so no API call is needed

        base_row = candidates.iloc[base_position]
        pending = candidates.iloc[remaining]

        # Prepare candidate texts by combining title and description
        candidate_texts = [
            f"News Title: {title}\nNews Description: {description}"
            for title, description in zip(pending['News Title'], pending['Description'])
        ]

        duplicate_flags = call_gemini_api(
            base_row['News Title'], base_row['Description'], candidate_texts
        )
        if len(duplicate_flags) != len(remaining):
            # Pairing flags with the wrong rows would silently drop records.
            raise ValueError(
                f"Expected {len(remaining)} duplicate flags, got {len(duplicate_flags)}"
            )

        # Keep only the candidates the LLM did not flag as duplicates
        remaining = [
            position
            for position, is_duplicate in zip(remaining, duplicate_flags)
            if not is_duplicate
        ]

    return candidates.iloc[kept_positions].reset_index(drop=True)

def _dedupe_within_groups(df, group_keys):
    """
    Run ``process_group`` on every group of ``df`` formed by ``group_keys``.

    Rows whose key columns contain missing values are kept and grouped
    together (``dropna=False``); with pandas' default they would be silently
    excluded from the result.

    Args:
        df: DataFrame containing the ``group_keys`` columns.
        group_keys: List of column names to group by.

    Returns:
        DataFrame of the concatenated per-group results with a fresh index,
        or an empty DataFrame with ``df``'s columns when there is nothing to
        process.
    """
    if df.empty:
        return pd.DataFrame(columns=df.columns)

    unique_groups = [
        process_group(group)
        for _, group in df.groupby(group_keys, dropna=False)
    ]

    # If no groups were processed, return an empty dataframe with the same columns
    if not unique_groups:
        return pd.DataFrame(columns=df.columns)

    return pd.concat(unique_groups, ignore_index=True)


def process_accidents(df):
    """Deduplicate news rows that share the same 'District' and 'Accident Date'."""
    return _dedupe_within_groups(df, ['District', 'Accident Date'])


def process_accidents_publish_date(df):
    """Deduplicate news rows that share the same 'District' and 'Publish Date'."""
    return _dedupe_within_groups(df, ['District', 'Publish Date'])

# duplicates_df is expected to hold both duplicate and unique news records.
if len(duplicates_df) > 0:
    df = duplicates_df

    # Two passes: first deduplicate within (District, Accident Date) groups,
    # then deduplicate the survivors within (District, Publish Date) groups.
    df_unique_temp = process_accidents(df)
    df_unique = process_accidents_publish_date(df_unique_temp)

    print("Unique accident news:")
    print(df_unique.head(10))
else:
    # Nothing to deduplicate. Still define df_unique so the cells that follow
    # do not fail with a NameError.
    df_unique = duplicates_df.copy()


`df_unique` now contains only the unique entries, with the duplicates removed.

In [None]:
# Preview the first ten deduplicated records.
df_unique.head(10)

# Simple preprocessing and Heatmap Generation #

In [None]:
# Turn the casualty columns into integers. Entries that cannot be parsed as
# numbers become NaN via errors='coerce' and are then counted as 0.
for casualty_column in ('Killed', 'Injured'):
    numeric_values = pd.to_numeric(df_unique[casualty_column], errors='coerce')
    df_unique[casualty_column] = numeric_values.fillna(0).astype(int)

In [None]:
# !pip install geopandas

In [None]:
import geopandas as gpd

# Read the GeoJSON file into a GeoDataFrame.
# The path is relative to the working directory; judging by the file name it
# presumably holds the boundaries of Bangladesh's 64 districts (verify).
gdf = gpd.read_file('Duplicate_removal/Bangladesh_GeoJSON/bangladesh_geojson_adm2_64_districts_zillas.json')

# Display the first few rows of the GeoDataFrame
print(gdf.head())

# (Optional) Quick plot to visually inspect the boundaries
gdf.plot(figsize=(10, 8))

In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

# === 1. Count accident records per district ===
# To try this cell without real data, uncomment the sample frame below.
# data = {
#     'District': [
#         'Bagerhat', 'Bandarban', 'Barguna', 'Barisal', 'Bhola',
#         'Bagerhat', 'Barguna', 'Bandarban', 'Barisal', 'Bagerhat'
#     ]
# }
# df_unique = pd.DataFrame(data)

df_counts = df_unique['District'].value_counts().reset_index()
df_counts.columns = ['District', 'count']
print("District counts:\n", df_counts)

# === 2. Load the district boundaries ===
gdf = gpd.read_file('Duplicate_removal/Bangladesh_GeoJSON/bangladesh_geojson_adm2_64_districts_zillas.json')

# === 3. Attach the counts to the boundaries ===
# Left join keeps every boundary row; districts without any record get 0.
merged = gdf.merge(df_counts, left_on='ADM2_EN', right_on='District', how='left')
merged['count'] = merged['count'].fillna(0)

# === 4. Draw the choropleth ===
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
choropleth_options = dict(
    column='count',
    cmap='OrRd',
    linewidth=0.8,
    ax=ax,
    edgecolor='0.8',
    legend=True,
)
merged.plot(**choropleth_options)
ax.set_title('Heatmap of Districtwise Accident Occurrence in Bangladesh', fontsize=15)
ax.set_axis_off()

# === 5. Label every district ===
# representative_point() is used so the label anchor lies inside the polygon.
for district_name, shape in zip(merged['ADM2_EN'], merged['geometry']):
    anchor = shape.representative_point()
    ax.annotate(
        district_name,
        (anchor.x, anchor.y),
        horizontalalignment='center',
        fontsize=6,
        color='black',
    )

# === 6. Show the figure ===
plt.show()


In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

# -------------------------------
# Step 1: Load Your Data
# -------------------------------
# Sample DataFrame with district-wise fatalities and injuries
# data = {
#     'District': ['Bagerhat', 'Bandarban', 'Barguna', 'Barisal', 'Bhola', 
#                  'Bagerhat', 'Barguna', 'Bandarban', 'Barisal', 'Bagerhat'],
#     'Killed':   [5, 3, 8, 2, 6, 7, 4, 1, 3, 9],
#     'Injured':  [10, 5, 15, 8, 12, 14, 6, 3, 7, 11]
# }
# df_unique = pd.DataFrame(data)

# Aggregate the total 'Killed' and 'Injured' counts for each district.
# Only these two columns are summed: summing the whole frame would also
# "add up" the text columns (string concatenation, or a TypeError when a
# column mixes strings with missing values).
df_counts = df_unique.groupby('District')[['Killed', 'Injured']].sum().reset_index()

print("Aggregated district data:\n", df_counts)

# -------------------------------
# Step 2: Load the GeoJSON File
# -------------------------------
gdf = gpd.read_file('Duplicate_removal/Bangladesh_GeoJSON/bangladesh_geojson_adm2_64_districts_zillas.json')  # Update path

# -------------------------------
# Step 3: Merge Data with GeoDataFrame
# -------------------------------
# Left join keeps every boundary row; districts without records get 0.
merged = gdf.merge(df_counts, left_on='ADM2_EN', right_on='District', how='left')
merged[['Killed', 'Injured']] = merged[['Killed', 'Injured']].fillna(0)  # Fill missing values with 0

# -------------------------------
# Step 4: Define a Function to Plot Heatmaps
# -------------------------------
def plot_heatmap(geo_df, column, title, cmap):
    """
    Draw and show a labelled choropleth of ``geo_df`` coloured by ``column``.

    Args:
        geo_df: GeoDataFrame with 'geometry' and 'ADM2_EN' columns plus the
            column to plot.
        column: Name of the column whose values drive the colouring.
        title: Title shown above the map.
        cmap: Matplotlib colormap name passed through to the plot.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    figure, axes = plt.subplots(1, 1, figsize=(12, 8))

    choropleth_options = dict(
        column=column,
        cmap=cmap,
        linewidth=0.8,
        ax=axes,
        edgecolor='0.8',
        legend=True,
    )
    geo_df.plot(**choropleth_options)

    axes.set_title(title, fontsize=15)
    axes.set_axis_off()

    # Write each district name at a point taken from its polygon.
    for district_name, shape in zip(geo_df['ADM2_EN'], geo_df['geometry']):
        anchor = shape.representative_point()
        axes.annotate(
            district_name,
            (anchor.x, anchor.y),
            horizontalalignment='center',
            fontsize=8,
            color='black',
        )

    plt.show()

# -------------------------------
# Step 5: Generate Two Heatmaps
# -------------------------------
# One map per casualty column: (column, title, colormap).
heatmap_specs = (
    ('Killed', 'Heatmap of Fatalities in Bangladesh', 'Reds'),
    ('Injured', 'Heatmap of Injuries in Bangladesh', 'Greens'),
)
for value_column, map_title, colour_map in heatmap_specs:
    plot_heatmap(merged, value_column, map_title, cmap=colour_map)


In [None]:
import pandas as pd
import google.generativeai as genai
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Initialize Google GenAI with the API_KEY defined in the first code cell.
# SECURITY: never commit a literal API key in the notebook. Any key that was
# ever committed in this cell should be treated as leaked and revoked.
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel("gemini-2.0-flash")

def normalize_time(time_str):
    """
    Convert a free-form time description to 'HH:MM' using the Gemini model.

    Args:
        time_str: Raw time value (e.g. "7:00 AM", "Afternoon"). Missing values
            and non-string values are accepted.

    Returns:
        The first 'H:MM'/'HH:MM' token found in the model's reply, or None
        when the input is missing, blank, a known placeholder, or the model's
        reply contains no usable time.
    """
    if pd.isna(time_str):
        return None

    # Normalise before comparing: values may be non-strings or carry
    # surrounding whitespace left over from the column split.
    text = str(time_str).strip()
    # "error" matches the "ERROR" placeholder used when the response is split.
    placeholders = {"not mentioned", "unknown", "n/a", "-", "error"}
    if not text or text.lower() in placeholders:
        return None

    prompt = f"Convert the following time description into a 24-hour format (HH:MM). Provide only the time in HH:MM format and nothing else: {text}"
    response = model.generate_content(prompt)

    # `response.text` raises ValueError when the reply has no text part
    # (for example a blocked response); treat that as "time not recognised".
    try:
        reply = response.text
    except ValueError:
        return None

    extracted_time = re.search(r'\b\d{1,2}:\d{2}\b', reply)
    return extracted_time.group(0) if extracted_time else None

# # Sample DataFrame with mixed time formats
# data = {'Time of Accident': ["7:00 AM", "3:00 PM", "Afternoon", "morning", "11:45 PM", "Noon", "Midnight", "not mentioned", None]}
# df_unique = pd.DataFrame(data)

# Normalize time using GenAI (normalize_time is called once per row)
df_unique['Normalized Time'] = df_unique['Time of Accident'].apply(normalize_time)

# Remove rows with missing or unrecognized time.
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered slice (avoids pandas' SettingWithCopyWarning).
df_unique = df_unique.dropna(subset=['Normalized Time']).copy()

# Convert to datetime format and keep only the hour; unparsable values become NaN
df_unique['Hour'] = pd.to_datetime(df_unique['Normalized Time'], format='%H:%M', errors='coerce').dt.hour

# Remove NaN values caused by conversion errors
df_unique = df_unique.dropna(subset=['Hour']).copy()

# Convert hour to integer
df_unique['Hour'] = df_unique['Hour'].astype(int)

# Create hourly bins (labels "00:00-01:00" ... "23:00-24:00") and count the
# accidents per hour; hours without any accident get 0.
time_bins = [f"{i:02d}:00-{i+1:02d}:00" for i in range(24)]
hourly_counts = df_unique['Hour'].value_counts().reindex(range(24), fill_value=0)

# Plot histogram
plt.figure(figsize=(12, 6))
plt.bar(time_bins, hourly_counts.values, color='royalblue', alpha=0.7)
plt.xlabel("Time of Day (Hourly Intervals)")
plt.ylabel("Number of Accidents")
plt.title("Temporal Distribution of Accidents")
plt.xticks(rotation=90)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Preview the records that kept a recognised time, including the new
# 'Normalized Time' and 'Hour' columns.
df_unique.head()