In [12]:
"""
AI Search Prompt for Dallas County Law-Enforcement & True-Crime Stories
Author: Sripriya Sharma
Runtime: ~45 seconds (Google/Bing News Aggregation)
"""

'\nAI Search Prompt for Dallas County Law-Enforcement & True-Crime Stories\nAuthor: Sripriya Sharma\nRuntime: ~45 seconds (Google/Bing News Aggregation)\n'

In [13]:
!pip install serpapi pandas python-dateutil tqdm beautifulsoup4 requests lxml




In [1]:
!pip install --upgrade --force-reinstall google-search-results


Collecting google-search-results
  Using cached google_search_results-2.4.2-py3-none-any.whl
Collecting requests (from google-search-results)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting charset_normalizer<4,>=2 (from requests->google-search-results)
  Using cached charset_normalizer-3.4.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (37 kB)
Collecting idna<4,>=2.5 (from requests->google-search-results)
  Using cached idna-3.11-py3-none-any.whl.metadata (8.4 kB)
Collecting urllib3<3,>=1.21.1 (from requests->google-search-results)
  Using cached urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests->google-search-results)
  Using cached certifi-2025.10.5-py3-none-any.whl.metadata (2.5 kB)
Using cached requests-2.32.5-py3-none-any.whl (64 kB)
Using cached certifi-2025.10.5-py3-none-any.whl (163 kB)
Using cached charset_normalizer-3.4.4-cp312-cp312-manylinux2014_x86_64.many

In [2]:
from serpapi import GoogleSearch


In [4]:
from serpapi import GoogleSearch
print("SerpApi successfully imported ")


SerpApi successfully imported 


In [5]:
# Imports and keyword lists
import os
import re
import time
from datetime import datetime
from dateutil import parser as dateparser
import pandas as pd
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup

# FILTER-IN keywords (lowercase for checking)
FILTER_IN = [
    # Law-Enforcement Anchors
    "police","officer","officers","sheriff","deputy","deputies","constable","trooper",
    "troopers","state trooper","highway patrol","patrol officer","law enforcement","cop","cops",
    "detective","investigator","sergeant","lieutenant","captain","chief of police","police chief",
    "deputy chief","command staff","special agent","federal agent","peace officer","arrest",
    "arrested","arrests","arresting",
    # Body-Cam / Incident Extensions
    "body cam","bodycam","body-worn","body worn","bwc","critical incident","critical-incident",
    "dash cam","dashcam","officer-involved","ois","traffic stop","pursuit","police chase",
    "high-speed chase","high speed chase","standoff","foot chase","use of force","department release",
    "released footage","released video","surveillance video","cctv","ring camera","doorbell camera",
    # Context crimes/arrest scenarios
    "dui arrest","drunk driving","dui","dwi","domestic violence","shoplifting","reckless driving",
    "assault arrest","theft suspect","burglary suspect","disturbance call","trespassing","resisting arrest",
    "disorderly conduct","hit-and-run","tasing incident","pepper spray","protest arrest","crowd control",
    "fight outside bar","fleeing suspect","drug trafficking","fraud scheme","scam arrest"
]

# FILTER-OUT phrases (if any of these appear -> drop)
FILTER_OUT = [
    "fatal shooting","deadly shooting","shooting death","shooting incident","gunfire death",
    "fatal gunfire","deadly gunfire","gunfire exchange","gun battle","gunfight","shootout",
    "deadly shootout","fatal shootout","police shootout","fatal encounter","deadly encounter",
    "officer-involved shooting","officer involved shooting","fatal police encounter",
    "police-involved shooting","fatal officer-involved incident","fatal officer involved incident",
    "shot and killed","was shot and killed"
]

# make all lowercase
FILTER_IN = [s.lower() for s in FILTER_IN]
FILTER_OUT = [s.lower() for s in FILTER_OUT]


In [6]:
# User inputs
print("Enter date range for the search (YYYY-MM-DD). Use ISO format to avoid ambiguity.")
start_date = input("Start date (YYYY-MM-DD): ").strip()
end_date = input("End date (YYYY-MM-DD): ").strip()

# Validate dates
try:
    sd = datetime.strptime(start_date, "%Y-%m-%d")
    ed = datetime.strptime(end_date, "%Y-%m-%d")
    if ed < sd:
        raise ValueError("End date must be >= start date.")
except Exception as e:
    raise SystemExit(f"Date parse error: {e}")

# SerpApi key
SERPAPI_KEY = input("Enter your SerpApi API key (or press Enter to run in SAMPLE MODE): ").strip()

# Local / Regional outlets — used for query scoping (site: domain)
SOURCES = [
    "dallasnews.com","nbcdfw.com","irvingjournal.com","gptx.org","mesquitegazette.com",
    "dallasobserver.com","dallasexpress.com","aldiadallas.com","dallasweekly.com",
    "dallasfreepress.com","dallasexaminer.com","bizjournals.com/dallas","focusdailynews.com",
    "starlocalmedia.com","wfaa.com","fox4news.com","cbsnews.com/texas","spectrumlocalnews.com/tx/dallas-fort-worth",
    "dmagazine.com"
]


Enter date range for the search (YYYY-MM-DD). Use ISO format to avoid ambiguity.
Start date (YYYY-MM-DD): 2025-01-01
End date (YYYY-MM-DD): 2025-10-01
Enter your SerpApi API key (or press Enter to run in SAMPLE MODE): 9626e070aa619fe245cdd0e3c578bba2d84d35ce7fbaafe80504a21efb8d665d


In [7]:
# Helper functions

def normalize_text(s):
    if not s:
        return ""
    return re.sub(r'\s+', ' ', s).strip().lower()

def contains_filter_in(text):
    t = normalize_text(text)
    return any(k in t for k in FILTER_IN)

def contains_filter_out(text):
    t = normalize_text(text)
    return any(k in t for k in FILTER_OUT)

def article_is_valid(title, snippet, body_text):

    big_text = " ".join([title or "", snippet or "", body_text or ""]).lower()
    if not contains_filter_in(big_text):
        return False
    if contains_filter_out(big_text):
        return False
    return True

def dedupe_articles(df):
    # primary dedupe by URL, then by (title + source)
    df = df.drop_duplicates(subset=["Direct URL"])
    # additional dedupe by normalized title+source
    df["title_src_norm"] = df["Article Title"].fillna("").str.lower().str.strip() + " || " + df["Source/Outlet"].fillna("").str.lower()
    df = df.drop_duplicates(subset=["title_src_norm"])
    df = df.drop(columns=["title_src_norm"])
    return df.reset_index(drop=True)


In [8]:
# Live fetch via SerpApi. Requires SERPAPI_KEY.
if SERPAPI_KEY:
    from serpapi.google_search import GoogleSearch

    print("Running live SerpApi queries. This may take 10-60s depending on date range and rate limits.")
    ...


    # prepare site scoping as OR of site:domain terms
    site_scope = " OR ".join([f"site:{s}" for s in SOURCES])

    # prepare filter_in query (OR of short anchor keywords to keep query compact)
    # Using a subset for query efficiency; we still filter more strictly after fetch.
    query_in_short = " OR ".join([
        "police","officer","sheriff","deputy","arrest","body cam","traffic stop","pursuit","police chase",
        "use of force","dui","domestic violence","shoplifting","burglary","hit-and-run","drug trafficking"
    ])

    # SerpApi date filtering: use 'from' and 'to' in query string (Google News supports after: before:)
    q = f"({site_scope}) ({query_in_short}) after:{start_date} before:{end_date}"

    params = {
        "engine": "google_news",
        "q": q,
        "api_key": SERPAPI_KEY,
        "num": 10,   # results per request (SerpApi may limit). We'll paginate.
    }

    all_articles = []
    start = 0
    max_pages = 12  # safety cap: at most 12*10 = 120 results; increase if needed and permitted by your plan
    for page in range(max_pages):
        params["start"] = start
        try:
            search = GoogleSearch(params)
            res = search.get_dict()
        except Exception as e:
            print("SerpApi request error:", e)
            break

        news_results = res.get("news_results") or res.get("news") or []
        if not news_results:
            # if nothing returned, try breaking (no more pages)
            break

        for item in news_results:
            title = item.get("title") or item.get("headline") or ""
            link = item.get("link") or item.get("source_url") or ""
            source = item.get("source", {}).get("name") if isinstance(item.get("source"), dict) else item.get("source") or item.get("publisher") or ""
            date = item.get("date") or item.get("published") or item.get("snippet_date") or ""
            snippet = item.get("snippet") or item.get("description") or ""
            # append raw; we'll fetch body optionally below
            all_articles.append({
                "Article Title": title,
                "Publication Date": date,
                "Source/Outlet": source,
                "Direct URL": link,
                "Snippet": snippet
            })

        # prepare next page
        start += params.get("num", 10)
        # if results fewer than requested, break early
        if len(news_results) < params.get("num", 10):
            break
        time.sleep(1)  # gentle rate-limiting

    print(f"Fetched {len(all_articles)} candidate articles from SerpApi.")
else:
    # SAMPLE MODE: no SerpApi key provided. Create a small sample dataset to test the filtering/deduping pipeline.
    print("SAMPLE MODE: No SerpApi key provided. Creating a small sample dataset for testing.")
    sample = [
        {
            "Article Title": "Dallas police arrest burglary suspect after doorbell camera video",
            "Publication Date": "2025-10-10",
            "Source/Outlet": "Mesquite Gazette",
            "Direct URL": "https://mesquitegazette.com/article/doorbell-camera-burglary-arrest",
            "Snippet": "Officers arrested the suspect after reviewing doorbell camera footage."
        },
        {
            "Article Title": "Downtown protest: 3 protest arrest after crowd control measures",
            "Publication Date": "2025-10-08",
            "Source/Outlet": "Dallas Observer",
            "Direct URL": "https://dallasobserver.com/article/protest-arrests",
            "Snippet": "Police arrested several in crowd control operation."
        },
        {
            "Article Title": "Community meeting about a deadly shooting last month",
            "Publication Date": "2025-10-06",
            "Source/Outlet": "Local News",
            "Direct URL": "https://localnews.com/article/deadly-shooting-community",
            "Snippet": "A fatal shooting is under review by investigators."
        },
        {
            "Article Title": "Small business profile: owner talks shop",
            "Publication Date": "2025-10-09",
            "Source/Outlet": "D Magazine",
            "Direct URL": "https://dmagazine.com/article/business-profile",
            "Snippet": "Profile about a business owner; nothing to do with police."
        }
    ]
    all_articles = sample


Running live SerpApi queries. This may take 10-60s depending on date range and rate limits.
Fetched 1200 candidate articles from SerpApi.


In [9]:
# Attempt to fetch body text for each article to improve filtering precision.
# This is optional and kept lightweight: we'll attempt a short HTML fetch and extract text from <p> tags.
def try_fetch_body_text(url, max_chars=8000):
    try:
        resp = requests.get(url, timeout=8, headers={"User-Agent":"Mozilla/5.0"})
        if resp.status_code != 200:
            return ""
        soup = BeautifulSoup(resp.text, "lxml")
        # gather visible paragraphs
        paragraphs = [p.get_text(separator=" ", strip=True) for p in soup.find_all("p")]
        text = " ".join(paragraphs)
        return text[:max_chars]
    except Exception:
        return ""

# Only fetch bodies for live mode (and keep it optional small to avoid long runs)
fetch_bodies = True if SERPAPI_KEY else False

if fetch_bodies:
    print("Fetching article bodies (may add time).")
    for art in tqdm(all_articles):
        if not art.get("Direct URL"):
            art["Body"] = ""
            continue
        art["Body"] = try_fetch_body_text(art["Direct URL"])
else:
    for art in all_articles:
        art["Body"] = ""


Fetching article bodies (may add time).


100%|██████████| 1200/1200 [07:09<00:00,  2.79it/s]


In [10]:
# Build DataFrame and apply filtering
rows = []
for art in all_articles:
    title = art.get("Article Title") or ""
    snippet = art.get("Snippet") or ""
    body = art.get("Body") or ""
    source = art.get("Source/Outlet") or ""
    url = art.get("Direct URL") or ""
    pubdate = art.get("Publication Date") or ""

    if article_is_valid(title, snippet, body):
        # Normalize publication date into ISO if possible
        parsed_date = ""
        try:
            parsed_date = dateparser.parse(pubdate).strftime("%Y-%m-%d")
        except Exception:
            # if SerpApi provides epoch or other format, keep raw
            parsed_date = pubdate

        rows.append({
            "Article Title": title.strip(),
            "Publication Date": parsed_date,
            "Source/Outlet": source.strip(),
            "Direct URL": url.strip()
        })

df = pd.DataFrame(rows, columns=["Article Title","Publication Date","Source/Outlet","Direct URL"])
print(f"After filtering: {len(df)} articles retained.")


After filtering: 432 articles retained.


In [11]:
# Deduplicate and save CSV
df_clean = dedupe_articles(df)
print(f"After deduplication: {len(df_clean)} articles.")

# Show top rows
if not df_clean.empty:
    display(df_clean.head(20))
else:
    print("No articles matched the filter criteria in this run.")

# Save CSV
out_fname = f"Dallas_County_LawEnforcement_{start_date}_to_{end_date}.csv"
df_clean.to_csv(out_fname, index=False)
print("Saved CSV to:", out_fname)

# Provide a simple download link in Colab environment
from google.colab import files
try:
    files.download(out_fname)
except Exception:
    print("files.download not available in this environment; you can download the file from the left 'Files' pane.")


After deduplication: 36 articles.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["title_src_norm"] = df["Article Title"].fillna("").str.lower().str.strip() + " || " + df["Source/Outlet"].fillna("").str.lower()


Unnamed: 0,Article Title,Publication Date,Source/Outlet,Direct URL
0,2 dead after twin-engine plane crashes into tr...,"10/12/2025, 08:50 PM, +0000 UTC",NBC 5 Dallas-Fort Worth,https://www.nbcdfw.com/news/local/plane-crashe...
1,Fort Worth could soon restrict where registere...,"10/14/2025, 10:15 PM, +0000 UTC",NBC 5 Dallas-Fort Worth,https://www.nbcdfw.com/news/local/fort-worth-r...
2,What are the propositions to amend the Texas C...,"10/13/2025, 10:03 PM, +0000 UTC",NBC 5 Dallas-Fort Worth,https://www.nbcdfw.com/news/politics/lone-star...
3,"Feeling hopeless in custody, many drop claims ...","10/12/2025, 07:08 PM, +0000 UTC",NBC 5 Dallas-Fort Worth,https://www.nbcdfw.com/news/national-internati...
4,Mother calls for consequences for teacher char...,"10/15/2025, 02:37 AM, +0000 UTC",NBC 5 Dallas-Fort Worth,https://www.nbcdfw.com/news/local/teacher-char...
5,Woman facing manslaughter charge in kayaker's ...,"10/13/2025, 06:54 PM, +0000 UTC",NBC 5 Dallas-Fort Worth,https://www.nbcdfw.com/news/local/woman-facing...
6,"Lake Worth Police search for dog, owner after ...","10/11/2025, 02:27 AM, +0000 UTC",NBC 5 Dallas-Fort Worth,https://www.nbcdfw.com/news/local/lake-worth-p...
7,"Family, friends remember Dallas security guard...","10/14/2025, 02:50 AM, +0000 UTC",NBC 5 Dallas-Fort Worth,https://www.nbcdfw.com/news/local/dallas-secur...
8,Tarrant County Sheriff's Office work detail pr...,"10/12/2025, 10:45 PM, +0000 UTC",NBC 5 Dallas-Fort Worth,https://www.nbcdfw.com/news/local/tarrant-coun...
9,Brake-checking a tailgater led to deadly road ...,"10/11/2025, 12:46 AM, +0000 UTC",NBC 5 Dallas-Fort Worth,https://www.nbcdfw.com/news/local/frisco-warra...


Saved CSV to: Dallas_County_LawEnforcement_2025-01-01_to_2025-10-01.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
# Quick stats: counts by source
if not df_clean.empty:
    print("Top sources by count:")
    print(df_clean["Source/Outlet"].value_counts().head(20))
    # Quick check: any articles containing FILTER_OUT (should be zero)
    contains_out = df_clean["Article Title"].str.lower().apply(lambda t: any(k in t for k in FILTER_OUT))
    if contains_out.any():
        print("WARNING: Some retained articles contain FILTER_OUT phrases. Check rows:")
        display(df_clean[contains_out])
    else:
        print("FILTER_OUT check passed: no retained article contains disallowed phrases.")
else:
    print("No retained articles to summarize.")


Top sources by count:
Source/Outlet
NBC 5 Dallas-Fort Worth    22
Dallas News                14
Name: count, dtype: int64
FILTER_OUT check passed: no retained article contains disallowed phrases.
