In [1]:
!pip install mlflow ipywidgets transformers huggingface_hub --quiet
from google.colab import output
output.enable_custom_widget_manager()


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.2/28.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.0/85.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m692.3/692.3 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:

import calendar
import os
import re
import time
import warnings

import ipywidgets as widgets
import mlflow
import pandas as pd
from huggingface_hub import hf_hub_download
from IPython.display import display
from transformers import pipeline
warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)

# Set MLflow remote URI. The MLFLOW_TRACKING_URI environment variable, when set
# to a non-empty value, overrides the default server so the notebook can be
# pointed at another tracking server without editing this cell.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI") or "http://3.145.180.106:5000")
# Every chatbot run is recorded under this experiment name.
mlflow.set_experiment("Cincinnati Crime Chatbot")

def load_crime_data():
    """Download the calls-for-service CSV and return a trimmed DataFrame.

    The file ``calls_for_service_latest.csv`` is fetched from the
    ``mlsystemsg1/cincinnati-crime-data`` dataset repository with
    ``hf_hub_download``. Column names are lower-cased, only the columns the
    chatbot uses are kept, rows without a neighborhood are dropped, and
    ``create_time_incident`` is parsed into datetimes.

    Returns:
        pandas.DataFrame with the columns ``create_time_incident``,
        ``incident_type_desc``, ``event_number``, ``sna_neighborhood`` and
        ``priority`` (original row index preserved). Timestamps that cannot
        be parsed become ``NaT``.

    Raises:
        KeyError: if the CSV lacks any of the expected columns.
    """
    keep_cols = ['create_time_incident', 'incident_type_desc', 'event_number', 'sna_neighborhood', 'priority']
    csv_path = hf_hub_download(
        repo_id="mlsystemsg1/cincinnati-crime-data",
        repo_type="dataset",
        filename="calls_for_service_latest.csv"
    )
    raw = pd.read_csv(csv_path, low_memory=False)
    raw.columns = [col.lower() for col in raw.columns]

    # Fail with a message that names the missing columns instead of pandas'
    # generic indexing error.
    missing = [col for col in keep_cols if col not in raw.columns]
    if missing:
        raise KeyError(f"Crime data CSV is missing expected column(s): {missing}")

    # Build a fresh frame instead of mutating a column subset in place, which
    # is what triggers pandas' chained-assignment warnings.
    crime = raw[keep_cols].dropna(subset=['sna_neighborhood']).copy()
    crime['create_time_incident'] = pd.to_datetime(crime['create_time_incident'], errors='coerce')
    return crime

def get_relevant_rows(question, df):
    """Select the rows of ``df`` that a free-text question refers to.

    The lower-cased question is scanned for, in order:

    * a neighborhood name from ``sna_neighborhood``,
    * an incident type from ``incident_type_desc``,
    * an exact date written as ``YYYY-MM-DD`` or ``YYYY/MM/DD``,
    * an English month name,
    * a four-digit year starting with ``20``.

    Every criterion found narrows the selection. An exact date short-circuits
    the month and year checks (and the sorting step below).

    Args:
        question: the user's question.
        df: DataFrame with ``sna_neighborhood``, ``incident_type_desc`` and a
            datetime ``create_time_incident`` column.

    Returns:
        A new DataFrame holding the matching rows. When the question contains
        "latest" or "recent" (or "last offense") the rows are sorted newest
        first. A date that does not exist on the calendar yields an empty
        frame.
    """
    q = question.lower()
    selected = pd.Series(True, index=df.index)

    def longest_mention(column):
        # Of all values of `column` that occur in the question, return the
        # longest one (lower-cased), so "east walnut hills" wins over
        # "walnut hills". Blank and non-string values are skipped because an
        # empty string is a substring of every question.
        best = None
        for value in df[column].dropna().unique():
            if not isinstance(value, str) or not value.strip():
                continue
            lowered = value.lower()
            if lowered in q and (best is None or len(lowered) > len(best)):
                best = lowered
        return best

    # Neighborhood match
    hood = longest_mention('sna_neighborhood')
    if hood is not None:
        selected = selected & (df['sna_neighborhood'].str.lower() == hood)

    # Offense match (substring on purpose, so "thefts" still finds "theft")
    offense = longest_mention('incident_type_desc')
    if offense is not None:
        selected = selected & (df['incident_type_desc'].str.lower() == offense)

    # Exact date (YYYY-MM-DD or YYYY/MM/DD)
    date_match = re.search(r"\b(20\d{2})[-/](\d{1,2})[-/](\d{1,2})\b", q)
    if date_match:
        y, m, d = map(int, date_match.groups())
        try:
            day = pd.Timestamp(year=y, month=m, day=d)
        except ValueError:
            # e.g. 2020-02-31: no record can fall on a non-existent date.
            return df.iloc[0:0].copy()
        return df[selected & (df['create_time_incident'].dt.normalize() == day)]

    # Month match: whole words only, so "mayor" is not read as "may"
    for num, name in enumerate(calendar.month_name):
        if name and re.search(rf"\b{re.escape(name.lower())}\b", q):
            selected = selected & (df['create_time_incident'].dt.month == num)
            break

    # Year match
    year_match = re.search(r"\b(20\d{2})\b", q)
    if year_match:
        selected = selected & (df['create_time_incident'].dt.year == int(year_match.group(1)))

    filtered = df[selected]

    # Sort if asking for most recent/latest offense
    if any(x in q for x in ["latest", "most recent", "last offense", "recent"]):
        filtered = filtered.sort_values(by='create_time_incident', ascending=False)

    return filtered


def generate_summary(question, df):
    """Turn the matched rows into the text context handed to the language model.

    Args:
        question: the user's question; it decides between a count and a listing.
        df: the rows selected for the question.

    Returns:
        * ``"No matching records found."`` when ``df`` is empty;
        * a one-sentence count when the question asks "how many", for a
          "count"/"counts", or for the "number of" incidents;
        * otherwise one sentence per row for the first 10 rows, joined by
          newlines.
    """
    if df.empty:
        return "No matching records found."
    # Whole-word match so that e.g. "county" is not mistaken for a count request.
    if re.search(r"\b(?:how many|counts?|number of)\b", question.lower()):
        return f"There were {len(df)} incidents matching your query."
    examples = []
    for _, row in df.head(10).iterrows():
        date = row['create_time_incident']
        # Unparseable timestamps are stored as NaT; do not print "On NaT".
        date_text = "an unknown date" if pd.isna(date) else str(date.date())
        offense = row['incident_type_desc']
        hood = row['sna_neighborhood']
        incident = row['event_number']
        priority = row.get('priority', 'N/A')
        # A missing priority value gets the same placeholder as a missing column.
        if pd.isna(priority):
            priority = 'N/A'
        examples.append(f"On {date_text}, a {offense} (Priority {priority}) occurred in {hood} (Incident #{incident}).")
    return "\n".join(examples)

def answer_with_llm(question, data_rows, model, model_name="google/flan-t5-small", prompt_version="v5"):
    """Answer a question from the matched rows and record the exchange in MLflow.

    Args:
        question: the user's question.
        data_rows: DataFrame of rows selected for the question.
        model: callable invoked as ``model(prompt, max_new_tokens=150)``; the
            first item of its result must carry a ``'generated_text'`` entry.
        model_name: label logged as the ``model_name`` parameter. It is not
            used to load anything here.
        prompt_version: label logged as the ``prompt_version`` parameter.

    Returns:
        The stripped generated text, or a fixed apology when ``data_rows`` is
        empty (in which case nothing is logged and the model is not called).
    """
    if data_rows.empty:
        return "Sorry, I couldn't find any data matching that question."
    context = generate_summary(question, data_rows)
    prompt = f"""
You are a helpful assistant analyzing Cincinnati crime data.

Here are some relevant data points:
{context}

Now answer this question based on the above:
{question}
    """.strip()
    match_count = len(data_rows)
    with mlflow.start_run():
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("question", question)
        mlflow.log_param("num_rows_context", match_count)
        mlflow.log_param("prompt_version", prompt_version)
        mlflow.log_text(prompt, "prompt.txt")
        # Time only the model call. Starting the clock before start_run() would
        # fold the tracking-server calls above into "latency_sec".
        # perf_counter() is the clock intended for measuring short intervals.
        started = time.perf_counter()
        result = model(prompt, max_new_tokens=150)[0]['generated_text']
        latency = time.perf_counter() - started
        answer = result.strip()
        mlflow.log_text(answer, "answer.txt")
        mlflow.log_metric("response_length", len(answer))
        mlflow.log_metric("latency_sec", latency)
        mlflow.log_metric("match_count", match_count)
        return answer

def run_single_chatbot_turn(df, model, question):
    """Produce the bot's reply to one question.

    Args:
        df: the crime DataFrame to search.
        model: the text-generation callable passed on to ``answer_with_llm``.
        question: raw text typed by the user.

    Returns:
        ``"Goodbye!"`` for ``exit``/``quit`` (any case, surrounding whitespace
        ignored), a short prompt when the question is blank, otherwise the
        answer produced by ``answer_with_llm`` for the rows chosen by
        ``get_relevant_rows``.
    """
    cleaned = question.strip()
    # A blank question would otherwise select every row and trigger a model call.
    if not cleaned:
        return "Please enter a question."
    if cleaned.lower() in ('exit', 'quit'):
        return "Goodbye!"
    rows = get_relevant_rows(cleaned, df)
    return answer_with_llm(cleaned, rows, model)

def ask_question_with_widget(df, model):
    """Display a text box that answers crime questions below it.

    Args:
        df: the crime DataFrame handed to ``run_single_chatbot_turn``.
        model: the text-generation callable handed to ``run_single_chatbot_turn``.

    The reply to each submitted question replaces the previous one in the
    output area. Blank submissions only clear the output area.
    """
    input_box = widgets.Text(
        placeholder='Ask a question about crime in Cincinnati...',
        description='Question:',
        layout=widgets.Layout(width='100%'),
        # Update `value` on submission (Enter / leaving the field) rather than
        # on every keystroke; otherwise each typed character would run the
        # model and be logged as its own question.
        continuous_update=False,
    )
    answer_area = widgets.Output()

    def on_submit(change):
        answer_area.clear_output()
        question = change['new']
        if not question.strip():
            return
        with answer_area:
            print("Bot:", run_single_chatbot_turn(df, model, question))

    # The handler observes `value`, so it runs only when the text changes.
    input_box.observe(on_submit, names='value')
    display(input_box, answer_area)

# Launch everything: load the data, build the text2text pipeline, then show
# the question box. `df` and `llm_model` are left at notebook scope.
print("Loading dataset and model...")
df = load_crime_data()
print("Columns in dataset:", list(df.columns))
llm_model = pipeline(task="text2text-generation", model="google/flan-t5-small")
print("\n✅ Chatbot is ready! Ask your question below:")
ask_question_with_widget(df=df, model=llm_model)


Loading dataset and model...
Columns in dataset: ['create_time_incident', 'incident_type_desc', 'event_number', 'sna_neighborhood', 'priority']


Device set to use cpu



✅ Chatbot is ready! Ask your question below:


Text(value='', description='Question:', layout=Layout(width='100%'), placeholder='Ask a question about crime i…

Output()

In [6]:
# Quick checks on the parsed incident timestamps: the newest value, the oldest
# value, and how many entries are missing. Each heading is printed on its own
# line, followed by the value on the next line.
print("Most recent incident date in dataset:", df['create_time_incident'].max(), sep="\n")
print("\nOldest incident date:", df['create_time_incident'].min(), sep="\n")
print("\nNumber of missing date entries:", df['create_time_incident'].isna().sum(), sep="\n")



Most recent incident date in dataset:
2021-09-20 00:44:18

Oldest incident date:
2014-09-30 23:52:08

Number of missing date entries:
130
