In [None]:
## Using Jinja2 for Text Summarization
# Use Jinja2 to dynamically generate the refined summary of the trajectory data

!pip install Jinja2



In [None]:
import os
import re
from collections import defaultdict
from jinja2 import Template

# Location of the representative-trajectory export that the rest of the
# notebook parses.
representative_data_path = '/content/output_ocorrencias[21] rc 5 rv 8 - z14.csv'

# Fail early with an explicit message when the export is missing.
if not os.path.exists(representative_data_path):
    raise FileNotFoundError(f"File not found: {representative_data_path}")

# Keep the raw lines (newline characters included) for later parsing.
try:
    with open(representative_data_path, 'r', encoding='utf-8') as file:
        representative_data_raw = file.readlines()
except (OSError, UnicodeDecodeError) as e:
    # Chain the original error so the real cause stays visible in the traceback.
    raise RuntimeError(f"Error reading file: {e}") from e

def pre_processing(raw_lines=None):
    """Parse the representative-trajectory export and build formatting helpers.

    Args:
        raw_lines: Iterable of text lines to parse. When omitted, the
            module-level ``representative_data_raw`` is used.

    Returns:
        A tuple ``(structured_data, time_of_day, format_weather,
        format_natureza)`` where ``structured_data`` is a dict with the keys
        ``"metadata"``, ``"settings"`` and ``"trajectory_description"`` and the
        remaining items are helper callables defined below.

    Side effects:
        Prints the parsed data. When a trajectory header row is read, the
        module-level ``aspects`` is set to the list of its column names.
    """
    if raw_lines is None:
        raw_lines = representative_data_raw

    # One "name: frequency" pair of a "{A: 0.5; B: 0.5}" value. Braces are
    # excluded from the name so the first entry does not keep its "{".
    pair_pattern = re.compile(r'([^:;{}]+):\s*(\d*\.?\d+)')

    def parse_representative_data(raw_lines):
        """Split the raw lines into metadata, settings and trajectory rows."""
        global aspects

        parsed_data = {
            "metadata": {},
            "settings": {},
            "trajectory_description": []
        }
        # A line matching a section's pattern is that section's header row.
        header_patterns = {
            "metadata": re.compile(r'\|.*\|'),
            "settings": re.compile(r'thresholdCellSize'),
            "trajectory_description": re.compile(r'lat_lon'),
        }
        # Column names read so far for each section (empty until a header row).
        headers = {section: [] for section in header_patterns}

        current_section = None
        for line in raw_lines:
            line = line.strip()
            if not line:
                # Blank lines carry no data; treating them as rows would
                # overwrite values or append junk events.
                continue

            if "Info input dataset:" in line:
                current_section = "metadata"
            elif "RT setting infos:" in line:
                current_section = "settings"
            elif "RT description:" in line:
                current_section = "trajectory_description"
            elif line.startswith("##"):
                current_section = None  # End of a section
            elif current_section is not None:
                cells = [cell.strip() for cell in line.split(",")]
                if header_patterns[current_section].match(line):
                    headers[current_section] = cells
                    if current_section == "trajectory_description":
                        aspects = cells
                elif not headers[current_section]:
                    # Data row before any header row: nothing to map it onto.
                    continue
                elif current_section == "trajectory_description":
                    parsed_data["trajectory_description"].append(
                        dict(zip(headers[current_section], cells))
                    )
                else:
                    parsed_data[current_section].update(
                        zip(headers[current_section], cells)
                    )
        return parsed_data

    structured_data = parse_representative_data(raw_lines)
    print(structured_data)

    # 'NATUREZA1_DESCRICAO' replaces the former 'POI' aspect.
    aspects_to_analyze = ['NATUREZA1_DESCRICAO', 'WEATHER']

    def analyze_aspect_patterns(data, aspects):
        """Describe the aspects that take a single value along the trajectory.

        An aspect is reported only when every trajectory point that carries it
        has the same value; aspects with several distinct values are omitted.
        """
        observed = defaultdict(lambda: defaultdict(int))  # aspect -> value -> count
        for point in data["trajectory_description"]:
            for aspect_name in aspects:
                if aspect_name in point:
                    observed[aspect_name][point[aspect_name]] += 1

        return " ".join(
            f"Usually, the object's {aspect_name.lower()} is {next(iter(values))}"
            for aspect_name, values in observed.items()
            if len(values) == 1
        )

    def generate_summary(data):
        """Build a plain-text overview of metadata, settings and events."""
        metadata_summary = (
            f"The representative dataset contains {data['metadata'].get('|input.T|', 'N/A')} trajectories "
            f"with a total of {data['metadata'].get('|input.T.points|', 'N/A')} points.\n"
        )

        settings = data["settings"]
        settings_summary = (
            "The analysis used the following parameters:\n"
            f"- Threshold Cell Size: {settings.get('thresholdCellSize', 'N/A')}\n"
            f"- Cell Size: {settings.get('CellSize', 'N/A')}\n"
            f"- Relevant Cell Threshold: {settings.get('tauRelevantCell', 'N/A')}\n"
            f"- Representativeness Value Threshold: {settings.get('tauRepresentativenessValue', 'N/A')}\n"
            f"- Runtime: {settings.get('runtime start', 'N/A')} to {settings.get('runtime end', 'N/A')}.\n"
        )

        trajectory_summary = "Key trajectory events:\n" + "".join(
            f"  - At {point.get('time', 'N/A')}, the object was related to {point.get('NATUREZA1_DESCRICAO', 'N/A')} "
            f"under {point.get('WEATHER', 'N/A')} weather conditions.\n"
            for point in data["trajectory_description"]
        )

        trajectory_patterns_summary = analyze_aspect_patterns(data, aspects=aspects_to_analyze)

        return metadata_summary + settings_summary + trajectory_summary + trajectory_patterns_summary

    # NOTE(review): this summary is built but neither printed nor returned;
    # the call is kept so the flow matches the original cell.
    summary = generate_summary(structured_data)

    def time_of_day(hour):
        """Map an hour to "morning", "afternoon", "evening" or "night"."""
        if 5 <= hour < 12:
            return "morning"
        elif 12 <= hour < 17:
            return "afternoon"
        elif 17 <= hour < 21:
            return "evening"
        else:
            return "night"

    def format_natureza(natureza, min_freq=0.4):
        """Format a "{NAME: freq; ...}" value, keeping entries with freq >= min_freq.

        Returns "unknown nature" for an empty value, the stripped text when the
        value has no "name: freq" pairs, and "N/A" when no entry reaches
        ``min_freq``. Entries with an empty name are dropped.
        """
        if not natureza:
            return "unknown nature"

        if not isinstance(natureza, str):
            return str(natureza).strip()
        if ":" not in natureza:
            return natureza.strip()

        kept = []
        for name, freq in pair_pattern.findall(natureza):
            name = name.strip()
            freq = float(freq)
            if name and freq >= min_freq:
                kept.append(f"{name} ({round(freq * 100)}%)")

        return ", ".join(kept) if kept else "N/A"

    def format_weather(weather):
        """Format a "{CONDITION: freq; ...}" value as lower-case text.

        Values without a braced list of pairs are returned unchanged.
        """
        if not isinstance(weather, str):
            return "N/A" if weather is None else str(weather)

        braced = re.search(r'\{(.*)\}', weather)
        pairs = pair_pattern.findall(braced.group(1)) if braced else []

        conditions = []
        for condition, freq in pairs:
            condition = condition.strip().lower()
            if not condition:
                continue
            freq = float(freq)
            if freq == 1.0:
                conditions.append(f"{condition} (100% frequency)")
            else:
                # round() rather than int(): 0.57 * 100 is 56.999... in floats.
                conditions.append(f"{condition} ({round(freq * 100)}%)")
        return " and ".join(conditions) if conditions else weather

    return structured_data, time_of_day, format_weather, format_natureza

# Parse the loaded file and keep the parsed data and the three helper
# callables at module level; they are passed to textual_descriptor below.
structured_data, time_of_day, format_weather, format_natureza = pre_processing()



{'metadata': {'|input.T|': '1', '|input.T.points|': '160'}, 'settings': {'thresholdCellSize': '14', 'CellSize': '0.06548267239406705', 'tauRelevantCell': '0.05', 'tauRepresentativenessValue': '0.08', '|cell|': '12', 'minPointsRC': '8.0', '|rt|': '27', '|coverPoints|': '147'}, 'trajectory_description': [{'lat_lon': '-25.51013115 -49.222093', 'time': '15/12/2022 00:00:13 - 15/12/2022 00:02:26', 'ATENDIMENTO_BAIRRO_NOME': '{UBERABA: 0.5; ALTO BOQUEIRÃO: 0.5}', 'NATUREZA1_DESCRICAO': '{INVASÃO: 0.5; QUEIMA A CÉU ABERTO: 0.5}', 'SUBCATEGORIA1_DESCRICAO': '{: 0.5; INVASÃO DE EQUIPAMENTO/PATRIMÔNIO PÚBLICO: 0.5}', 'REGIONAL_FATO_NOME': '{CAJURU: 0.5; BOQUEIRÃO: 0.5}', 'mapping': '{21: 1; 21: 4}'}, {'lat_lon': '-25.48492574 -49.21582452', 'time': '15/12/2022 00:07:46 - 15/12/2022 00:13:40', 'ATENDIMENTO_BAIRRO_NOME': '{CAJURU: 0.4; BOQUEIRÃO: 0.4; UBERABA: 0.2}', 'NATUREZA1_DESCRICAO': '{PATRULHA MARIA DA PENHA: 0.4; APOIO: 0.4; DESINTELIGÊNCIA: 0.2}', 'SUBCATEGORIA1_DESCRICAO': '{VISITA DE ME

In [None]:
import re
from datetime import datetime
from collections import defaultdict
from jinja2 import Template

# Helper that extracts the hour from a date/time string
def extract_hour_from_time_string(time_string):
    """Return the hour of the timestamp that precedes the first "-".

    The text before the first hyphen is stripped and parsed as
    ``dd/mm/YYYY HH:MM:SS`` or, failing that, as ``dd/mm/YYYY HH``.

    Raises:
        ValueError: if neither format matches.
    """
    start_text, _, _ = time_string.partition("-")
    time_part = start_text.strip()

    parsed = None
    for fmt in ("%d/%m/%Y %H:%M:%S", "%d/%m/%Y %H"):
        try:
            parsed = datetime.strptime(time_part, fmt)
        except ValueError:
            continue
        break

    if parsed is None:
        raise ValueError(f"Formato de hora inválido: {time_part}")
    return parsed.hour

def textual_descriptor(structured_data, time_of_day, format_weather, format_natureza, usuario=None):
    """Print and return a per-period narrative of the trajectory events.

    Events are grouped by period of day. For each period the most frequent
    nature and weather are reported, followed by the transitions between
    consecutive events. Periods in which no event has a relevant nature are
    omitted.

    Args:
        structured_data: Dict whose ``"trajectory_description"`` entry is a
            list of event dicts (keys read here: ``"time"``,
            ``"NATUREZA1_DESCRICAO"``, ``"WEATHER"``).
        time_of_day: Callable mapping an hour to a period name.
        format_weather: Callable formatting a raw weather value.
        format_natureza: Callable formatting a raw nature value; events for
            which it returns ``"N/A"`` are ignored.
        usuario: Name used in the rendered template. When ``None`` the name is
            read with ``input()``; an empty name falls back to ``"object"``.

    Returns:
        The text rendered by the Jinja2 template.

    Side effects:
        Prints the refined narrative summary and the rendered template, and
        prints a message for each event whose time cannot be processed.
    """

    def summarize_periods(data):
        """Group events by period and aggregate nature, weather and transitions."""
        time_groups = defaultdict(list)
        for event in data["trajectory_description"]:
            try:
                hour = extract_hour_from_time_string(event.get("time", ""))
                period = time_of_day(hour)
                time_groups[period].append(event)
            except Exception as e:
                # Best effort: report the unusable event and keep going.
                print(f"Erro ao processar evento: {e}")

        periods = []
        for period, events in time_groups.items():
            natureza_counts = defaultdict(int)
            weather_counts = defaultdict(int)
            transitions = []

            for i, event in enumerate(events):
                natureza = event.get("NATUREZA1_DESCRICAO", "N/A")
                formatted_curr = format_natureza(natureza)
                if formatted_curr == "N/A":
                    continue  # irrelevant event: not counted, no transition

                natureza_counts[natureza] += 1
                weather_counts[event.get("WEATHER", "N/A")] += 1

                if i == 0:
                    continue
                prev_natureza = events[i - 1].get("NATUREZA1_DESCRICAO", "N/A")
                formatted_prev = format_natureza(prev_natureza)

                # The current event is always relevant at this point, so only
                # two cases remain: coming from an irrelevant event ("moved
                # to") or from a different relevant one ("moved from ... to").
                if formatted_prev == "N/A":
                    transitions.append(f"At {event['time']}, the object moved to {formatted_curr}.")
                elif formatted_prev != formatted_curr:
                    transitions.append(
                        f"At {event['time']}, the object moved from {formatted_prev} to {formatted_curr}."
                    )

            if not natureza_counts:
                # Nothing relevant in this period; max() below would fail on
                # the empty counters.
                continue

            most_common_natureza = max(natureza_counts, key=natureza_counts.get)
            most_common_weather = max(weather_counts, key=weather_counts.get)

            periods.append({
                "time_period": period,
                "natureza": format_natureza(most_common_natureza),
                "weather": format_weather(most_common_weather),
                "transitions": transitions
            })
        return periods

    def generate_narrative_summary(periods):
        """Join the per-period sentences and transitions into one paragraph."""
        summary_parts = []
        for entry in periods:
            summary_parts.append(
                f"In the {entry['time_period']}, the object is usually associated with {entry['natureza']} "
                f"under weather conditions {entry['weather']}."
            )
            summary_parts.extend(entry["transitions"])
        return " ".join(summary_parts)

    def refine_summary_text(summary_text):
        """Apply wording substitutions to the narrative text."""
        summary_text = summary_text.replace("frequently in ", "at ")
        summary_text = summary_text.replace("with weather conditions", "with")
        return summary_text

    # NOTE(review): not called anywhere in this function.
    def preprocess_text(input_text):
        """Collapse whitespace runs and normalise dash separators."""
        input_text = re.sub(r"\s+", " ", input_text.strip())
        input_text = input_text.replace(" - ", "–")
        input_text = input_text.replace("–", "-")
        return input_text

    # Aggregate once and reuse the result for both outputs.
    periods_data = summarize_periods(structured_data)

    narrative_summary = generate_narrative_summary(periods_data)
    refined_summary = refine_summary_text(narrative_summary)
    print(refined_summary)

    # Ask for the user name only when the caller did not supply one.
    if usuario is None:
        usuario = input("Digite o nome do usuário (ou pressione Enter para usar 'The object'): ")
    usuario = usuario.strip() or "object"

    # The name is passed as a render variable instead of being spliced into
    # the template source, so characters such as "{{" in it are not
    # interpreted as template syntax. Transition sentences are prebuilt and
    # keep the wording "the object".
    template_text = """
    {% for period in data %}
    In the {{ period.time_period }}, the {{ usuario }} is usually associated with {{ period.natureza }}.
    {% for transition in period.transitions %}
    {{ transition }}
    {% endfor %}
    {% endfor %}
    """

    template = Template(template_text)
    output_text = template.render(data=periods_data, usuario=usuario)
    print(output_text)
    return output_text

# Build and print the narrative summaries from the parsed data and helpers
# returned by pre_processing().
textual_descriptor(structured_data, time_of_day, format_weather, format_natureza)



In the night, the object is usually associated with {TRÂNSITO (100%) under weather conditions N/A. At 15/12/2022 00:07:46 - 15/12/2022 00:13:40, the object moved from {INVASÃO (50%), QUEIMA A CÉU ABERTO (50%) to {PATRULHA MARIA DA PENHA (40%), APOIO (40%). At 15/12/2022 00:08:23 - 15/12/2022 00:08:52, the object moved to {TRÂNSITO (50%), SUBSTÂNCIA ILÍCITA (50%). At 15/12/2022 00:09:14 - 15/12/2022 00:09:50, the object moved from {TRÂNSITO (50%), SUBSTÂNCIA ILÍCITA (50%) to {PATRULHA MARIA DA PENHA (67%). At 15/12/2022 00:09:20 - 15/12/2022 00:09:50, the object moved from {PATRULHA MARIA DA PENHA (67%) to {INVASÃO (50%), FUNDADA SUSPEITA (ABORDAGEM) (50%). At 15/12/2022 00:10:10 - 15/12/2022 00:10:48, the object moved to {PATRULHA MARIA DA PENHA (100%). At 15/12/2022 00:10:16 - 15/12/2022 00:17:25, the object moved from {PATRULHA MARIA DA PENHA (100%) to {PATRULHA MARIA DA PENHA (53%), TRÂNSITO (47%). At 15/12/2022 00:10:18, the object moved from {PATRULHA MARIA DA PENHA (53%), TRÂNSIT

'\n    \n    In the night, the usuario is usually associated with {TRÂNSITO (100%).\n    \n    At 15/12/2022 00:07:46 - 15/12/2022 00:13:40, the object moved from {INVASÃO (50%), QUEIMA A CÉU ABERTO (50%) to {PATRULHA MARIA DA PENHA (40%), APOIO (40%).\n    \n    At 15/12/2022 00:08:23 - 15/12/2022 00:08:52, the object moved to {TRÂNSITO (50%), SUBSTÂNCIA ILÍCITA (50%).\n    \n    At 15/12/2022 00:09:14 - 15/12/2022 00:09:50, the object moved from {TRÂNSITO (50%), SUBSTÂNCIA ILÍCITA (50%) to {PATRULHA MARIA DA PENHA (67%).\n    \n    At 15/12/2022 00:09:20 - 15/12/2022 00:09:50, the object moved from {PATRULHA MARIA DA PENHA (67%) to {INVASÃO (50%), FUNDADA SUSPEITA (ABORDAGEM) (50%).\n    \n    At 15/12/2022 00:10:10 - 15/12/2022 00:10:48, the object moved to {PATRULHA MARIA DA PENHA (100%).\n    \n    At 15/12/2022 00:10:16 - 15/12/2022 00:17:25, the object moved from {PATRULHA MARIA DA PENHA (100%) to {PATRULHA MARIA DA PENHA (53%), TRÂNSITO (47%).\n    \n    At 15/12/2022 00:10:18,