In [3]:
import json
import boto3
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment
# before the credentials are looked up below.
load_dotenv()

# Where the ingestion state lives in S3.
S3_BUCKET_NAME = 'delitos-informaticos-tomaslopera'
STATE_KEY = 'metadata/state.json'

# Credentials are read from the environment; either value is None when the
# corresponding variable is not set.
AWS_ACCESS_KEY = os.environ.get('AWS_ACCESS_KEY')
AWS_SECRET_KEY = os.environ.get('AWS_SECRET_KEY')

# Shared S3 client used by the state helpers.
s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY,
)

In [5]:
def create_or_update_state(last_processed_date):
    """Persist the ingestion watermark as a small JSON document in S3.

    Overwrites the object at ``STATE_KEY`` in ``S3_BUCKET_NAME`` with
    ``{"last_processed_date": <value>}``.

    Args:
        last_processed_date: Watermark to store. Strings are stored unchanged.
            Objects exposing ``isoformat()`` (``datetime``, ``date``,
            ``pandas.Timestamp``) are converted to an ISO-8601 string first,
            because ``json.dumps`` cannot serialize them directly.

    Raises:
        TypeError: If the value is neither JSON-serializable nor convertible
            through ``isoformat()``.
        Any exception raised by ``s3.put_object`` propagates to the caller.
    """
    # Normalize date-like objects so callers may pass parsed dates as well as
    # the raw strings returned by the API.
    if hasattr(last_processed_date, 'isoformat'):
        last_processed_date = last_processed_date.isoformat()

    state_data = {'last_processed_date': last_processed_date}
    json_data = json.dumps(state_data, indent=2)

    s3.put_object(
        Bucket=S3_BUCKET_NAME,
        Key=STATE_KEY,
        # Encode explicitly; get_state() decodes the body as UTF-8.
        Body=json_data.encode('utf-8'),
        ContentType='application/json'
    )
    print(f"State updated: {state_data}")

def get_state():
    """Load the ingestion state document from S3.

    Returns:
        The parsed JSON content of ``STATE_KEY`` in ``S3_BUCKET_NAME``, or
        ``None`` when the object does not exist or when reading/parsing it
        fails for any other reason. A message is printed in both ``None``
        cases, but the caller cannot tell them apart from the return value.
    """
    try:
        payload = s3.get_object(Bucket=S3_BUCKET_NAME, Key=STATE_KEY)['Body'].read()
        return json.loads(payload.decode('utf-8'))
    except s3.exceptions.NoSuchKey:
        # No state has been written yet.
        print("State file doesn't exist yet")
    except Exception as e:
        # Any other failure (S3 error, invalid UTF-8, malformed JSON).
        print(f"Error reading state: {e}")
    return None

In [16]:
import pandas as pd
from sodapy import Socrata

# Socrata client for the www.datos.gov.co portal; the second argument is the
# app token, and None means requests are sent without one.
client = Socrata("www.datos.gov.co", None)

# First batch of dataset 4v6r-wu98. Ordering by fecha_hecho makes the batch
# deterministic and guarantees the final row carries the latest date, which
# the state-tracking cells rely on when they record the last processed date.
# NOTE(review): with a fixed limit the final date in the batch may be only
# partially fetched — confirm how ties on fecha_hecho should be handled.
results = client.get("4v6r-wu98", order="fecha_hecho", limit=2000)

data = pd.DataFrame.from_records(results)



In [17]:
# Show the fetched records; the notebook renders the cell's last expression.
data

Unnamed: 0,fecha_hecho,cod_depto,departamento,cod_muni,municipio,descripcion_conducta,cantidad
0,2006-05-13T00:00:00.000,11,BOGOTA D.C.,11001,BOGOTA D.C.,ARTICULO 269I. HURTO POR MEDIOS INFORMATICOS Y...,1
1,2006-10-06T00:00:00.000,11,BOGOTA D.C.,11001,BOGOTA D.C.,ARTICULO 269A. ACCESO ABUSIVO A UN SISTEMA INF...,1
2,2008-05-24T00:00:00.000,11,BOGOTA D.C.,11001,BOGOTA D.C.,ARTICULO 269A. ACCESO ABUSIVO A UN SISTEMA INF...,1
3,2009-01-01T00:00:00.000,41,HUILA,41001,NEIVA,ARTICULO 269F. VIOLACION DE DATOS PERSONALES,1
4,2009-01-02T00:00:00.000,11,BOGOTA D.C.,11001,BOGOTA D.C.,ARTICULO 269A. ACCESO ABUSIVO A UN SISTEMA INF...,1
...,...,...,...,...,...,...,...
1995,2011-06-25T00:00:00.000,11,BOGOTA D.C.,11001,BOGOTA D.C.,ARTICULO 269I. HURTO POR MEDIOS INFORMATICOS Y...,1
1996,2011-06-26T00:00:00.000,15,BOYACA,15299,GARAGOA,ARTICULO 269F. VIOLACION DE DATOS PERSONALES,1
1997,2011-06-26T00:00:00.000,11,BOGOTA D.C.,11001,BOGOTA D.C.,ARTICULO 269A. ACCESO ABUSIVO A UN SISTEMA INF...,1
1998,2011-06-26T00:00:00.000,17,CALDAS,17001,MANIZALES,ARTICULO 269F. VIOLACION DE DATOS PERSONALES,1


In [12]:
# Watermark for the next incremental run: the latest fecha_hecho in the batch.
# The values are ISO-8601 strings, which sort chronologically, so max() gives
# the latest date regardless of the order the rows were returned in.
if data.empty:
    raise ValueError("No rows fetched; there is no date to record as last processed")
last_processed_data = data['fecha_hecho'].max()

In [13]:
# Persist the watermark to S3 so a later fetch can resume after this date.
create_or_update_state(last_processed_data)

State updated: {'last_processed_date': '2011-06-27T00:00:00.000'}


In [14]:
# Read the persisted state back from S3. get_state() returns None when the
# object is missing or could not be read, so only print when a state exists.
current_state = get_state()
if current_state:
    print(f"Current last_processed_date: {current_state['last_processed_date']}")

Current last_processed_date: 2011-06-27T00:00:00.000


In [15]:
# Incremental fetch: only rows dated after the stored watermark.
# current_state is None when no state could be loaded; in that case fall back
# to an unfiltered first batch instead of failing on a None subscript.
last_processed_date = (current_state or {}).get('last_processed_date')

# Order by fecha_hecho so the batch is deterministic and its last row holds
# the latest date fetched.
query_options = {'order': 'fecha_hecho', 'limit': 2000}
if last_processed_date:
    # NOTE(review): the strict '>' skips any rows sharing the watermark date
    # that were cut off by the previous batch's limit — confirm this is acceptable.
    query_options['where'] = f"fecha_hecho > '{last_processed_date}'"

results = client.get("4v6r-wu98", **query_options)

data = pd.DataFrame.from_records(results)

data

Unnamed: 0,fecha_hecho,cod_depto,departamento,cod_muni,municipio,descripcion_conducta,cantidad
0,2011-06-28T00:00:00.000,68,SANTANDER,68001,BUCARAMANGA,ARTICULO 269A. ACCESO ABUSIVO A UN SISTEMA INF...,1
1,2011-06-28T00:00:00.000,11,BOGOTA D.C.,11001,BOGOTA D.C.,ARTICULO 269I. HURTO POR MEDIOS INFORMATICOS Y...,1
2,2011-06-28T00:00:00.000,54,NORTE DE SANTANDER,54874,VILLA DEL ROSARIO,ARTICULO 269F. VIOLACION DE DATOS PERSONALES,1
3,2011-06-28T00:00:00.000,08,ATLANTICO,08001,BARRANQUILLA,ARTICULO 269A. ACCESO ABUSIVO A UN SISTEMA INF...,1
4,2011-06-29T00:00:00.000,68,SANTANDER,68001,BUCARAMANGA,ARTICULO 269F. VIOLACION DE DATOS PERSONALES,1
...,...,...,...,...,...,...,...
1995,2012-05-08T00:00:00.000,13,BOLIVAR,13430,MAGANGUE,ARTICULO 269I. HURTO POR MEDIOS INFORMATICOS Y...,1
1996,2012-05-08T00:00:00.000,13,BOLIVAR,13430,MAGANGUE,ARTICULO 269I. HURTO POR MEDIOS INFORMATICOS Y...,1
1997,2012-05-08T00:00:00.000,68,SANTANDER,68276,FLORIDABLANCA,ARTICULO 269I. HURTO POR MEDIOS INFORMATICOS Y...,1
1998,2012-05-08T00:00:00.000,11,BOGOTA D.C.,11001,BOGOTA D.C.,ARTICULO 269I. HURTO POR MEDIOS INFORMATICOS Y...,1
