In [None]:
!pip install pyhive sqlalchemy
!pip install thrift
!pip install thrift_sasl
!pip install sqlalchemy pyhive thrift
!pip install psycopg2-binary
!pip install mysql-connector-python
!pip install mariadb
!pip install pymysql
!pip install py4j
!pip install thrift_sasl

# Conexión a PostgreSQL para crear la tabla

In [None]:
import psycopg2

# Open a direct psycopg2 connection to the `postgres` database on localhost.
# NOTE(review): user and password are hard-coded here; consider reading them
# from environment variables before sharing this notebook.
conn_postgre = psycopg2.connect(
    host='localhost',
    port='5432',
    dbname='postgres',
    user='hive',
    password='password',
)

In [None]:
# Autocommit so the DDL below takes effect without an explicit commit().
conn_postgre.autocommit = True
cursor = conn_postgre.cursor()

# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE raises an
# error once the table already exists.
# The identifiers are unquoted, so PostgreSQL stores them lower-cased
# (propertycode, haslift, ...).
query = """ CREATE TABLE IF NOT EXISTS viviendas (
    propertyCode INT PRIMARY KEY,
    url VARCHAR(255),
    address VARCHAR(255),
    size FLOAT,
    floor VARCHAR(50),
    province VARCHAR(100),
    municipality VARCHAR(100),
    district VARCHAR(100),
    price FLOAT,
    rooms INT,
    hasLift BOOLEAN,
    hasParking BOOLEAN,
    hasTerrace BOOLEAN,
    hasSwimmingPool BOOLEAN,
    hasAirConditioning BOOLEAN,
    hasGarden BOOLEAN,
    bathrooms INT
);
 """
cursor.execute(query)

# Inserción de datos en PostgreSQL

In [None]:
import pandas as pd
from sqlalchemy import create_engine

# Read the housing CSV into a DataFrame.
csv = pd.read_csv(r'../data/structured/vivienda.csv')

# SQLAlchemy engine used only for the bulk load below.
conn_postgre_insercion = create_engine('postgresql://hive:password@localhost:5432/postgres')

# NOTE(review): if_exists='replace' drops any existing `viviendas` table
# (including one created by an explicit CREATE TABLE) and recreates it from
# the DataFrame's columns — confirm that losing that schema is intended.
csv.to_sql(
    name='viviendas',
    con=conn_postgre_insercion,
    if_exists='replace',
    index=False,
)

# Conexión a MariaDB para crear la tabla

In [None]:
import mysql.connector

# Open a connection to the `mariaGESDB` database on localhost:3307.
# NOTE(review): the root password is hard-coded here; consider reading it
# from an environment variable before sharing this notebook.
conn_mariadb = mysql.connector.connect(
    host="localhost",
    port="3307",
    database="mariaGESDB",
    user="root",
    password="my_password",
)

In [None]:
# Autocommit so the DDL below takes effect without an explicit commit().
conn_mariadb.autocommit = True
cursor = conn_mariadb.cursor()

# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE raises an
# error once the table already exists.
query = """ CREATE TABLE IF NOT EXISTS location (
    propertyCode INT PRIMARY KEY,
    latitude FLOAT,
    longitude FLOAT
);
 """
cursor.execute(query)

# Inserción de datos en MariaDB

In [None]:
# Read the location CSV into a DataFrame.
csv_loc = pd.read_csv(r'../data/structured/location.csv')

# SQLAlchemy engine (pymysql driver) used only for the bulk load below.
conn_mariadb_insercion = create_engine('mariadb+pymysql://root:my_password@localhost:3307/mariaGESDB')

# NOTE(review): if_exists='replace' drops any existing `location` table and
# recreates it from the DataFrame's columns — confirm that is intended.
csv_loc.to_sql(
    name='location',
    con=conn_mariadb_insercion,
    if_exists='replace',
    index=False,
)

# Conexión a Hive y creación de tablas para acceder a los datos de PostgreSQL y MariaDB

In [None]:
from pyhive import hive

# Connect to HiveServer2 and list the tables of the `estructurados` database
# as a quick smoke test.
try:
    conn = hive.Connection(host="hiveserver2", port=10000, username="hive", database="estructurados")
    cursor = conn.cursor()
    cursor.execute("SHOW TABLES")
    print(cursor.fetchall())
except Exception as e:
    print(f"Connection failed: {e}")
    # Re-raise instead of swallowing: if the connection fails, `cursor` still
    # refers to the MariaDB cursor created earlier in the notebook, and the
    # following Hive statements would be sent to the wrong database.
    raise

In [None]:
# DDL for a Hive external table handled by JdbcStorageHandler; its
# properties point at the `viviendas` table of the PostgreSQL instance.
ddl_viviendas = '''CREATE EXTERNAL TABLE IF NOT EXISTS VIVIENDAS_IN_POSTGRES (
    propertyCode INT,
    url STRING,
    address STRING,
    size FLOAT,
    `floor` STRING,
    province STRING,
    municipality STRING,
    district STRING,
    price FLOAT,
    rooms INT,
    hasLift BOOLEAN,
    hasParking BOOLEAN,
    hasTerrace BOOLEAN,
    hasSwimmingPool BOOLEAN,
    hasAirConditioning BOOLEAN,
    hasGarden BOOLEAN,
    bathrooms INT
)
STORED BY 'org.apache.hive.storage.jdbc.JdbcStorageHandler'
TBLPROPERTIES (
  "hive.sql.database.type" = "POSTGRES",
  "hive.sql.jdbc.url" = "jdbc:postgresql://hive4-postgres:5432/postgres",
  "hive.sql.dbcp.username" = "hive",
  "hive.sql.dbcp.password" = "password",
  "hive.sql.jdbc.driver" = "org.postgresql.Driver",
  "hive.sql.table" = "viviendas"
)
'''
cursor.execute(ddl_viviendas)

In [None]:
import pandas as pd

# Query every row of the Hive table VIVIENDAS_IN_POSTGRES.
cursor.execute('''SELECT * FROM VIVIENDAS_IN_POSTGRES''')

# Fetch all rows; column labels are the first field of each
# cursor.description entry.
resultados = cursor.fetchall()
column_names = [columna[0] for columna in cursor.description]
df = pd.DataFrame(data=resultados, columns=column_names)

# Preview the first rows in the notebook.
df.head()


In [None]:
# DDL for a Hive external table handled by JdbcStorageHandler; its
# properties point at the `location` table of the MariaDB instance.
# latitude/longitude are declared STRING on the Hive side.
ddl_location = '''CREATE EXTERNAL TABLE IF NOT EXISTS LOCATION_IN_MARIADB (
    propertyCode INT,
    latitude STRING,
    longitude STRING
      
)
STORED BY 'org.apache.hive.storage.jdbc.JdbcStorageHandler'
TBLPROPERTIES (
  "hive.sql.database.type" = "MYSQL",
  "hive.sql.jdbc.url" = "jdbc:mariadb://hive4-mariadb:3306/mariaGESDB",
  "hive.sql.dbcp.username" = "root",
  "hive.sql.dbcp.password" = "my_password",
  "hive.sql.jdbc.driver" = "org.mariadb.jdbc.Driver",
  "hive.sql.table" = "location"
)
'''
cursor.execute(ddl_location)

In [None]:
import pandas as pd

# Query every row of the Hive table LOCATION_IN_MARIADB.
cursor.execute('''SELECT * FROM LOCATION_IN_MARIADB''')

# Fetch all rows; column labels are the first field of each
# cursor.description entry.
resultados = cursor.fetchall()
column_names = [columna[0] for columna in cursor.description]
df = pd.DataFrame(data=resultados, columns=column_names)

# Preview the first rows in the notebook.
df.head()

In [None]:
# Join the two Hive external tables on propertyCode.
consulta_join = '''SELECT md.propertyCode, md.latitude, vp.haslift, vp.url
FROM LOCATION_IN_MARIADB as md 
INNER JOIN VIVIENDAS_IN_POSTGRES as vp 
ON md.propertyCode = vp.propertyCode '''
cursor.execute(consulta_join)

# Fetch all rows; column labels are the first field of each
# cursor.description entry.
resultados = cursor.fetchall()
column_names = [columna[0] for columna in cursor.description]
df = pd.DataFrame(data=resultados, columns=column_names)

df.head()


# DATOS NO ESTRUCTURADOS

In [None]:
!pip install pandas

In [None]:
!pip install elasticsearch
!pip install langdetect
!pip install googletrans
!pip install deep-translator
!pip install pymongo


In [None]:
import pprint
import json
from elasticsearch import Elasticsearch

In [None]:
import json
from deep_translator import GoogleTranslator

# Load the source JSON (a list of items, each optionally with a "description").
with open('../data/unstructured/archivo.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# The source and target languages are the same for every item, so a single
# translator instance is built once instead of once per description.
translator = GoogleTranslator(source='en', target='es')

# Translate each non-empty description from English to Spanish, in place.
for item in data:
    description = item.get("description", "")
    if description:
        try:
            translation = translator.translate(description)
            item["description"] = translation
            print(f'Description "{description}" ha sido traducida.')
        except Exception as e:
            # Best effort: report the failure and keep the original text.
            print(f"Error al traducir '{description}': {e}")

# Save the translated JSON, keeping non-ASCII characters readable.
with open('../data/unstructured/archivo_traducido.json', 'w', encoding='utf-8') as file:
    json.dump(data, file, ensure_ascii=False, indent=4)

print("Traducción completada. Archivo guardado como 'archivo_traducido.json'.")


In [None]:
import unicodedata

def quitar_tildes(texto):
    """Return ``texto`` lower-cased, without diacritics or selected symbols.

    The text is NFD-normalized so accented letters split into a base letter
    plus combining marks; every combining mark (Unicode category ``Mn``) is
    then dropped, which also turns ``ñ`` into ``n``. The characters
    ``¡ ! ¿ ? ² ³ ¨ º ª`` are removed as well.
    """
    descartados = set('¡!¿?²³¨ºª')
    conservados = []
    for caracter in unicodedata.normalize('NFD', texto):
        if unicodedata.category(caracter) == 'Mn':
            continue  # combining mark left over from the decomposition
        if caracter in descartados:
            continue
        conservados.append(caracter)
    return ''.join(conservados).lower()

In [None]:
def procesar_json(json_data):
    """Apply ``quitar_tildes`` to every non-empty "description" in ``json_data``.

    Items without a "description" key, or whose description is falsy
    (empty string, ``None``), are left untouched. The items are modified in
    place and the same ``json_data`` object is returned.
    """
    for registro in json_data:
        if "description" not in registro:
            continue
        texto = registro["description"]
        if texto:
            registro["description"] = quitar_tildes(texto)
    return json_data

# Load the translated JSON file.
with open('../data/unstructured/archivo_traducido.json', 'r', encoding='utf-8') as entrada:
    data = json.load(entrada)

# Normalize the descriptions; procesar_json edits the items in place and
# returns the same object.
data_procesada = procesar_json(data)

# Write the processed JSON to a new file, keeping non-ASCII characters readable.
with open('../data/unstructured/archivo_procesado.json', 'w', encoding='utf-8') as salida:
    json.dump(data_procesada, salida, ensure_ascii=False, indent=4)

Hackathon

In [None]:
# Read the processed JSON file.
# The file was written as UTF-8 with ensure_ascii=False, so it must be read
# back as UTF-8 explicitly; the platform default encoding may differ.
with open("../data/unstructured/archivo_procesado.json", "r", encoding="utf-8") as file:
    documents = json.load(file)


In [None]:
# Name of the Elasticsearch index that the cells below create, delete,
# fill and search.
INDEX_NAME = "descripciones_index"

In [None]:
# Elasticsearch client pointed at http://elasticsearch:9200.
es = Elasticsearch("http://elasticsearch:9200")

In [None]:
# Explicit mapping: both fields are indexed as full-text `text`.
mapping = {
    "mappings": {
        "properties": {
            "propertyCode": {"type": "text"},
            "description": {"type": "text"},
        }
    }
}

# Create the index only when it is not there yet.
if es.indices.exists(index=INDEX_NAME):
    print(f"Index '{INDEX_NAME}' already exists.")
else:
    es.indices.create(index=INDEX_NAME, body=mapping)
    print(f"Index '{INDEX_NAME}' created with mapping.")

In [None]:
# Deletes the whole index named INDEX_NAME.
# NOTE(review): when the notebook runs top to bottom, this executes right
# after the cell that creates the index with an explicit mapping and before
# the documents are indexed — confirm whether this is meant as a manual
# reset cell rather than part of the normal flow.
es.indices.delete(index=INDEX_NAME)

In [None]:
from elasticsearch import Elasticsearch, helpers
import json

# Index the documents one request at a time and print each response.
# No id is passed to es.index.
for documento in documents:
    res = es.index(index=INDEX_NAME, document=documento)
    print(res)

In [None]:
# Full-text match query on the `description` field.
consulta = {
    'query': {
        'match': {'description': 'codiciado'}
    }
}
res = es.search(index=INDEX_NAME, body=consulta)

# Pretty-print the raw response body.
print(json.dumps(res.body, indent=4))

DATOS ENLAZADOS

In [None]:
!pip install SPARQLWrapper

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Point the client at the Wikidata SPARQL endpoint and ask for JSON results.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)

# SPARQL query. ?item is bound explicitly with VALUES: the SELECT clause
# projects ?item, and without this binding it would come back unbound
# because no triple pattern mentions it.
query = """
SELECT ?item ?itemLabel ?description ?population ?coordinate
WHERE {
  VALUES ?item { wd:Q2807 }             # Madrid
  ?item wdt:P31 wd:Q515;                # Madrid como una instancia de ciudad (Q515)
        rdfs:label ?itemLabel;          # Nombre de la ciudad
        schema:description ?description;  # Descripción
        wdt:P1082 ?population;          # Población
        wdt:P625 ?coordinate.           # Coordenadas geográficas
  FILTER (LANG(?itemLabel) = "es" && LANG(?description) = "es")
}
LIMIT 1
"""

# Run the query and decode the JSON response.
sparql.setQuery(query)
results = sparql.query().convert()

# Print the fields of each returned binding.
for result in results["results"]["bindings"]:
    print(f"Nombre: {result['itemLabel']['value']}")
    print(f"Descripción: {result['description']['value']}")
    print(f"Población: {result['population']['value']}")
    print(f"Coordenadas: {result['coordinate']['value']}")