# Laboratorio No. 5 Threat Hunting

## Parte No. 1

In [1]:
import json
import pandas as pd
import tldextract

In [3]:
# Paso 1: Cargar la información del archivo en una lista
file_path = "large_eve.json"

with open(file_path, "r") as file:
    data = [json.loads(line) for line in file]

print(f"Total de registros: {len(data)}")

Total de registros: 746909


In [4]:
# Paso 2: Filtrar solo registros DNS
dns_records = [record for record in data if record.get("event_type") == "dns"]

In [5]:
# Paso 3: Mostrar cantidad de registros filtrados
print(f"Registros DNS filtrados: {len(dns_records)}")

Registros DNS filtrados: 15749


In [6]:
# Paso 4: Mostrar 2 registros DNS aleatorios
print("Ejemplo de registros DNS:")
for record in dns_records[:2]:
    print(json.dumps(record, indent=2))

Ejemplo de registros DNS:
{
  "timestamp": "2017-07-22T17:33:16.661646-0500",
  "flow_id": 1327836194150542,
  "pcap_cnt": 22269,
  "event_type": "dns",
  "vlan": 110,
  "src_ip": "2001:0dbb:0c18:0011:0260:6eff:fe30:0863",
  "src_port": 59680,
  "dest_ip": "2001:0500:0001:0000:0000:0000:803f:0235",
  "dest_port": 53,
  "proto": "UDP",
  "dns": {
    "type": "query",
    "id": 15529,
    "rrname": "api.wunderground.com",
    "rrtype": "A",
    "tx_id": 0
  }
}
{
  "timestamp": "2017-07-22T17:33:24.990320-0500",
  "flow_id": 2022925111925872,
  "pcap_cnt": 54352,
  "event_type": "dns",
  "vlan": 110,
  "src_ip": "2001:0dbb:0c18:0011:0260:6eff:fe30:0863",
  "src_port": 38051,
  "dest_ip": "2001:0500:0003:0000:0000:0000:0000:0042",
  "dest_port": 53,
  "proto": "UDP",
  "dns": {
    "type": "query",
    "id": 58278,
    "rrname": "stork79.dropbox.com",
    "rrtype": "A",
    "tx_id": 0
  }
}


In [7]:
# Paso 5: Normalizar la data en un DataFrame
df = pd.json_normalize(dns_records)

# Mostrar la forma del DataFrame
print(f"Shape del DataFrame: {df.shape}")

Shape del DataFrame: (15749, 18)


In [8]:
# Paso 6: Filtrar registros DNS tipo 'A' (resuelven a una IP)
df_A = df[df["dns.rrtype"] == "A"]

# Mostrar cantidad de registros filtrados
print(f"Registros tipo A: {df_A.shape[0]}")

Registros tipo A: 2849


In [9]:
# Paso 7: Filtrar dominios únicos
unique_domains = df_A["dns.rrname"].unique()
print(f"Dominios únicos: {len(unique_domains)}")

Dominios únicos: 177


In [12]:
# Paso 8: Función para obtener el TLD
def get_tld(domain):
    extracted = tldextract.extract(domain)
    return f"{extracted.domain}.{extracted.suffix}" if extracted.suffix else extracted.domain

test_domains = ["api.wunderground.com", "safebrowsing.clients.google.com.home"]
for domain in test_domains:
    print(f"TLD de {domain}: {get_tld(domain)}")

TLD de api.wunderground.com: wunderground.com
TLD de safebrowsing.clients.google.com.home: home


In [13]:
# Paso 9: Crear nueva columna en el DataFrame
df_tld = pd.DataFrame({"dns.rrname": unique_domains})
df_tld["domain_tld"] = df_tld["dns.rrname"].apply(get_tld)

# Mostrar resultado
df_tld.head()

Unnamed: 0,dns.rrname,domain_tld
0,api.wunderground.com,wunderground.com
1,stork79.dropbox.com,dropbox.com
2,hpca-tier2.office.aol.com.ad.aol.aoltw.net,aoltw.net
3,safebrowsing.clients.google.com.home,home
4,fxfeeds.mozilla.com,mozilla.com


In [14]:
df_tld.to_csv("filtered_domains.csv", index=False)

## Parte No. 2

In [50]:
import os
import google.generativeai as genai
import pandas as pd
import time
import re

In [54]:
# Configurar API Key desde .env
from dotenv import load_dotenv
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY_NEW")

genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel("models/gemini-1.5-pro")

In [55]:
# Función para clasificar dominios
def classify_domain(domain):
    prompt = f"""
Eres un experto en ciberseguridad. Clasifica el siguiente dominio como DGA (1) o legítimo (0).
Solo responde con 1 o 0. Dominio: {domain}
"""
    try:
        response = model.generate_content(prompt)
        text = response.text.strip()

        match = re.search(r'[01]', text)
        if match:
            result = int(match.group(0))
            print(f"[✔] {domain} -> {result}")
            return result
        else:
            print(f"[!] Respuesta inesperada para {domain}: '{text}'")
            return -1
    except Exception as e:
        print(f"[X] Error clasificando {domain}: {e}")
        return -1

In [56]:
# Leer CSV
df_tld = pd.read_csv("filtered_domains.csv", names=["dns.rrname", "domain_tld"])

# Filtrar dominios inválidos antes de clasificar
df_tld = df_tld[df_tld["domain_tld"].str.contains(r"\.", regex=True, na=False)]

# Clasificar dominios
df_tld["dga_label"] = df_tld["domain_tld"].apply(lambda x: classify_domain(x) if x else -1)
time.sleep(1.5)

# Filtrar solo DGA
dga_domains = df_tld[df_tld["dga_label"] == 1]
dga_domains_unique = dga_domains.drop_duplicates(subset=["domain_tld"])

# Mostrar resultados
print("Cantidad de dominios clasificados como DGA (con posibles duplicados):", dga_domains.shape[0])
print("Cantidad de dominios DGA únicos:", dga_domains_unique.shape[0])
print(dga_domains_unique.head())

[✔] wunderground.com -> 0
[✔] dropbox.com -> 0
[✔] aoltw.net -> 0
[✔] mozilla.com -> 0
[✔] metasploit.com -> 0
[✔] aol.com -> 0
[✔] aoltw.net -> 0
[✔] aol.com -> 0
[✔] aol.com -> 0
[✔] aoltw.net -> 0
[✔] aol.com -> 0
[✔] google.com -> 0
[✔] stayonline.net -> 0
[✔] aoltw.net -> 0
[✔] aol.com -> 0
[✔] aol.com -> 0
[✔] informaction.com -> 0
[✔] vmware.com -> 0
[✔] mozilla.com -> 0
[✔] windows.com -> 0
[✔] vmware.com -> 0
[✔] ntkrnlpa.info -> 0
[✔] portswigger.net -> 0
[✔] offensive-security.com -> 0
[✔] stayonline.net -> 0
[✔] stopbadware.org -> 0
[✔] aoltw.net -> 0
[✔] arrancar.org -> 0
[✔] sql-ledger.org -> 0
[✔] backtrack-linux.org -> 0
[✔] stayonline.net -> 0
[✔] mozilla.com -> 0
[✔] theanime.cn -> 0
[✔] theanime.cn -> 0
[✔] aoltw.net -> 0
[✔] aoltw.net -> 0
[✔] aoltw.net -> 0
[✔] stayonline.net -> 0
[✔] aoltw.net -> 0
[✔] aol.com -> 0
[✔] aoltw.net -> 0
[✔] phpmyadmin.net -> 0
[✔] google.com -> 0
[✔] google.com -> 0
[✔] microsoft.com -> 0
[✔] aoltw.net -> 0
[✔] comcast.net -> 0
[✔] g

In [58]:
# Guardar en CSV
dga_domains_unique.to_csv("filtered_dga_domains.csv", index=False)