# Laboratorio No. 5 Threat Hunting

## Parte No. 1

In [1]:
import json
import pandas as pd
import tldextract

In [3]:
# Step 1: Load the file contents into a list
file_path = "large_eve.json"

# The file is read line by line and every non-blank line is parsed as one JSON
# document. The encoding is set explicitly so decoding does not depend on the
# platform's locale default, and blank lines (e.g. a trailing empty line) are
# skipped because json.loads raises on empty input.
with open(file_path, "r", encoding="utf-8") as file:
    data = [json.loads(line) for line in file if line.strip()]

print(f"Total de registros: {len(data)}")

Total de registros: 746909


In [4]:
# Step 2: Keep only the records whose "event_type" field equals "dns"
dns_records = list(filter(lambda event: event.get("event_type") == "dns", data))

In [5]:
# Step 3: Report how many records survived the DNS filter
dns_record_count = len(dns_records)
print(f"Registros DNS filtrados: {dns_record_count}")

Registros DNS filtrados: 15749


In [6]:
# Step 4: Show 2 example DNS records.
# NOTE: these are the first two entries of dns_records (a slice), not a random sample.
print("Ejemplo de registros DNS:")
sample_records = dns_records[:2]
for sample in sample_records:
    pretty = json.dumps(sample, indent=2)
    print(pretty)

Ejemplo de registros DNS:
{
  "timestamp": "2017-07-22T17:33:16.661646-0500",
  "flow_id": 1327836194150542,
  "pcap_cnt": 22269,
  "event_type": "dns",
  "vlan": 110,
  "src_ip": "2001:0dbb:0c18:0011:0260:6eff:fe30:0863",
  "src_port": 59680,
  "dest_ip": "2001:0500:0001:0000:0000:0000:803f:0235",
  "dest_port": 53,
  "proto": "UDP",
  "dns": {
    "type": "query",
    "id": 15529,
    "rrname": "api.wunderground.com",
    "rrtype": "A",
    "tx_id": 0
  }
}
{
  "timestamp": "2017-07-22T17:33:24.990320-0500",
  "flow_id": 2022925111925872,
  "pcap_cnt": 54352,
  "event_type": "dns",
  "vlan": 110,
  "src_ip": "2001:0dbb:0c18:0011:0260:6eff:fe30:0863",
  "src_port": 38051,
  "dest_ip": "2001:0500:0003:0000:0000:0000:0000:0042",
  "dest_port": 53,
  "proto": "UDP",
  "dns": {
    "type": "query",
    "id": 58278,
    "rrname": "stork79.dropbox.com",
    "rrtype": "A",
    "tx_id": 0
  }
}


In [7]:
# Step 5: Flatten the DNS records into a DataFrame
# (nested keys become dotted column names such as "dns.rrtype")
df = pd.json_normalize(data=dns_records)

# Report the (rows, columns) shape of the DataFrame
print(f"Shape del DataFrame: {df.shape}")

Shape del DataFrame: (15749, 18)


In [8]:
# Step 6: Keep only DNS records of type 'A' (the ones that resolve to an IP)
is_a_record = df["dns.rrtype"] == "A"
df_A = df[is_a_record]

# Report how many rows matched the filter
print(f"Registros tipo A: {len(df_A)}")

Registros tipo A: 2849


In [9]:
# Step 7: Collect the distinct domain names seen in the type-A records
rrname_column = df_A["dns.rrname"]
unique_domains = rrname_column.unique()
unique_domain_count = len(unique_domains)
print(f"Dominios únicos: {unique_domain_count}")

Dominios únicos: 177


In [12]:
# Step 8: Function to obtain the TLD
def get_tld(domain):
    """Return the "domain.suffix" portion of a host name.

    The name is split with ``tldextract.extract``. Despite the function's
    name, the result is the extracted domain joined to its suffix
    (e.g. ``wunderground.com``), not the suffix alone.

    Args:
        domain: Host name to analyse.

    Returns:
        ``"<domain>.<suffix>"`` when the extraction yields a non-empty suffix,
        otherwise only the extracted domain component.
    """
    parts = tldextract.extract(domain)
    # No recognised suffix: only the domain component is available.
    if not parts.suffix:
        return parts.domain
    return f"{parts.domain}.{parts.suffix}"

# Quick manual check of get_tld on two sample host names
test_domains = [
    "api.wunderground.com",
    "safebrowsing.clients.google.com.home",
]
for sample_domain in test_domains:
    extracted_value = get_tld(sample_domain)
    print(f"TLD de {sample_domain}: {extracted_value}")

TLD de api.wunderground.com: wunderground.com
TLD de safebrowsing.clients.google.com.home: home


In [13]:
# Step 9: Build a DataFrame of the unique domains and add a "domain_tld"
# column holding the result of get_tld for each one
df_tld = pd.DataFrame({"dns.rrname": unique_domains}).assign(
    domain_tld=lambda frame: frame["dns.rrname"].apply(get_tld)
)

# Display the first rows of the result
df_tld.head()

Unnamed: 0,dns.rrname,domain_tld
0,api.wunderground.com,wunderground.com
1,stork79.dropbox.com,dropbox.com
2,hpca-tier2.office.aol.com.ad.aol.aoltw.net,aoltw.net
3,safebrowsing.clients.google.com.home,home
4,fxfeeds.mozilla.com,mozilla.com


In [14]:
# Write the domain / domain_tld table to CSV without the index column
df_tld.to_csv(path_or_buf="filtered_domains.csv", index=False)