# Working With The XML Format

In [1]:
!pip install CyberHerder==1.0.7

Defaulting to user installation because normal site-packages is not writeable


In [2]:
from CyberHerder import dataBreach

dataBreach(amount = 1000)

True

In [5]:
import os
import copy
import xml.etree.ElementTree as ET

user_home_environment = os.path.expanduser('~') + '/'
breached_data = user_home_environment + 'BreachedData/'

In [6]:
!ls $breached_data | grep .*xml

creditCards.xml
devices.xml
people.xml


## Credit Cards

In [40]:
tree = ET.parse(f'{breached_data}creditCards.xml')
root = tree.getroot()

print(f'The root tag is: {root.tag}')
print("The root has the following children")
print()

credit_card_dataset = []

count = 0
for child in root:
    if count < 3:
        print(child.tag)    
        print(f'Name: {child.attrib["Name"]}')
        print(f'Credit Card: {child.attrib["Credit_Card_Information"]}')
        print(f'Pin Number: {child.attrib["Pin_Number"]}')
        print()
        count += 1
    credit_card_dataset.append(list(child.attrib.values()))

The root tag is: Credit_Information
The root has the following children

Personally_Identifiable_Information
Name: CONWAY
Credit Card: 2239357568782955
Pin Number: 165711

Personally_Identifiable_Information
Name: WILEY
Credit Card: 3948383992138991
Pin Number: 169419

Personally_Identifiable_Information
Name: LEACH
Credit Card: 9407358302862132
Pin Number: 844309



## Person Data

In [41]:
tree = ET.parse(f'{breached_data}people.xml')
root = tree.getroot()

print(f'The root tag is: {root.tag}')
print("The root has the following children")
print()

people_dataset = []

count = 0
for child in root:
    if count == 3:
        print(child.tag)

        print(f'Username: {child.attrib["Username"]}')
        print(f'Company: {child.attrib["Company"]}')
        print(f'Phone Number: {child.attrib["Phone_Number"]}')
        print(f'UID: {child.attrib["UID"]}')
        print(f'Zip Code: {child.attrib["Zip_Code"]}')
        print(f'Email: {child.attrib["Email"]}')
        print()
        count += 1
        
    people_dataset.append(list(child.attrib.values()))

The root tag is: People
The root has the following children



## CSV Module Insight
- Tie the employee dataset together: Credit_Information and People datasets

In [43]:
print(credit_card_dataset[:2])
print(people_dataset[:2])

print(len(credit_card_dataset))

[['CONWAY', '2239357568782955', '165711'], ['WILEY', '3948383992138991', '169419']]
[['CONWAY', 'Cybersploit', '1179720068', 'czuyAmVWrM', '713054', 'ROTH@cybersploit.edu'], ['WILEY', 'Liberty-Crypto', '7505337916', 'qIs7EW4l1K', '821335', 'HATFIELD@cybersploit.org']]
1000


In [51]:
employee_dataset = {"headers": ["Company", "Phone_Number", "UID", "Zip_Code", "Email","Credit_Card_Information", "Pin_Number"]}

if len(credit_card_dataset) != len(people_dataset):
    raise Exception("Datasets must be equal to merge!")
    
idx = 0

while idx < len(credit_card_dataset):
    person = people_dataset[idx]
    credit_information = credit_card_dataset[idx]
    
    if person[0] != credit_information[0]:
        raise Exception(f'At index {idx}, the two datasets do not map')
    else:
        employee_dataset[person[0]] = copy.deepcopy(person[1:])
        employee_dataset[person[0]].extend(copy.deepcopy(credit_information[1:]))
    
    idx += 1

In [53]:
print(f'Merged headers: {employee_dataset["headers"]}')

Merged headers: ['Company', 'Phone_Number', 'UID', 'Zip_Code', 'Email', 'Credit_Card_Information', 'Pin_Number']


In [52]:
print('First five rows of the employee dataset. Order may change across instances!')

count = 0

for key, val in employee_dataset.items():
    if key != 'headers':
        print(f'Name: {key}')
        print(f'Data: {val}')
        count += 1
        
    if count == 5:
        break

First five rows of the employee dataset. Order may change across instances!
Name: CONWAY
Data: ['Cybersploit', '1179720068', 'czuyAmVWrM', '713054', 'ROTH@cybersploit.edu', '2239357568782955', '165711']
Name: WILEY
Data: ['Liberty-Crypto', '5961983311', 'lVIxsiutdy', '271410', 'MCCALL@cybersploit.org', '2768547065573344', '248547']
Name: LEACH
Data: ['Liberty-Crypto', '8677257927', 'rY4MA5VBwL', '411361', 'FULLER@libertycrypto.edu', '9836726830716642', '044560']
Name: SHARP
Data: ['Sigcryptal', '9566401280', 'p3TLfwduxQ', '759274', 'HYDE@sigcryptal.edu', '9549887639360851', '777027']
Name: SCHMIDT
Data: ['Cybersploit', '6845813820', 'mCl83jkUWe', '265700', 'COOPER@sigcryptal.com', '0925563288112714', '439982']
