# **Working With Semi-Structured Data**



In [1]:
# Mount Google Drive at /content/drive so files under that mount point can be opened by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


***Extracting Data From an XML File***

In [2]:
import xml.etree.ElementTree as ET
import pandas as pd

# Parse the XML file
tree = ET.parse('/content/drive/My Drive/Data Source/books.xml')
root = tree.getroot()


def _child_text(parent, tag):
    """Return the text of the first direct child named `tag`, or None.

    None is returned both when the child element is absent and when it is
    present but carries no text, so callers never dereference a missing node.
    """
    child = parent.find(tag)
    return None if child is None else child.text


def _book_record(book):
    """Flatten one <book> element into a dict with one key per output column.

    Fields that are missing or empty in the XML come back as None instead of
    raising, so a single incomplete record does not abort the whole
    extraction. A price that is present but not numeric still raises
    ValueError, because that is bad data rather than absent data.
    """
    price_text = _child_text(book, 'price')
    description = _child_text(book, 'description')
    return {
        'id': book.get('id'),
        'author': _child_text(book, 'author'),
        'title': _child_text(book, 'title'),
        'genre': _child_text(book, 'genre'),
        # None becomes NaN once the records are loaded into the DataFrame.
        'price': float(price_text) if price_text and price_text.strip() else None,
        'publish_date': _child_text(book, 'publish_date'),
        # Only the outer whitespace is trimmed; inner line breaks are kept.
        'description': description.strip() if description is not None else None,
    }


# Extract book data: one record per <book> element directly under the root
books = [_book_record(book) for book in root.findall('book')]

# Create a DataFrame
df1 = pd.DataFrame(books)

# Display the DataFrame
df1

Unnamed: 0,id,author,title,genre,price,publish_date,description
0,bk101,"Gambardella, Matthew",XML Developer's Guide,Computer,44.95,2000-10-01,An in-depth look at creating applications \n ...
1,bk102,"Ralls, Kim",Midnight Rain,Fantasy,5.95,2000-12-16,"A former architect battles corporate zombies, ..."
2,bk103,"Corets, Eva",Maeve Ascendant,Fantasy,5.95,2000-11-17,After the collapse of a nanotechnology \n ...
3,bk104,"Corets, Eva",Oberon's Legacy,Fantasy,5.95,2001-03-10,"In post-apocalypse England, the mysterious \n ..."
4,bk105,"Corets, Eva",The Sundered Grail,Fantasy,5.95,2001-09-10,"The two daughters of Maeve, half-sisters, \n ..."
5,bk106,"Randall, Cynthia",Lover Birds,Romance,4.95,2000-09-02,When Carla meets Paul at an ornithology \n ...
6,bk107,"Thurman, Paula",Splish Splash,Romance,4.95,2000-11-02,A deep sea diver finds true love twenty \n ...
7,bk108,"Knorr, Stefan",Creepy Crawlies,Horror,4.95,2000-12-06,"An anthology of horror stories about roaches,\..."
8,bk109,"Kress, Peter",Paradox Lost,Science Fiction,6.95,2000-11-02,After an inadvertant trip through a Heisenberg...
9,bk110,"O'Brien, Tim",Microsoft .NET: The Programming Bible,Computer,36.95,2000-12-09,Microsoft's .NET initiative is explored in \n ...


***Extracting Data From a JSON File***

In [3]:
import pandas as pd
import json

# Load the JSON file.
# The encoding is given explicitly because JSON text is UTF-8, while open()
# would otherwise fall back to the platform's default encoding and could
# mis-decode any non-ASCII characters on a machine with a different locale.
file_path = '/content/drive/My Drive/Data Source/countries-table.json'
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Convert the JSON data into a DataFrame
df2 = pd.DataFrame(data)
df2

Unnamed: 0,place,pop1980,pop2000,pop2010,pop2022,pop2023,pop2030,pop2050,country,area,landAreaKm,cca2,cca3,netChange,growthRate,worldPercentage,density,densityMi,rank
0,356,696828385.0,1.059634e+09,1.240614e+09,1.417173e+09,1.428628e+09,1.514994e+09,1.670491e+09,India,3287590.00,2973190.00,IN,IND,0.4184,0.0081,0.1785,480.5033,1244.5036,1
1,156,982372466.0,1.264099e+09,1.348191e+09,1.425887e+09,1.425671e+09,1.415606e+09,1.312636e+09,China,9706961.00,9424702.90,CN,CHN,-0.0113,-0.0002,0.1781,151.2696,391.7884,2
2,840,223140018.0,2.823986e+08,3.111828e+08,3.382899e+08,3.399966e+08,3.521623e+08,3.753920e+08,United States,9372610.00,9147420.00,US,USA,0.0581,0.0050,0.0425,37.1686,96.2666,3
3,360,148177096.0,2.140724e+08,2.440162e+08,2.755013e+08,2.775341e+08,2.921501e+08,3.172252e+08,Indonesia,1904569.00,1877519.00,ID,IDN,0.0727,0.0074,0.0347,147.8196,382.8528,4
4,586,80624057.0,1.543699e+08,1.944545e+08,2.358249e+08,2.404857e+08,2.740298e+08,3.678085e+08,Pakistan,881912.00,770880.00,PK,PAK,0.1495,0.0198,0.0300,311.9625,807.9829,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,500,11452.0,5.138000e+03,4.938000e+03,4.390000e+03,4.386000e+03,4.301000e+03,3.781000e+03,Montserrat,102.00,102.00,MS,MSR,,-0.0009,,43.0000,111.3700,230
230,238,2240.0,3.080000e+03,3.187000e+03,3.780000e+03,3.791000e+03,3.869000e+03,3.779000e+03,Falkland Islands,12173.00,12173.00,FK,FLK,,0.0029,,0.3114,0.8066,231
231,570,3637.0,2.074000e+03,1.812000e+03,1.934000e+03,1.935000e+03,1.948000e+03,2.096000e+03,Niue,261.00,261.00,NU,NIU,0.0000,0.0005,,7.4138,19.2017,232
232,772,1647.0,1.666000e+03,1.367000e+03,1.871000e+03,1.893000e+03,2.046000e+03,2.430000e+03,Tokelau,12.00,10.00,TK,TKL,,0.0118,,189.3000,490.2870,233


***Extracting Data From a YAML File***

In [14]:
import pandas as pd
import yaml

# Load the YAML file inside a context manager so the file handle is closed as
# soon as parsing finishes (passing a bare open() call to safe_load leaves it
# open). The encoding is explicit so decoding does not depend on the
# platform's default locale.
with open("/content/drive/MyDrive/Data Source/ucm_2025.yaml", encoding="utf-8") as yaml_file:
    # safe_load builds only plain Python objects from the document.
    df3 = pd.DataFrame(yaml.safe_load(yaml_file))
df3

Unnamed: 0,Minister,Party,Portfolio
0,Narendra Modi,BJP,Prime Minister
1,Narendra Modi,BJP,"Minister of Personnel, Public Grievances and P..."
2,Narendra Modi,BJP,Department of Atomic Energy
3,Narendra Modi,BJP,Department of Space
4,Rajnath Singh,BJP,Minister of Defence
5,Amit Shah,BJP,Minister of Home Affairs
6,Amit Shah,BJP,Minister of Co-operation
7,Nitin Gadkari,BJP,Minister of Road Transport and Highways
8,Jagat Prakash Nadda,BJP,Minister of Health and Family Welfare
9,Jagat Prakash Nadda,BJP,Minister of Chemicals and Fertilizers
