# IO tools
https://pandas.pydata.org/docs/user_guide/io.html#

In [115]:
import pandas as pd
import numpy as np

### CSV

In [39]:
pd.read_csv("../datasets/small.csv").head()

Unnamed: 0,athlete_id,name,sex
0,65649.0,Ivanka Bonova,Female
1,112510.0,Nataliya Uryadova,Female
2,114973.0,Essa Ismail Rashed,Male
3,,,
4,30359.0,Péter Boros,Male


### Parameters for read_csv():

#### List of column names to use.

In [40]:
pd.read_csv("../datasets/small_no_headers.csv", names=["id", "name", "m/f"]).head()

Unnamed: 0,id,name,m/f
0,65649.0,Ivanka Bonova,Female
1,112510.0,Nataliya Uryadova,Female
2,114973.0,Essa Ismail Rashed,Male
3,,,
4,30359.0,Péter Boros,Male


#### Skip the first row and use new column names.

In [41]:
pd.read_csv("../datasets/small.csv", names=["id", "name", "m/f"], skiprows=1).head()

Unnamed: 0,id,name,m/f
0,65649.0,Ivanka Bonova,Female
1,112510.0,Nataliya Uryadova,Female
2,114973.0,Essa Ismail Rashed,Male
3,,,
4,30359.0,Péter Boros,Male


#### The delimiter to use. Default is ','.

In [42]:
pd.read_csv("../datasets/small.csv", sep=",").head()

Unnamed: 0,athlete_id,name,sex
0,65649.0,Ivanka Bonova,Female
1,112510.0,Nataliya Uryadova,Female
2,114973.0,Essa Ismail Rashed,Male
3,,,
4,30359.0,Péter Boros,Male


#### Column(s) to set as the index.

In [38]:
pd.read_csv("../datasets/small.csv", index_col=0)

Unnamed: 0_level_0,name,sex
athlete_id,Unnamed: 1_level_1,Unnamed: 2_level_1
65649.0,Ivanka Bonova,Female
112510.0,Nataliya Uryadova,Female
114973.0,Essa Ismail Rashed,Male
,,
30359.0,Péter Boros,Male
146111.0,Svetlana Kholomina,Female


#### Return a subset of the columns.

In [37]:
pd.read_csv("../datasets/small.csv", usecols=["name"], index_col=0)

Ivanka Bonova
Nataliya Uryadova
Essa Ismail Rashed
""
Péter Boros
Svetlana Kholomina


#### Additional strings to recognize as NA/NaN.

In [53]:
# pandas will treat any occurrence of 'NA', 'null' or 'N/A' in the CSV file as a missing value and convert it to NaN.
pd.read_csv("../datasets/small.csv", na_values=["NA", "N/A", "null"])

Unnamed: 0,athlete_id,name,sex
0,65649.0,Ivanka Bonova,Female
1,112510.0,Nataliya Uryadova,Female
2,114973.0,Essa Ismail Rashed,Male
3,,,
4,30359.0,Péter Boros,Male
5,146111.0,Svetlana Kholomina,Female


#### Data type for data or columns. E.g., {'col1': int, 'col2': float}.

In [51]:
pd.read_csv("../datasets/small_clean.csv", dtype={"athlete_id": int, "name": str})

Unnamed: 0,athlete_id,name,sex
0,65649,Ivanka Bonova,Female
1,112510,Nataliya Uryadova,Female
2,114973,Essa Ismail Rashed,Male
3,30359,Péter Boros,Male
4,146111,Svetlana Kholomina,Female


#### Number of rows to read from the file.

In [52]:
pd.read_csv("../datasets/small_clean.csv", nrows=2)

Unnamed: 0,athlete_id,name,sex
0,65649,Ivanka Bonova,Female
1,112510,Nataliya Uryadova,Female


#### Return a TextFileReader object for iteration.

1. Define process Function: The process function takes a chunk of data as input and prints it along with its data types.
2. Read CSV in Chunks: The pd.read_csv function reads the CSV file in chunks of 1000 rows.
3. Iterate and Process: The for loop iterates over each chunk and calls the process function.

In [68]:
# Define the process function
def process(chunk):
    print(chunk)
    print(chunk.dtypes)

# Return a TextFileReader object for iteration
chunk_iter = pd.read_csv("../datasets/small_clean.csv", chunksize=2)

# Iterate over each chunk and process it
for chunk in chunk_iter:
    process(chunk)

   athlete_id               name     sex        date
0       65649      Ivanka Bonova  Female  2023-04-03
1      112510  Nataliya Uryadova  Female  2022-01-02
athlete_id     int64
name          object
sex           object
date          object
dtype: object
   athlete_id                name   sex        date
2      114973  Essa Ismail Rashed  Male  2024-01-01
3       30359         Péter Boros  Male  2023-01-02
athlete_id     int64
name          object
sex           object
date          object
dtype: object
   athlete_id                name     sex        date
4      146111  Svetlana Kholomina  Female  2023-05-01
athlete_id     int64
name          object
sex           object
date          object
dtype: object


You can customize the process function to perform any operation you need, such as data cleaning, transformation, or saving to a database.
In this example, the process function filters rows where the athlete_id column is greater than 100 and prints the filtered chunk.

In [69]:
# Define the process function
def process(chunk):
    # Example: Filter rows where 'value' column is greater than 100
    filtered_chunk = chunk[chunk["athlete_id"] > 100]
    print(filtered_chunk)


# Return a TextFileReader object for iteration
chunk_iter = pd.read_csv("../datasets/small_clean.csv", chunksize=2)

# Iterate over each chunk and process it
for chunk in chunk_iter:
    process(chunk)

   athlete_id               name     sex        date
1         120  Nataliya Uryadova  Female  2022-01-02
   athlete_id         name   sex        date
3         500  Péter Boros  Male  2023-01-02
   athlete_id                name     sex        date
4         102  Svetlana Kholomina  Female  2023-05-01


#### Read the CSV file and parse the 'date' column as dates.

In [60]:
pd.read_csv("../datasets/small_clean.csv", parse_dates=["date"])

Unnamed: 0,athlete_id,name,sex,date
0,65649,Ivanka Bonova,Female,2023-04-03
1,112510,Nataliya Uryadova,Female,2022-01-02
2,114973,Essa Ismail Rashed,Male,2024-01-01
3,30359,Péter Boros,Male,2023-01-02
4,146111,Svetlana Kholomina,Female,2023-05-01


#### Parsing Dates with Custom Format

In [64]:
from datetime import datetime

# Custom date parser function
date_parser = lambda x: datetime.strptime(x, "%Y-%m-%d")

# Read the CSV file with custom date parser
df = pd.read_csv("../datasets/small_clean.csv", parse_dates=["date"], date_parser=date_parser)

print(df)
print(df.dtypes)

   athlete_id                name     sex       date
0       65649       Ivanka Bonova  Female 2023-04-03
1      112510   Nataliya Uryadova  Female 2022-01-02
2      114973  Essa Ismail Rashed    Male 2024-01-01
3       30359         Péter Boros    Male 2023-01-02
4      146111  Svetlana Kholomina  Female 2023-05-01
athlete_id             int64
name                  object
sex                   object
date          datetime64[ns]
dtype: object


  df = pd.read_csv("../datasets/small_clean.csv", parse_dates=["date"], date_parser=date_parser)


---

### TXT

In [72]:
pd.read_table("../datasets/small.txt", sep=",")

Unnamed: 0,athlete_id,name,sex,date
0,50,Ivanka Bonova,Female,2023-04-03
1,120,Nataliya Uryadova,Female,2022-01-02
2,70,Essa Ismail Rashed,Male,2024-01-01
3,500,Péter Boros,Male,2023-01-02
4,102,Svetlana Kholomina,Female,2023-05-01


---

### JSON

In [89]:
# Reading from a Json string.
from io import StringIO

#### Read from Json.

In [81]:
# Reading from a Json file.
pd.read_json("../datasets/small.json")

Unnamed: 0,athlete_id,name,sex,date
0,50,Ivanka Bonova,Female,2023-04-03
1,120,Nataliya Uryadova,Female,2022-01-02
2,70,Essa Ismail Rashed,Male,2024-01-01
3,500,Péter Boros,Male,2023-01-02
4,102,Svetlana Kholomina,Female,2023-05-01


In [90]:
json_str = '''
[
    {
        "athlete_id": 50,
        "name": "Ivanka Bonova",
        "sex": "Female",
        "date": "2023-04-03"
    },
    {
        "athlete_id": 120,
        "name": "Nataliya Uryadova",
        "sex": "Female",
        "date": "2022-01-02"
    }
]
'''

# Use StringIO to create a file-like object from the string
json_data = StringIO(json_str)

# Read the JSON data into a pandas DataFrame
df = pd.read_json(json_data)

print(df)

   athlete_id               name     sex       date
0          50      Ivanka Bonova  Female 2023-04-03
1         120  Nataliya Uryadova  Female 2022-01-02


#### json_normalize()

The `pd.json_normalize()` function is used **to flatten nested JSON data into a pandas DataFrame**. To normalize a JSON structure that includes nested lists and dictionaries.

1. `data`: This is the input JSON data, which is a list of dictionaries. Each dictionary represents a state with nested information about counties and other attributes.

2. `"county"`: The function iterates over each item in the county list, creating a row for each county.

3. `["state", "shortname", ["info", "governor"]]`: These fields are added to each row, pulling their values from the parent dictionary.

In [84]:
data = [
    {
        "state": "Florida",
        "shortname": "FL",
        "info": {"governor": "Rick Scott"},
        "county": [
            {"name": "Dade", "population": 12345},
            {"name": "Broward", "population": 40000},
            {"name": "Palm Beach", "population": 60000},
        ],
    },
    {
        "state": "Ohio",
        "shortname": "OH",
        "info": {"governor": "John Kasich"},
        "county": [
            {"name": "Summit", "population": 1234},
            {"name": "Cuyahoga", "population": 1337},
        ],
    },
]

df = pd.json_normalize(data, "county", ["state", "shortname", ["info", "governor"]])
print(df)

         name  population    state shortname info.governor
0        Dade       12345  Florida        FL    Rick Scott
1     Broward       40000  Florida        FL    Rick Scott
2  Palm Beach       60000  Florida        FL    Rick Scott
3      Summit        1234     Ohio        OH   John Kasich
4    Cuyahoga        1337     Ohio        OH   John Kasich


The `pd.json_normalize()` function is used to flatten this nested structure. 

The `max_level=1` parameter specifies that the normalization should only go **one level deep**. This means that only the first level of nested dictionaries will be flattened.

In [85]:
data = [
    {
        "CreatedBy": {"Name": "User001"},
        "Lookup": {
            "TextField": "Some text",
            "UserField": {"Id": "ID001", "Name": "Name001"},
        },
        "Image": {"a": "b"},
    }
]

df = pd.json_normalize(data, max_level=1)
print(df)

  CreatedBy.Name Lookup.TextField                    Lookup.UserField Image.a
0        User001        Some text  {'Id': 'ID001', 'Name': 'Name001'}       b


In [88]:
data = [
    {
        "CreatedBy": {"Name": "User001"},
        "Lookup": {
            "TextField": "Some text",
            "UserField": {"Id": "ID001", "Name": "Name001"},
        },
        "Image": {"a": "b"},
    }
]

df = pd.json_normalize(data, max_level=2)
print(df)

  CreatedBy.Name Lookup.TextField Lookup.UserField.Id Lookup.UserField.Name  \
0        User001        Some text               ID001               Name001   

  Image.a  
0       b  


#### Line-Delimited JSON (JSONL)

Line-delimited JSON files contain one JSON object per line. This format is useful for processing large datasets or streaming data.

You can read JSONL files/strings using `pd.read_json` with the `lines=True` parameter:

In [93]:
jsonl = """
{"a": 1, "b": 2}
{"a": 3, "b": 4}
"""

df = pd.read_json(StringIO(jsonl), lines=True)
print(df)

   a  b
0  1  2
1  3  4


You can write a DataFrame to a JSONL file using `df.to_json` with `orient="records"` and `lines=True`:

In [94]:
df.to_json(orient="records", lines=True)

'{"a":1,"b":2}\n{"a":3,"b":4}\n'

Reading JSONL in chunks

For large files, you can read JSONL in chunks using the `chunksize` parameter:

In [95]:
with pd.read_json(StringIO(jsonl), lines=True, chunksize=1) as reader:
    for chunk in reader:
        print(chunk)

Empty DataFrame
Columns: []
Index: []
   a  b
0  1  2
   a  b
1  3  4


#### Write to Json.

`orient='records'`: the DataFrame is converted to a list of dictionaries, where each dictionary represents a row in the DataFrame.

`indent=4`: When set to 4, each level of the JSON structure is indented by 4 spaces.

In [96]:
# Hardcoded data similar to small.txt
data = {
    "athlete_id": [50, 120, 70],
    "name": [
        "Ivanka Bonova",
        "Nataliya Uryadova",
        "Essa Ismail Rashed",
    ],
    "sex": ["Female", "Female", "Male"],
    "date": ["2023-04-03", "2022-01-02", "2024-01-01"],
}

# Create a DataFrame
df = pd.DataFrame(data)

# Convert the DataFrame to a JSON string
json_str = df.to_json(orient="records", indent=4)

# Print the JSON string
print(json_str)

[
    {
        "athlete_id":50,
        "name":"Ivanka Bonova",
        "sex":"Female",
        "date":"2023-04-03"
    },
    {
        "athlete_id":120,
        "name":"Nataliya Uryadova",
        "sex":"Female",
        "date":"2022-01-02"
    },
    {
        "athlete_id":70,
        "name":"Essa Ismail Rashed",
        "sex":"Male",
        "date":"2024-01-01"
    }
]


#### Table Schema

Table Schema is a specification for describing tabular data as JSON. It includes metadata like field names, types, and other attributes.

Creating a DataFrame and converting to JSON with schema:

In [97]:
df = pd.DataFrame(
    {
        "A": [1, 2, 3],
        "B": ["a", "b", "c"],
        "C": ["2016-01-01", "2016-01-02", "2016-01-03"],
    },
    index=pd.Index([0, 1, 2], name="idx"), # specifies the index (or row labels) for the DataFrame.
)

json_str = df.to_json(orient="table", date_format="iso")
print(json_str)

{"schema":{"fields":[{"name":"idx","type":"integer"},{"name":"A","type":"integer"},{"name":"B","type":"string"},{"name":"C","type":"datetime"}],"primaryKey":["idx"],"pandas_version":"1.4.0"},"data":[{"idx":0,"A":1,"B":"a","C":"2016-01-01T00:00:00.000"},{"idx":1,"A":2,"B":"b","C":"2016-01-02T00:00:00.000"},{"idx":2,"A":3,"B":"c","C":"2016-01-03T00:00:00.000"}]}


Reading the JSON string back into a DataFrame:

In [98]:
new_df = pd.read_json(json_str, orient="table")
print(new_df)

     A  B          C
idx                 
0    1  a 2016-01-01
1    2  b 2016-01-02
2    3  c 2016-01-03


  new_df = pd.read_json(json_str, orient="table")


---

### EXCEL

In [75]:
# Reads the first sheet of the Excel file
pd.read_excel("../datasets/small.xlsx")

Unnamed: 0,athlete_id,name,sex,date
0,50,jax,Male,2023-04-03
1,10,Stef,Female,2021-05-03
2,110,Steffi,Male,2023-06-02
3,600,Yuna,Female,2024-04-03
4,1,Rob,Male,2022-04-03


In [76]:
# Specify the sheet name
pd.read_excel("../datasets/small.xlsx", sheet_name="Sheet2")

Unnamed: 0,athlete_id,name,sex,date
0,550,jax,Male,2025-04-03
1,110,Stef,Female,2021-05-03
2,10,Steffi,Male,2023-06-02
3,60,Yuna,Female,2024-04-03
4,51,Rob,Male,2022-04-03


---

### HTML

#### To parse HTML tables into pandas DataFrames.


1. **HTML Parsing**:
`pd.read_html()` uses an HTML parser (like `lxml` or `html5lib`) to parse the HTML content.
It scans the HTML for `<table>` tags, which denote the start of an HTML table.

2. **Table Extraction**:
Once it identifies a `<table>` tag, it reads the content within the table, including rows (`<tr>`) and cells (`<td>` or `<th>`).
It converts this content into a pandas DataFrame.

3. **Returning DataFrames**:
The function returns a list of DataFrames, with each DataFrame corresponding to a table found in the HTML content.

**Passing headers in an HTTP request can be necessary for several reasons:**

**User-Agent**: Some websites block requests that don't come from a recognized browser. By setting the User-Agent header, you can mimic a request from a browser, which can help avoid being blocked.

**Authentication**: If the resource is behind a paywall or requires authentication, you might need to pass headers like Authorization to provide a token or credentials.

**Custom Headers**: Some APIs or web services require custom headers to be set for the request to be processed correctly.

**Rate Limiting**: Some services use headers to manage rate limiting or to track usage.

In [103]:
# Basic usage
url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list"
dfs = pd.read_html(url)

# Passing headers
headers = {
    "User-Agent": "Mozilla Firefox v14.0",
    "Accept": "application/json",
    "Connection": "keep-alive",
    "Auth": "Bearer 2*/f3+fe68df*4",
}
dfs_with_headers = pd.read_html(url, storage_options=headers)
dfs_with_headers[0].head()

Unnamed: 0,Bank NameBank,CityCity,StateSt,CertCert,Acquiring InstitutionAI,Closing DateClosing,FundFund
0,Republic First Bank dba Republic Bank,Philadelphia,PA,27332,"Fulton Bank, National Association","April 26, 2024",10546
1,Citizens Bank,Sac City,IA,8758,Iowa Trust & Savings Bank,"November 3, 2023",10545
2,Heartland Tri-State Bank,Elkhart,KS,25851,"Dream First Bank, N.A.","July 28, 2023",10544
3,First Republic Bank,San Francisco,CA,59017,"JPMorgan Chase Bank, N.A.","May 1, 2023",10543
4,Signature Bank,New York,NY,57053,"Flagstar Bank, N.A.","March 12, 2023",10540


#### Reading HTML content from a string.

In [105]:
html_str = """
<table>
    <tr>
        <th>A</th>
        <th>B</th>
        <th>C</th>
    </tr>
    <tr>
        <td>a</td>
        <td>b</td>
        <td>c</td>
    </tr>
</table>
"""
dfs_from_string = pd.read_html(StringIO(html_str))
dfs_from_string

[   A  B  C
 0  a  b  c]

#### Extracting links.

The `extract_links="all"` parameter in `pd.read_html()` is used to extract both the display text and the hyperlink from HTML anchor tags (`<a>`). This is useful when you want to capture the URL along with the text displayed in the HTML table.

```
df_with_links[df_with_links.columns[0]] = df_with_links[df_with_links.columns[0]].apply(lambda x: x[1])
```

- `df_with_links[df_with_links.columns[0]]` accesses the first column of the DataFrame.

- `.apply(lambda x: x[1]):`

The `apply` method is used to apply a function along the axis of the DataFrame.

`lambda x: x[1]` is an anonymous function (lambda function) that takes an input x and returns the second element (x[1]).

Else the output would look like this:

GitHub
                                         
0  (pandas, https://github.com/pandas-dev/pandas)


**The line of code extracts the URLs from the tuples in the first column of the DataFrame and replaces the tuples with just the URLs.**

In [114]:
html_table = """
<table>
  <tr>
    <th>GitHub</th>
  </tr>
  <tr>
    <td><a href="https://github.com/pandas-dev/pandas">pandas</a></td>
  </tr>
</table>
"""
# Read the HTML table and extract links
df_with_links = pd.read_html(StringIO(html_table), extract_links="all")[0]

# Flatten the MultiIndex columns else there would be 'none' in the column names next to 'GitHub'
df_with_links.columns = ["GitHub"]

# Extract the links from the tuples
df_with_links[df_with_links.columns[0]] = df_with_links[df_with_links.columns[0]].apply(lambda x: x[1])

df_with_links

Unnamed: 0,GitHub
0,https://github.com/pandas-dev/pandas


#### Writing to HTML files

The `DataFrame.to_html` method in pandas allows you to render the contents of a DataFrame as an HTML table.

The `display(HTML(html))` function call is used to render HTML content within a Jupyter Notebook or other IPython environments.

- `HTML(html)` converts the HTML string into an IPython display object.

- `display(HTML(html))` then renders this HTML content in the notebook's output cell.

In [121]:
from IPython.display import display, HTML

In [131]:
# Basic Usage:
df = pd.DataFrame(np.random.randn(2, 2))

html = df.to_html()
display(HTML(html))

Unnamed: 0,0,1
0,-1.305164,-0.229996
1,0.611055,-1.376548


In [132]:
# Limit Columns:
df = pd.DataFrame(np.random.randn(2, 2))

html = df.to_html(columns=[0])
display(HTML(html))

Unnamed: 0,0
0,0.900499
1,1.052825


In [133]:
# Float Format:
df = pd.DataFrame(np.random.randn(2, 2))

html = df.to_html(float_format="{0:.10f}".format)
display(HTML(html))

Unnamed: 0,0,1
0,1.1134106576,-0.285630825
1,-0.737060445,0.4725784958


In [134]:
# Bold Rows:
df = pd.DataFrame(np.random.randn(2, 2))

html = df.to_html(bold_rows=False)
display(HTML(html))

Unnamed: 0,0,1
0,1.997239,-0.898902
1,1.06334,0.471089


In [135]:
# CSS Classes:
df = pd.DataFrame(np.random.randn(2, 2))

html = df.to_html(classes=["awesome_table_class", "even_more_awesome_class"])
display(HTML(html))

Unnamed: 0,0,1
0,-0.296448,0.324469
1,-1.159522,-0.996556


In [138]:
# Render Links:
url_df = pd.DataFrame(
    {
        "name": ["Python", "pandas"],
        "url": ["https://www.python.org/", "https://pandas.pydata.org"],
    }
)
html = url_df.to_html(render_links=True)
display(HTML(html))

# Print the raw HTML code
print(html)

Unnamed: 0,name,url
0,Python,https://www.python.org/
1,pandas,https://pandas.pydata.org


<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>name</th>
      <th>url</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>Python</td>
      <td><a href="https://www.python.org/" target="_blank">https://www.python.org/</a></td>
    </tr>
    <tr>
      <th>1</th>
      <td>pandas</td>
      <td><a href="https://pandas.pydata.org" target="_blank">https://pandas.pydata.org</a></td>
    </tr>
  </tbody>
</table>


In [139]:
# Escape Characters:
df = pd.DataFrame({"a": list("&<>"), "b": np.random.randn(3)})

# Escaped
html_escaped = df.to_html()
display(HTML(html_escaped))

# Not Escaped
html_not_escaped = df.to_html(escape=False)
display(HTML(html_not_escaped))

# Print the raw HTML code
print(html_escaped)
print(html_not_escaped)

Unnamed: 0,a,b
0,&,1.23277
1,<,-0.635829
2,>,-0.747877


Unnamed: 0,a,b
0,&,1.23277
1,<,-0.635829
2,>,-0.747877


<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>a</th>
      <th>b</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>&amp;</td>
      <td>1.232770</td>
    </tr>
    <tr>
      <th>1</th>
      <td>&lt;</td>
      <td>-0.635829</td>
    </tr>
    <tr>
      <th>2</th>
      <td>&gt;</td>
      <td>-0.747877</td>
    </tr>
  </tbody>
</table>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>a</th>
      <th>b</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>&</td>
      <td>1.232770</td>
    </tr>
    <tr>
      <th>1</th>
      <td><</td>
      <td>-0.635829</td>
    </tr>
    <tr>
      <th>2</th>
      <td>></td>
      <td>-0.747877</td>
    </tr>
  </tbody>
</table>


---

### XML

#### Reading XML

To read XML data into a pandas DataFrame, you can use the `read_xml()` function.

This function can handle XML strings, files, and URLs.

In [141]:
# Reading an XML String:
from io import StringIO

xml_data = """
<data>
    <row>
        <id>1</id>
        <name>John Doe</name>
    </row>
    <row>
        <id>2</id>
        <name>Jane Doe</name>
    </row>
</data>
"""

# Wrap the XML string in a StringIO object
xml_io = StringIO(xml_data)

# Read the XML data
df = pd.read_xml(xml_io)

print(df)

   id      name
0   1  John Doe
1   2  Jane Doe


In [143]:
# Reading an XML File:
df = pd.read_xml("../datasets/user.xml")
print(df)

   id       name
0   1  Jason Doe
1   2   Jane Doe


In [146]:
#Reading an XML from a URL
'''df = pd.read_xml("https://example.com/data.xml")
print(df)'''

'df = pd.read_xml("https://example.com/data.xml")\nprint(df)'

#### Writing XML

In [152]:
# Converts a DataFrame to an XML string.
geom_df = pd.DataFrame(
    {
        "shape": ["square", "circle", "triangle"],
        "degrees": [360, 360, 180],
        "sides": [4, np.nan, 3],
    }
)
print(geom_df.to_xml())

<?xml version='1.0' encoding='utf-8'?>
<data>
  <row>
    <index>0</index>
    <shape>square</shape>
    <degrees>360</degrees>
    <sides>4.0</sides>
  </row>
  <row>
    <index>1</index>
    <shape>circle</shape>
    <degrees>360</degrees>
    <sides/>
  </row>
  <row>
    <index>2</index>
    <shape>triangle</shape>
    <degrees>180</degrees>
    <sides>3.0</sides>
  </row>
</data>
