In [112]:
import pandas as pd
import numpy as np
import csv
import json
import re
import xml
from json_repair import repair_json
from io import StringIO

# CSV Data

In [113]:
df = pd.read_csv('Data.csv', header = None, skipinitialspace=True)
df

Unnamed: 0,0
0,"name,age,city,email"
1,"John Doe, 25, New York,john@example.com"
2,"Alice,,Los Angeles, alice@invalid"
3,"Bob Smith, forty, Chicago bob@smith.net"
4,"Charlie Brown, 30 , ""San Francisco"", charliebr..."
5,"Duplicate John Doe,25,New York,john@example.com"


In [114]:
df = df[0].str.split(',', expand = True)
df = df.drop(columns=[4])
df = df.drop(index=0)
df = df.rename(columns={0: 'name', 1: 'age', 2: 'city', 3: 'email',})
df

Unnamed: 0,name,age,city,email
1,John Doe,25,New York,john@example.com
2,Alice,,Los Angeles,alice@invalid
3,Bob Smith,forty,Chicago bob@smith.net,
4,Charlie Brown,30,"""San Francisco""",charliebrown@gmail.com
5,Duplicate John Doe,25,New York,john@example.com


In [115]:
df['city'] = df['city'].str.strip()
splits = df['city'].str.split(' ', n=1, expand=True)

mask = df['email'].isna() & splits[1].str.contains('@', na=False)
df.loc[mask, 'email'] = splits.loc[mask, 1]
df.loc[mask, 'city'] = splits.loc[mask, 0]

In [116]:
columns = ['name', 'age', 'city']
for col in columns:
    df[col] = df[col].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
    df[col] = df[col].str.strip()

df['email'] = df['email'].str.strip()
df

Unnamed: 0,name,age,city,email
1,John Doe,25,New York,john@example.com
2,Alice,,Los Angeles,alice@invalid
3,Bob Smith,forty,Chicago,bob@smith.net
4,Charlie Brown,30,San Francisco,charliebrown@gmail.com
5,Duplicate John Doe,25,New York,john@example.com


In [117]:
csv_json_array = df.to_dict(orient='records')
print(json.dumps(csv_json_array, indent=4))

[
    {
        "name": "John Doe",
        "age": "25",
        "city": "New York",
        "email": "john@example.com"
    },
    {
        "name": "Alice",
        "age": "",
        "city": "Los Angeles",
        "email": "alice@invalid"
    },
    {
        "name": "Bob Smith",
        "age": "forty",
        "city": "Chicago",
        "email": "bob@smith.net"
    },
    {
        "name": "Charlie Brown",
        "age": "30",
        "city": "San Francisco",
        "email": "charliebrown@gmail.com"
    },
    {
        "name": "Duplicate John Doe",
        "age": "25",
        "city": "New York",
        "email": "john@example.com"
    }
]


# JSON Data

In [118]:
pip install json-repair

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [119]:
with open('Data.json', 'r') as f:
    raw_data = f.read()
    fixed_json = repair_json(raw_data)
    data = json.loads(fixed_json)
    json_array = json.dumps(data, indent=4)
print(json_array)

[
    {
        "name": "Eve Green",
        "age": 28,
        "city": "Seattle",
        "email": "eve@green.org"
    },
    {
        "name": "Frank",
        "age": "35",
        "city": "Boston",
        "email": "frank@example.com"
    },
    {
        "duplicate": "Alice",
        "age": "N/A",
        "city": "Los Angeles",
        "email": "alice@invalid"
    }
]


In [120]:
for obj in data:
    if 'duplicate' in obj and 'name' not in obj:
        obj['name'] = obj.pop('duplicate')

    
json_array = json.dumps(data, indent=4)
print(json_array)

[
    {
        "name": "Eve Green",
        "age": 28,
        "city": "Seattle",
        "email": "eve@green.org"
    },
    {
        "name": "Frank",
        "age": "35",
        "city": "Boston",
        "email": "frank@example.com"
    },
    {
        "age": "N/A",
        "city": "Los Angeles",
        "email": "alice@invalid",
        "name": "Alice"
    }
]


In [121]:
json_df = pd.read_json(StringIO(json_array))
json_df

Unnamed: 0,name,age,city,email
0,Eve Green,28.0,Seattle,eve@green.org
1,Frank,35.0,Boston,frank@example.com
2,Alice,,Los Angeles,alice@invalid


# XML Data

In [122]:
with open('data.xml', 'r') as f:
    content = f.read() 

blocks = re.split(r'<\s*person', content, flags=re.IGNORECASE)

rows = []

for block in blocks:
    
    if not block.strip() or 'name' not in block.lower():
        continue
    
    # key="value"
    attrs = dict(re.findall(r'(\w+)="([^"]*)"', block))
    
    if attrs:
        rows.append(attrs)
    else:
        # search for text between > and <
        row = {}
        for field in ["name", "age", "city", "email"]:
            match = re.search(fr"<\s*{field}\s*>(.*?)<", block, re.DOTALL)
            row[field] = match.group(1).strip() if match else None
        rows.append(row)

xml_json_array = json.dumps(rows, indent=4)
print(xml_json_array)

[
    {
        "name": "Grace Hopper",
        "age": "85",
        "city": "Palo Alto",
        "email": "grace@hopper.edu"
    },
    {
        "name": "Henry",
        "age": "invalid",
        "city": "Miami",
        "email": "henry@.com"
    },
    {
        "name": "Ivy",
        "age": "22",
        "city": "Denver",
        "email": "ivy@example.com"
    }
]


In [123]:
xml_df = pd.read_json(StringIO(xml_json_array))
xml_df

Unnamed: 0,name,age,city,email
0,Grace Hopper,85,Palo Alto,grace@hopper.edu
1,Henry,invalid,Miami,henry@.com
2,Ivy,22,Denver,ivy@example.com


# Plain Text Data

In [124]:
txt_json_output = []

pattern = r'(name|age|city|email)=([^,]+)'

with open('Data.txt', 'r') as f:
    for line in f:
        matches = re.findall(pattern, line)
        
        if matches:
            row = {k.strip(): v.strip() for k, v in matches}
            txt_json_output.append(row)


txt_json_array = json.dumps(txt_json_output, indent=4)

print(txt_json_array)

[
    {
        "name": "John Doe",
        "age": "25",
        "city": "New York",
        "email": "john@example.com"
    }
]


In [125]:
txt_df = pd.read_json(StringIO(txt_json_array))
txt_df

Unnamed: 0,name,age,city,email
0,John Doe,25,New York,john@example.com


## Merging from all the different data sources

In [126]:
merged_df = pd.concat([df, json_df, xml_df, txt_df], ignore_index=True)

In [127]:
merged_df['age'] = pd.to_numeric(merged_df['age'], errors='coerce')
merged_df = merged_df.where(pd.notnull(merged_df), None)
merged_df

Unnamed: 0,name,age,city,email
0,John Doe,25.0,New York,john@example.com
1,Alice,,Los Angeles,alice@invalid
2,Bob Smith,,Chicago,bob@smith.net
3,Charlie Brown,30.0,San Francisco,charliebrown@gmail.com
4,Duplicate John Doe,25.0,New York,john@example.com
5,Eve Green,28.0,Seattle,eve@green.org
6,Frank,35.0,Boston,frank@example.com
7,Alice,,Los Angeles,alice@invalid
8,Grace Hopper,85.0,Palo Alto,grace@hopper.edu
9,Henry,,Miami,henry@.com


In [128]:
json_data = merged_df.to_dict(orient='records')
print(json.dumps(json_data, indent=4))

[
    {
        "name": "John Doe",
        "age": 25.0,
        "city": "New York",
        "email": "john@example.com"
    },
    {
        "name": "Alice",
        "age": NaN,
        "city": "Los Angeles",
        "email": "alice@invalid"
    },
    {
        "name": "Bob Smith",
        "age": NaN,
        "city": "Chicago",
        "email": "bob@smith.net"
    },
    {
        "name": "Charlie Brown",
        "age": 30.0,
        "city": "San Francisco",
        "email": "charliebrown@gmail.com"
    },
    {
        "name": "Duplicate John Doe",
        "age": 25.0,
        "city": "New York",
        "email": "john@example.com"
    },
    {
        "name": "Eve Green",
        "age": 28.0,
        "city": "Seattle",
        "email": "eve@green.org"
    },
    {
        "name": "Frank",
        "age": 35.0,
        "city": "Boston",
        "email": "frank@example.com"
    },
    {
        "name": "Alice",
        "age": NaN,
        "city": "Los Angeles",
        "email": "alice

## 3) update the age of the person living in denver by one

In [129]:
merged_df['age'] = np.where(merged_df['city'] == 'Denver', merged_df['age'] + 1, merged_df['age'])
merged_df

Unnamed: 0,name,age,city,email
0,John Doe,25.0,New York,john@example.com
1,Alice,,Los Angeles,alice@invalid
2,Bob Smith,,Chicago,bob@smith.net
3,Charlie Brown,30.0,San Francisco,charliebrown@gmail.com
4,Duplicate John Doe,25.0,New York,john@example.com
5,Eve Green,28.0,Seattle,eve@green.org
6,Frank,35.0,Boston,frank@example.com
7,Alice,,Los Angeles,alice@invalid
8,Grace Hopper,85.0,Palo Alto,grace@hopper.edu
9,Henry,,Miami,henry@.com


## 4) extract email domains (check if its a valid email too)


In [130]:
email_regex = r'^[a-zA-Z0-9._%+-]+@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})$'
 
merged_df['email_domain'] = merged_df['email'].str.extract(email_regex)
#merged_df['email_domain'] = merged_df['email_domain'].fillna('Invalid Domain')

## 5) generate new column "is_senior" and mark as true if the age of the person is greater than 60 else false.

In [131]:
def is_senior(a):
    return (a > 60)
        

merged_df['is_senior'] = [is_senior(age) for age in merged_df['age']]
merged_df

Unnamed: 0,name,age,city,email,email_domain,is_senior
0,John Doe,25.0,New York,john@example.com,example.com,False
1,Alice,,Los Angeles,alice@invalid,,False
2,Bob Smith,,Chicago,bob@smith.net,smith.net,False
3,Charlie Brown,30.0,San Francisco,charliebrown@gmail.com,gmail.com,False
4,Duplicate John Doe,25.0,New York,john@example.com,example.com,False
5,Eve Green,28.0,Seattle,eve@green.org,green.org,False
6,Frank,35.0,Boston,frank@example.com,example.com,False
7,Alice,,Los Angeles,alice@invalid,,False
8,Grace Hopper,85.0,Palo Alto,grace@hopper.edu,hopper.edu,True
9,Henry,,Miami,henry@.com,,False


## 6) display final result as dataframe with columns (name, age, email_domain,is_senior)


In [132]:
output = merged_df[['name', 'age', 'city', 'email_domain', 'is_senior']]
output

Unnamed: 0,name,age,city,email_domain,is_senior
0,John Doe,25.0,New York,example.com,False
1,Alice,,Los Angeles,,False
2,Bob Smith,,Chicago,smith.net,False
3,Charlie Brown,30.0,San Francisco,gmail.com,False
4,Duplicate John Doe,25.0,New York,example.com,False
5,Eve Green,28.0,Seattle,green.org,False
6,Frank,35.0,Boston,example.com,False
7,Alice,,Los Angeles,,False
8,Grace Hopper,85.0,Palo Alto,hopper.edu,True
9,Henry,,Miami,,False
