In [1]:
import pandas as pd
import numpy as np

# Sample scraped data. It is deliberately messy: missing entries (None), ages
# stored as text (including an empty string), and an email without a
# top-level domain.
data = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', None, 'David', 'Eve'],
    'age': ['25', '30', None, '45', '40', ''],
    'date_joined': ['2023-01-01', '2022-12-31', '2023-01-15', None, '2023-01-20', '2023-01-25'],
    'email': ['alice@example.com', 'bob@example', 'charlie@example.com', None, 'david@example.com', 'eve@example.com']
})

print("Original Data:")
print(data)

# Step 1: Handle missing values
# Results are assigned back to the column rather than using
# `data[col].fillna(..., inplace=True)`: the in-place form acts on an
# intermediate object, is deprecated in pandas 2.x, and does not update the
# DataFrame once copy-on-write is enabled.
data['name'] = data['name'].fillna('Unknown')

# Convert age to numeric and fill missing values with the median age.
# errors='coerce' turns unparseable entries (None, '') into NaN so they are
# picked up by fillna; median() ignores NaN by default.
numeric_age = pd.to_numeric(data['age'], errors='coerce')
data['age'] = numeric_age.fillna(numeric_age.median())

# Forward fill missing dates (each gap takes the previous row's value).
# Series.ffill() replaces the deprecated fillna(method='ffill').
data['date_joined'] = data['date_joined'].ffill()

# Handle invalid email formats.
# NOTE: this only checks that an '@' is present, so missing values are
# replaced with the placeholder but an address such as 'bob@example'
# (no top-level domain) is kept unchanged.
data['email'] = data['email'].apply(lambda x: x if '@' in str(x) else 'unknown@example.com')

print("\nData after handling missing values and basic validation:")
print(data)

# Step 2: Standardize formats
# Parse to datetime and render back as ISO 'YYYY-MM-DD' strings so the
# exported files hold a uniform textual date.
data['date_joined'] = pd.to_datetime(data['date_joined'])
data['date_joined'] = data['date_joined'].dt.strftime('%Y-%m-%d')

print("\nData after standardizing formats:")
print(data)

# Step 3: Save to CSV and JSON
data.to_csv('cleaned_data.csv', index=False)
print("\nData saved to cleaned_data.csv")

# lines=True writes one JSON object per row (JSON Lines), not a single array.
data.to_json('cleaned_data.json', orient='records', lines=True)
print("\nData saved to cleaned_data.json")


Original Data:
      name   age date_joined                email
0    Alice    25  2023-01-01    alice@example.com
1      Bob    30  2022-12-31          bob@example
2  Charlie  None  2023-01-15  charlie@example.com
3     None    45        None                 None
4    David    40  2023-01-20    david@example.com
5      Eve        2023-01-25      eve@example.com

Data after handling missing values and basic validation:
      name   age date_joined                email
0    Alice  25.0  2023-01-01    alice@example.com
1      Bob  30.0  2022-12-31          bob@example
2  Charlie  35.0  2023-01-15  charlie@example.com
3  Unknown  45.0  2023-01-15  unknown@example.com
4    David  40.0  2023-01-20    david@example.com
5      Eve  35.0  2023-01-25      eve@example.com

Data after standardizing formats:
      name   age date_joined                email
0    Alice  25.0  2023-01-01    alice@example.com
1      Bob  30.0  2022-12-31          bob@example
2  Charlie  35.0  2023-01-15  charlie@exam