### Handling Unstructured Data with Python
**Description**: Extract structured data from unstructured text using Python.

**Steps**:
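1. Load an unstructured text document and print its length and a short preview.
2. Extract information (emails, phone numbers, dates, URLs, monetary amounts, names, SSNs, addresses, and IDs) using regular expressions.
3. Save the extracted values to a CSV file.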

In [None]:

import re
import pandas as pd
import os
from collections import defaultdict
def extract_from_text(file_path):
    """Read a UTF-8 text file, print a short summary, and extract its fields.

    Args:
        file_path: Path of the text document to read.

    Returns:
        Whatever ``extract_structured_data`` returns for the file contents.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        contents = source.read()

    # Show the size and the leading 200 characters so the user can sanity
    # check which document is being processed.
    char_count = len(contents)
    preview = contents[:200]
    print(f"Document length: {char_count} characters")
    print(f"First 200 characters of document:\n{preview}...")

    return extract_structured_data(contents)
def extract_structured_data(text):
    """Pull common structured fields out of free-form text with regexes.

    Args:
        text: The raw document contents to scan.

    Returns:
        A ``defaultdict(list)`` mapping each category name (``emails``,
        ``phones``, ``dates``, ``urls``, ``money_amounts``, ``names``,
        ``ssns``, ``addresses``, ``ids``) to the list of matched strings in
        order of appearance. A category with no matches maps to ``[]``.
    """
    month_name = r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*'

    # Insertion order matters: it becomes the column order of the CSV export.
    patterns = {
        # The domain is a sequence of dot-separated labels, so a
        # sentence-ending period right after the address is not captured.
        'emails': r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+',
        # (?<!\w) instead of a leading \b: \b can never match in front of
        # "(" or "+", which silently dropped the opening parenthesis and the
        # country code from the match.
        'phones': r'(?<!\w)(?:\+\d{1,2}\s?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b',
        # Both alternatives sit inside one group so the word boundaries
        # apply to numeric and month-name dates alike.
        'dates': (
            r'\b(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|'
            + month_name
            + r'\s\d{1,2},?\s\d{2,4})\b'
        ),
        # Host, then optional port and optional path/query/fragment.
        'urls': (
            r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
            r'(?::\d+)?(?:[/?#][^\s<>"\']*)?'
        ),
        # Allows thousands separators, e.g. "$1,234.56".
        'money_amounts': r'\$\s?\d+(?:,\d{3})*(?:\.\d{2})?',
        # Title followed by two capitalised words.
        'names': r'(?:Mr\.|Ms\.|Mrs\.|Dr\.|Prof\.)\s[A-Z][a-z]+\s[A-Z][a-z]+',
        'ssns': r'\b\d{3}-\d{2}-\d{4}\b',
        # "<number> <street>, <city>, <two-letter state> <5-digit zip>".
        'addresses': r'\d+\s[A-Za-z\s]+,\s[A-Za-z\s]+,\s[A-Z]{2}\s\d{5}',
        # Single capture group: findall returns only the identifier itself.
        'ids': r'ID:\s*(\w+)',
    }

    results = defaultdict(list)
    for category, pattern in patterns.items():
        results[category] = re.findall(pattern, text)

    # A URL at the end of a sentence picks up the closing punctuation; trim
    # it. Surrounding brackets are not handled here.
    results['urls'] = [url.rstrip('.,;:!?') for url in results['urls']]
    return results
def save_to_structured_format(extracted_data, output_file='extracted_data.csv'):
    """Write the extracted fields to a CSV file, one column per category.

    Row ``i`` holds the ``i``-th match of every category; categories with
    fewer matches than the longest one are padded with ``None`` (written as
    empty cells).

    Args:
        extracted_data: Mapping of category name to a list of matched values.
        output_file: Destination path of the CSV file.

    Returns:
        The ``pandas.DataFrame`` that was written to ``output_file``.
    """
    max_length = max(
        (len(values) for values in extracted_data.values()), default=0
    )

    if max_length == 0:
        # With no matches at all there are no rows to infer columns from;
        # build the frame from the category names so the CSV still gets a
        # header row instead of being written without one.
        df = pd.DataFrame(columns=list(extracted_data))
    else:
        rows = {
            i: {
                key: (values[i] if i < len(values) else None)
                for key, values in extracted_data.items()
            }
            for i in range(max_length)
        }
        df = pd.DataFrame.from_dict(rows, orient='index')

    df.to_csv(output_file, index=False)
    print(f"Structured data saved to {output_file}")
    return df
def visualize_extraction_results(extracted_data):
    """Print a per-category summary of the extracted values.

    For each category the heading shows the total number of matches, followed
    by at most the first ten values (numbered from 1) and, when more exist,
    a line stating how many were left out.

    Args:
        extracted_data: Mapping of category name to a list of matched values.
    """
    preview_limit = 10
    for category, matches in extracted_data.items():
        total = len(matches)
        print(f"\n{category.upper()} ({total} found):")

        position = 0
        for match in matches[:preview_limit]:
            position += 1
            print(f"  {position}. {match}")

        hidden = total - preview_limit
        if hidden > 0:
            print(f"  ... and {hidden} more")
def main(file_path):
    """Run the extraction pipeline on one text file.

    Extracts the fields, prints a summary, writes them to the default CSV
    file of ``save_to_structured_format`` and prints the head of the table.

    Args:
        file_path: Path of the text document to process.

    Returns:
        ``(extracted_data, df)`` on success, or ``None`` when ``file_path``
        is not an existing regular file.
    """
    print(f"Processing file: {file_path}")
    # isfile rather than exists: a directory path "exists" but would make
    # open() raise, so it is reported through the same error message.
    if not os.path.isfile(file_path):
        print(f"Error: File {file_path} not found.")
        return None

    extracted_data = extract_from_text(file_path)
    visualize_extraction_results(extracted_data)
    df = save_to_structured_format(extracted_data)
    print("\nExtracted Data Sample:")
    print(df.head())
    return extracted_data, df

# Script entry point: when the file is run directly, process the hard-coded
# input path below.
if __name__ == "__main__":
    file_path = "unstructured_document.txt"
    main(file_path)