# CSV and Excel Files - Structured Data

In [1]:
import pandas as pd
import os

In [2]:
os.makedirs('data/structured_files', exist_ok=True)

In [3]:
# Create Sample data
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],   
    'Age': [25, 30, 35],
    'City': ['New York', 'Los Angeles', 'Chicago']
}

df = pd.DataFrame(data)
df.to_csv('data/structured_files/sample_data.csv', index=False)

In [6]:
# Save as excel with multiple sheets
with pd.ExcelWriter('data/structured_files/sample_data.xlsx') as writer:
    df.to_excel(writer, sheet_name='People', index=False)
    summary_data = {
        'Category': ['A', 'B', 'C'],
        'Value': [100, 200, 300]
    }
    pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)

# CSV Processing

In [7]:
from langchain_community.document_loaders import CSVLoader, UnstructuredCSVLoader

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Method 1: Using CSVLoader - Each row becomes a separate document
print("Using CSVLoader - Row-based parsing")
csv_loader = CSVLoader(file_path='data/structured_files/sample_data.csv', 
                       encoding='utf-8', 
                       csv_args={'delimiter': ',', 
                                 'quotechar': '"'})

csv_docs = csv_loader.load()
print(csv_docs)
print(f"Loaded {len(csv_docs)} documents using CSVLoader.(Each row is a document)")
print("\nFirst document:")
print(f"Content:\n{csv_docs[0].page_content}")
print(f"Metadata: {csv_docs[0].metadata}")

Using CSVLoader - Row-based parsing
[Document(metadata={'source': 'data/structured_files/sample_data.csv', 'row': 0}, page_content='Name: Alice\nAge: 25\nCity: New York'), Document(metadata={'source': 'data/structured_files/sample_data.csv', 'row': 1}, page_content='Name: Bob\nAge: 30\nCity: Los Angeles'), Document(metadata={'source': 'data/structured_files/sample_data.csv', 'row': 2}, page_content='Name: Charlie\nAge: 35\nCity: Chicago')]
Loaded 3 documents using CSVLoader.(Each row is a document)

First document:
Content:
Name: Alice
Age: 25
City: New York
Metadata: {'source': 'data/structured_files/sample_data.csv', 'row': 0}


In [12]:
from typing import List
from langchain_core.documents import Document

## Method 2: Custom CSV processing for better control
print({"\n Custom CSV Processing"})
def process_csv_intelligently(file_path: str) -> List[Document]:
    df = pd.read_csv(file_path)
    documents = []

    for index, row in df.iterrows():
        content = f"""Information:
        Name: {row['Name']}
        Age: {row['Age']}
        City: {row['City']}
        """
        # Create document with metadata
        doc = Document(
            page_content=content,
            metadata={
                'source': file_path,
                'row_index': index,
                'name': row['Name'],
                'data_type': 'person_info'
            }
        )
        documents.append(doc)
    return documents

{'\n Custom CSV Processing'}


In [13]:
process_csv_intelligently('data/structured_files/sample_data.csv')

[Document(metadata={'source': 'data/structured_files/sample_data.csv', 'row_index': 0, 'name': 'Alice', 'data_type': 'person_info'}, page_content='Information:\n        Name: Alice\n        Age: 25\n        City: New York\n        '),
 Document(metadata={'source': 'data/structured_files/sample_data.csv', 'row_index': 1, 'name': 'Bob', 'data_type': 'person_info'}, page_content='Information:\n        Name: Bob\n        Age: 30\n        City: Los Angeles\n        '),
 Document(metadata={'source': 'data/structured_files/sample_data.csv', 'row_index': 2, 'name': 'Charlie', 'data_type': 'person_info'}, page_content='Information:\n        Name: Charlie\n        Age: 35\n        City: Chicago\n        ')]

# Excel Processing

In [16]:
from langchain_community.document_loaders import UnstructuredExcelLoader

try:
    excel_loader = UnstructuredExcelLoader(file_path='data/structured_files/sample_data.xlsx',
                                           mode='elements')
    excel_docs = excel_loader.load()
except Exception as e:
    print(f"Error loading Excel file: {e}")
    excel_docs = []

excel_docs

[Document(metadata={'source': 'data/structured_files/sample_data.xlsx', 'file_directory': 'data/structured_files', 'filename': 'sample_data.xlsx', 'last_modified': '2025-10-24T13:55:05', 'page_name': 'People', 'page_number': 1, 'text_as_html': '<table><tr><td>Name</td><td>Age</td><td>City</td></tr><tr><td>Alice</td><td>25</td><td>New York</td></tr><tr><td>Bob</td><td>30</td><td>Los Angeles</td></tr><tr><td>Charlie</td><td>35</td><td>Chicago</td></tr></table>', 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'category': 'Table', 'element_id': 'c81e766b3ae368e11fa7d14da551dea7'}, page_content='Name Age City Alice 25 New York Bob 30 Los Angeles Charlie 35 Chicago'),
 Document(metadata={'source': 'data/structured_files/sample_data.xlsx', 'file_directory': 'data/structured_files', 'filename': 'sample_data.xlsx', 'last_modified': '2025-10-24T13:55:05', 'page_name': 'Summary', 'page_number': 2, 'text_as_html': '<table><tr><td>Category</td