## Data ingestion of Excel/CSV files

In [5]:
import pandas as pd
from langchain_core.documents import Document
from pathlib import Path


In [9]:
def _frame_to_documents(df, source_name):
    """Turn every non-blank row of *df* into one ``Document``.

    Args:
        df: DataFrame whose missing values have already been replaced
            (the caller fills them with ``""``).
        source_name: File name stored under the ``source_file`` metadata key.

    Returns:
        A list of ``Document`` objects. ``page_content`` is
        ``"col: value. col: value"`` over the row's non-blank cells, and the
        metadata holds those same cells plus ``source_file``, ``row_index``
        and ``file_type``. The three bookkeeping keys are written last, so
        they win over a data column that happens to share their name.
    """
    columns = list(df.columns)
    documents = []

    # itertuples() hands back each cell with its own column's dtype.
    # iterrows() would build one Series per row and coerce the whole row to a
    # common dtype, so an int column sitting next to a float column would be
    # rendered as "30.0" instead of "30".
    for index, *values in df.itertuples(name=None):
        text_parts = []
        metadata = {}

        for column, raw in zip(columns, values):
            value = str(raw).strip()
            if value:
                text_parts.append(f"{column}: {value}")
                metadata[column] = value

        if not text_parts:
            # Only whitespace in this row: dropna() does not treat it as
            # missing, but an empty Document carries nothing worth indexing.
            continue

        documents.append(
            Document(
                page_content=". ".join(text_parts),
                metadata={
                    **metadata,
                    "source_file": source_name,
                    "row_index": index,
                    "file_type": "structured",
                },
            )
        )

    return documents


def process_all_structured_files(directory):
    """Process all CSV and Excel files in a directory.

    Recursively collects ``*.csv``, ``*.xlsx`` and ``*.xls`` files below
    *directory*, loads each with pandas, and converts every non-blank row into
    a ``Document``. Progress is reported with ``print``.

    Args:
        directory: Root folder to search (string or path-like).

    Returns:
        A list with the ``Document`` objects of all files that loaded
        successfully. A file that fails to load or convert is reported and
        skipped as a whole; it contributes no documents.

    Note:
        ``pd.read_excel`` is called with its defaults, so only the first
        sheet of a workbook is read.
    """
    all_documents = []
    data_dir = Path(directory)

    # Sort within each pattern: glob order depends on the filesystem, and a
    # stable order keeps the resulting document list reproducible.
    structured_files = []
    for pattern in ("**/*.csv", "**/*.xlsx", "**/*.xls"):
        structured_files.extend(sorted(data_dir.glob(pattern)))

    print(f"Found {len(structured_files)} structured files")
    for file in structured_files:
        print(f"\nProcessing: {file.name}")
        try:
            # Compare case-insensitively: a case-insensitive filesystem may
            # match "DATA.CSV" above, and it must not be sent to read_excel.
            if file.suffix.lower() == ".csv":
                df = pd.read_csv(file)
            else:
                df = pd.read_excel(file)
            df = df.dropna(how="all").fillna("")
            file_documents = _frame_to_documents(df, file.name)
        except Exception as e:
            # Best-effort ingestion: one unreadable file must not abort the
            # whole run, so report it and move on.
            print(f"Error in {file.name}: {e}")
        else:
            # Extend only after the whole file converted, so a failure never
            # leaves a partial set of rows behind.
            all_documents.extend(file_documents)
            print(f"Loaded {len(file_documents)} rows")

    print(f"\nTotal structured records: {len(all_documents)}")
    return all_documents

# Ingest every CSV/Excel file found under "../data" (a relative path, so it is
# resolved against the current working directory) and bind the returned list
# of Documents to `docs`.
docs = process_all_structured_files("../data")

Found 2 structured files

Processing: Employee.csv
Loaded 4653 rows

Processing: HR-Employee-Attrition.csv
Loaded 1470 rows

Total structured records: 6123
