In [None]:
```python
import ipfshttpclient
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID, NUMERIC
from whoosh.qparser import QueryParser
import os

import json  # json.load is used below, but json is missing from this cell's import list

# Load configuration from config.json file
config_path = "config.json"  # Update this path as needed
with open(config_path, "r") as f:
    config = json.load(f)

IPFS_ADDRESS = config["IPFS_ADDRESS"]

# Connect to the IPFS node at the host and port given in the configuration.
# The address is written as a multiaddr: /dns/<host>/tcp/<port>/http
ipfs_address = f"/dns/{IPFS_ADDRESS}/tcp/{config['IPFS_PORT']}/http"
client = ipfshttpclient.connect(ipfs_address)

# Define schema for indexing
schema = Schema(
    cid=ID(stored=True),               # The IPFS CID
    name=TEXT(stored=True),            # Filename
    size=NUMERIC(stored=True),         # File size
    filetype=TEXT(stored=True),        # File type (filename extension, not a MIME type)
)

# Create the index directory; exist_ok avoids a check-then-create race
os.makedirs("indexdir", exist_ok=True)

# Create index in the directory.
# NOTE: create_in() starts a fresh index, so documents indexed by an earlier
# run of this cell are discarded.
ix = create_in("indexdir", schema)

# Function to index IPFS content
def index_ipfs_content(cid, filename):
    """Add one IPFS object to the Whoosh index.

    Asks the IPFS node for the object's stats and stores its cumulative size
    together with the CID, the given filename and the filename's extension.

    Args:
        cid: Content identifier of the object on the IPFS node.
        filename: Name (optionally including a path) to index the object under.

    Returns:
        None. The outcome is reported with ``print``; any exception is caught
        so that one bad CID does not stop a batch of indexing calls.
    """
    try:
        # Fetch file stats from IPFS
        stats = client.object.stat(cid)
        file_size = stats['CumulativeSize']

        # Derive the file type from the extension of the last path component
        # only, so dots in directory names are not mistaken for an extension.
        extension = os.path.splitext(filename)[1]
        filetype = extension[1:] if len(extension) > 1 else "unknown"

        # Index the content. Used as a context manager, the writer commits on
        # success and cancels (releasing the index write lock) on error.
        with ix.writer() as writer:
            writer.add_document(
                cid=cid,
                name=filename,
                size=file_size,
                filetype=filetype,
            )
        print(f"Indexed {filename} with CID: {cid}")

    except Exception as e:
        print(f"Failed to index CID {cid}: {e}")

# Search IPFS index based on keyword
def search_ipfs(keyword):
    """Print every indexed document whose filename matches ``keyword``.

    The keyword is parsed as a Whoosh query against the ``name`` field. Each
    hit is printed on its own line; a notice is printed when nothing matches.
    """
    with ix.searcher() as searcher:
        name_parser = QueryParser("name", ix.schema)
        hits = searcher.search(name_parser.parse(keyword))

        # Guard clause: report the empty case first and leave.
        if not hits:
            print(f"No results found for '{keyword}'")
            return

        for result in hits:
            print(f"CID: {result['cid']}, Name: {result['name']}, Size: {result['size']} bytes, File Type: {result['filetype']}")

# Example usage: index two objects from the IPFS node.
# The CIDs and filenames are placeholders -- replace them with real values.
index_ipfs_content(
    cid="QmXj...",
    filename="example.txt",
)
index_ipfs_content(
    cid="QmYk...",
    filename="document.pdf",
)

# Example search over the indexed filenames
search_ipfs(keyword="example")


In [3]:
import ipfshttpclient
import whoosh
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID, NUMERIC
from whoosh.qparser import QueryParser
import os
import json


# Load configuration from config.json file
config_path = "config.json"  # Update this path as needed
with open(config_path, "r") as f:
    config = json.load(f)

IPFS_ADDRESS = config["IPFS_ADDRESS"]

# Connect to the IPFS node at the host and port given in the configuration.
# The address is written as a multiaddr: /dns/<host>/tcp/<port>/http
# (For a node on the library's default local address, call
# ipfshttpclient.connect() without arguments instead.)
ipfs_address = f"/dns/{IPFS_ADDRESS}/tcp/{config['IPFS_PORT']}/http"
client = ipfshttpclient.connect(ipfs_address)

# Define schema for indexing
schema = Schema(
    cid=ID(stored=True),               # The IPFS CID
    name=TEXT(stored=True),            # Filename
    size=NUMERIC(stored=True),         # File size
    filetype=TEXT(stored=True),        # File type (filename extension, not a MIME type)
)

# Create the index directory; exist_ok avoids a check-then-create race
os.makedirs("indexdir", exist_ok=True)

# Create index in the directory.
# NOTE: create_in() starts a fresh index, so documents indexed by an earlier
# run of this cell are discarded.
ix = create_in("indexdir", schema)

# Function to index IPFS content
def index_ipfs_content(cid, filename):
    """Add one IPFS object to the Whoosh index.

    Asks the IPFS node for the object's stats and stores its cumulative size
    together with the CID, the given filename and the filename's extension.

    Args:
        cid: Content identifier of the object on the IPFS node.
        filename: Name (optionally including a path) to index the object under.

    Returns:
        None. The outcome is reported with ``print``; any exception is caught
        so that one bad CID does not stop a batch of indexing calls.
    """
    try:
        # Fetch file stats from IPFS
        stats = client.object.stat(cid)
        file_size = stats['CumulativeSize']

        # Derive the file type from the extension of the last path component
        # only, so dots in directory names are not mistaken for an extension.
        extension = os.path.splitext(filename)[1]
        filetype = extension[1:] if len(extension) > 1 else "unknown"

        # Index the content. Used as a context manager, the writer commits on
        # success and cancels (releasing the index write lock) on error.
        with ix.writer() as writer:
            writer.add_document(
                cid=cid,
                name=filename,
                size=file_size,
                filetype=filetype,
            )
        print(f"Indexed {filename} with CID: {cid}")

    except Exception as e:
        print(f"Failed to index CID {cid}: {e}")

# Search IPFS index based on keyword
def search_ipfs(keyword):
    """Print every indexed document whose filename matches ``keyword``.

    The keyword is parsed as a Whoosh query against the ``name`` field. Each
    hit is printed on its own line; a notice is printed when nothing matches.
    """
    with ix.searcher() as searcher:
        name_parser = QueryParser("name", ix.schema)
        hits = searcher.search(name_parser.parse(keyword))

        # Guard clause: report the empty case first and leave.
        if not hits:
            print(f"No results found for '{keyword}'")
            return

        for result in hits:
            print(f"CID: {result['cid']}, Name: {result['name']}, Size: {result['size']} bytes, File Type: {result['filetype']}")

# Example usage: index three objects that were uploaded to the IPFS node
index_ipfs_content(
    cid="QmXh37WXcrLXkJX2cAPjbPdKZ2cxJXD58XX6eU1Zc41ka4",
    filename="upload/Aeroacoustic-airfoil-shape-optimization-enhance_2023_Expert-Systems-with-App.pdf",
)
index_ipfs_content(
    cid="QmSSY49SnmbCZ3oSaTki7CYZe1ZaWZfE1CsWHpt8Ge7acJ",
    filename="upload/World_Energy_By_Country_And_Region_1965_to_2023.csv",
)
index_ipfs_content(
    cid="QmdiRawzVNUiB28ENKQ7WefeFLEJ1xMjsJjwtHL2jnJ9xW",
    filename="upload/Munafò et al. - 2022 - The reproducibility debate is an opportunity, not .pdf",
)

# Example search over the indexed filenames
search_ipfs(keyword="reproducibility")






Indexed upload/Aeroacoustic-airfoil-shape-optimization-enhance_2023_Expert-Systems-with-App.pdf with CID: QmXh37WXcrLXkJX2cAPjbPdKZ2cxJXD58XX6eU1Zc41ka4
Indexed upload/World_Energy_By_Country_And_Region_1965_to_2023.csv with CID: QmSSY49SnmbCZ3oSaTki7CYZe1ZaWZfE1CsWHpt8Ge7acJ
Indexed upload/Munafò et al. - 2022 - The reproducibility debate is an opportunity, not .pdf with CID: QmdiRawzVNUiB28ENKQ7WefeFLEJ1xMjsJjwtHL2jnJ9xW
CID: QmdiRawzVNUiB28ENKQ7WefeFLEJ1xMjsJjwtHL2jnJ9xW, Name: upload/Munafò et al. - 2022 - The reproducibility debate is an opportunity, not .pdf, Size: 694719 bytes, File Type: pdf


Here's a basic Python script that implements a simple IPFS search engine for your IPFS server. It does not crawl IPFS on its own: for each CID and filename you give it, it records basic metadata (file size, file type, and CID) in a search index, and it then lets you run keyword searches over the indexed filenames with the help of well-known Python modules.

### Prerequisites:
1. **Install the IPFS HTTP client library**:
   - You can use the `ipfshttpclient` module to interact with your IPFS server. Install it with:
     ```bash
     pip install ipfshttpclient
     ```
   
2. **Install `Whoosh` for search indexing**:
   - We'll use `Whoosh` for building and querying the search index. Install it with:
     ```bash
     pip install whoosh
     ```

### Basic Python Search Engine for IPFS

```python
import ipfshttpclient
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID, NUMERIC
from whoosh.qparser import QueryParser
import os

import json  # json.load is used below, but json is missing from the import list above

# Load configuration from config.json file
config_path = "config.json"  # Update this path as needed
with open(config_path, "r") as f:
    config = json.load(f)

IPFS_ADDRESS = config["IPFS_ADDRESS"]

# Connect to the IPFS node at the host and port given in the configuration.
# The address is written as a multiaddr: /dns/<host>/tcp/<port>/http
ipfs_address = f"/dns/{IPFS_ADDRESS}/tcp/{config['IPFS_PORT']}/http"
client = ipfshttpclient.connect(ipfs_address)

# Define schema for indexing
schema = Schema(
    cid=ID(stored=True),               # The IPFS CID
    name=TEXT(stored=True),            # Filename
    size=NUMERIC(stored=True),         # File size
    filetype=TEXT(stored=True),        # File type (filename extension, not a MIME type)
)

# Create the index directory; exist_ok avoids a check-then-create race
os.makedirs("indexdir", exist_ok=True)

# Create index in the directory.
# NOTE: create_in() starts a fresh index, so documents indexed by an earlier
# run of this script are discarded.
ix = create_in("indexdir", schema)

# Function to index IPFS content
def index_ipfs_content(cid, filename):
    """Add one IPFS object to the Whoosh index.

    Asks the IPFS node for the object's stats and stores its cumulative size
    together with the CID, the given filename and the filename's extension.

    Args:
        cid: Content identifier of the object on the IPFS node.
        filename: Name (optionally including a path) to index the object under.

    Returns:
        None. The outcome is reported with ``print``; any exception is caught
        so that one bad CID does not stop a batch of indexing calls.
    """
    try:
        # Fetch file stats from IPFS
        stats = client.object.stat(cid)
        file_size = stats['CumulativeSize']

        # Derive the file type from the extension of the last path component
        # only, so dots in directory names are not mistaken for an extension.
        extension = os.path.splitext(filename)[1]
        filetype = extension[1:] if len(extension) > 1 else "unknown"

        # Index the content. Used as a context manager, the writer commits on
        # success and cancels (releasing the index write lock) on error.
        with ix.writer() as writer:
            writer.add_document(
                cid=cid,
                name=filename,
                size=file_size,
                filetype=filetype,
            )
        print(f"Indexed {filename} with CID: {cid}")

    except Exception as e:
        print(f"Failed to index CID {cid}: {e}")

# Search IPFS index based on keyword
def search_ipfs(keyword):
    """Print every indexed document whose filename matches ``keyword``.

    The keyword is parsed as a Whoosh query against the ``name`` field. Each
    hit is printed on its own line; a notice is printed when nothing matches.
    """
    with ix.searcher() as searcher:
        name_parser = QueryParser("name", ix.schema)
        hits = searcher.search(name_parser.parse(keyword))

        # Guard clause: report the empty case first and leave.
        if not hits:
            print(f"No results found for '{keyword}'")
            return

        for result in hits:
            print(f"CID: {result['cid']}, Name: {result['name']}, Size: {result['size']} bytes, File Type: {result['filetype']}")

# Example usage: index two objects from the IPFS node.
# The CIDs and filenames are placeholders -- replace them with real values.
index_ipfs_content(
    cid="QmXj...",
    filename="example.txt",
)
index_ipfs_content(
    cid="QmYk...",
    filename="document.pdf",
)

# Example search over the indexed filenames
search_ipfs(keyword="example")
```

### Explanation:

1. **Connecting to IPFS**:
   - The script uses `ipfshttpclient` to connect to the IPFS node whose address and port are read from `config.json`.
   
2. **Indexing Files**:
   - The `index_ipfs_content` function takes an IPFS CID and a filename as input.
   - It fetches file metadata (such as file size) using the `object.stat` function from the IPFS client and stores this data in an index using `Whoosh`.

3. **Schema**:
   - The schema defines how the files are indexed. We store the CID, filename, file size, and type.

4. **Searching**:
   - The `search_ipfs` function allows you to search the index based on filenames.
   - It uses `Whoosh`'s `QueryParser` to search the index and print out the search results.

### Running the Script:
- Run the script in Python, and it will index files from your local IPFS server and allow you to search for them using keywords.
- Replace `"QmXj..."` and `"QmYk..."` with actual IPFS CIDs and filenames from your server.

This setup keeps the search engine simple and easily expandable for more metadata or complex searches later.

To integrate **Apache Tika** for metadata extraction into the search engine, we need to parse and extract metadata from each file uploaded to IPFS using Tika. This metadata will be indexed alongside the existing file information (like CID and size) and then made searchable.

### Steps to implement Tika-based metadata extraction:
1. **Install the `tika` library**:
   Apache Tika provides a Python wrapper (`tika` library) to access its file metadata extraction features. Install it using:
   ```bash
   pip install tika
   ```

2. **Modify the Python script**:
   We will update the `index_ipfs_content` function to:
   - Download the file from IPFS.
   - Use Tika to extract the file's metadata.
   - Index both the CID and the metadata (like author, title, content type) in Whoosh.

Here's the updated code:

```python
import ipfshttpclient
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID, NUMERIC
from whoosh.qparser import QueryParser
import os
from tika import parser  # Apache Tika for metadata extraction

# Connect to local IPFS (the client library's default address)
client = ipfshttpclient.connect()

# Define schema for indexing (now includes metadata fields from Tika)
schema = Schema(
    cid=ID(stored=True),                 # The IPFS CID
    name=TEXT(stored=True),              # Filename
    size=NUMERIC(stored=True),           # File size
    filetype=TEXT(stored=True),          # File type (filename extension, not a MIME type)
    title=TEXT(stored=True),             # Extracted title (from Tika metadata)
    author=TEXT(stored=True),            # Extracted author (from Tika metadata)
    keywords=TEXT(stored=True),          # Extracted keywords (from Tika metadata)
    full_text=TEXT(stored=False),        # Extracted full text: searchable, but not returned with hits
)

# Create the index directory; exist_ok avoids a check-then-create race
os.makedirs("indexdir", exist_ok=True)

# Create index in the directory.
# NOTE: create_in() starts a fresh index, so documents indexed by an earlier
# run of this script are discarded.
ix = create_in("indexdir", schema)

# Function to download file from IPFS and save locally
def download_file_from_ipfs(cid, filename):
    """Fetch the content behind ``cid`` and write it to ``./<filename>``.

    Args:
        cid: Content identifier of the file on the IPFS node.
        filename: Relative name (may include sub-directories) for the local copy.

    Returns:
        The path of the written file, or None if fetching or writing failed
        (the error is printed).
    """
    file_path = f"./{filename}"
    try:
        # Fetch the bytes first so a failed fetch does not leave an empty file
        # behind. client.cat() returns the object's content; client.get() would
        # instead write into a target directory under the CID's name and does
        # not take a destination-file argument.
        data = client.cat(cid)

        # The filename may contain a directory part (e.g. "upload/x.pdf")
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(file_path, "wb") as file:
            file.write(data)
        return file_path
    except Exception as e:
        print(f"Failed to download file with CID {cid}: {e}")
        return None

# Return the first non-empty metadata value among `keys` as a single string
def _metadata_text(metadata, keys, default):
    for key in keys:
        value = metadata.get(key)
        # Tika reports multi-valued properties as lists; the index needs one string
        if isinstance(value, (list, tuple)):
            value = ", ".join(str(item) for item in value if item)
        if value:
            return str(value)
    return default


# Function to extract metadata using Apache Tika
def extract_metadata(file_path):
    """Extract title, author, keywords and full text from a file with Tika.

    Args:
        file_path: Path of the local file to parse.

    Returns:
        A ``(title, author, keywords, content)`` tuple of strings. Missing
        title/author default to "Unknown", missing keywords/content to "".
        If parsing fails, the error is printed and the defaults are returned.
    """
    try:
        parsed = parser.from_file(file_path)
        metadata = parsed.get("metadata") or {}
        # "content" is present but None for files without extractable text,
        # so a .get() default alone would not protect the .strip() call.
        content = (parsed.get("content") or "").strip()

        # Extract relevant metadata fields. The legacy key is tried first, then
        # namespaced alternatives -- the exact key names depend on the Tika
        # version and the file format (TODO: confirm against your Tika server).
        title = _metadata_text(
            metadata, ("title", "dc:title", "pdf:docinfo:title"), "Unknown")
        author = _metadata_text(
            metadata, ("Author", "meta:author", "dc:creator", "pdf:docinfo:author"), "Unknown")
        keywords = _metadata_text(
            metadata, ("Keywords", "meta:keyword", "pdf:docinfo:keywords", "dc:subject"), "")

        return title, author, keywords, content
    except Exception as e:
        print(f"Failed to extract metadata for file {file_path}: {e}")
        return "Unknown", "Unknown", "", ""

# Function to index IPFS content along with Tika metadata
def index_ipfs_content(cid, filename):
    """Download an IPFS object, extract its metadata with Tika and index it.

    Args:
        cid: Content identifier of the object on the IPFS node.
        filename: Name to index the object under; also used for the temporary
            local copy that Tika parses.

    Returns:
        None. The outcome is reported with ``print``; any exception is caught
        so that one bad CID does not stop a batch of indexing calls.
    """
    file_path = None
    try:
        # Step 1: Download file from IPFS
        file_path = download_file_from_ipfs(cid, filename)
        if not file_path:
            return

        # Step 2: Extract metadata using Tika
        title, author, keywords, full_text = extract_metadata(file_path)

        # Step 3: Fetch file stats from IPFS
        stats = client.object.stat(cid)
        file_size = stats['CumulativeSize']

        # Step 4: Get file type from the extension of the last path component
        # only, so dots in directory names are not mistaken for an extension.
        extension = os.path.splitext(filename)[1]
        filetype = extension[1:] if len(extension) > 1 else "unknown"

        # Step 5: Index the content. Used as a context manager, the writer
        # commits on success and cancels (releasing the write lock) on error.
        with ix.writer() as writer:
            writer.add_document(
                cid=cid,
                name=filename,
                size=file_size,
                filetype=filetype,
                title=title,
                author=author,
                keywords=keywords,
                full_text=full_text,  # Not stored, just used for searching
            )
        print(f"Indexed {filename} with CID: {cid}, Title: {title}, Author: {author}")

    except Exception as e:
        print(f"Failed to index CID {cid}: {e}")
    finally:
        # Clean up the local copy on every path, including failures after the
        # download; a cleanup error must not be reported as an indexing error.
        if file_path and os.path.exists(file_path):
            try:
                os.remove(file_path)
            except OSError as e:
                print(f"Could not remove temporary file {file_path}: {e}")

# Search IPFS index based on keyword or metadata
def search_ipfs(keyword):
    """Print indexed documents matching ``keyword`` in any searchable text field.

    The query is run against the filename, the Tika metadata fields (title,
    author, keywords) and the extracted full text, so a document is found
    whichever of these contains the term.

    Args:
        keyword: Query string in Whoosh's default query syntax.
    """
    # Imported here because the surrounding script only imports QueryParser.
    from whoosh.qparser import MultifieldParser

    search_fields = ["name", "title", "author", "keywords", "full_text"]
    with ix.searcher() as searcher:
        query = MultifieldParser(search_fields, ix.schema).parse(keyword)
        results = searcher.search(query)
        if results:
            for result in results:
                print(f"CID: {result['cid']}, Name: {result['name']}, Title: {result['title']}, Author: {result['author']}, Size: {result['size']} bytes")
        else:
            print(f"No results found for '{keyword}'")

# Example usage: index two objects from the IPFS node.
# The CIDs and filenames are placeholders -- replace them with real values.
index_ipfs_content(
    cid="QmXj...",
    filename="example.txt",
)
index_ipfs_content(
    cid="QmYk...",
    filename="document.pdf",
)

# Example search
search_ipfs(keyword="example")
```

### Breakdown of the changes:

1. **Apache Tika Metadata Extraction**:
   - The function `extract_metadata(file_path)` uses Tika to extract metadata (like title, author, keywords) and the full text (if applicable) from the file.

2. **Downloading Files from IPFS**:
   - The `download_file_from_ipfs` function downloads files from IPFS using the CID and saves them locally for Tika to process.

3. **Schema Updates**:
   - Added `title`, `author`, `keywords`, and `full_text` fields to the search index schema. This allows us to index and search by these additional metadata fields.

4. **Indexing Content**:
   - The `index_ipfs_content` function now downloads the file from IPFS, extracts metadata using Tika, and indexes the file and its metadata in Whoosh.

5. **Search Enhancements**:
   - The `search_ipfs` function searches not only by filenames but also by the full text of the document and metadata extracted by Tika.

### Running the Script:
1. Replace `"QmXj..."` and `"QmYk..."` with actual CIDs of files uploaded to your local IPFS server.
2. Run the script to download, index, and search files with their metadata and content using Apache Tika.

This approach extracts rich metadata and makes it searchable, improving the ability to find relevant content on your IPFS server.