# Scraping post information

This notebook collects posts from the WordPress CMS, parses their content and excerpt, and finally returns the output in the schema of the Weaviate collection.


## 1.0 Post Schema

```
class Post(WeaviateCollection):
    id:str
    postId:str
    postTitle:str
    postExcerpt:str
    postContent:str
    postDate:datetime
    postAuthor:str
    postCategories:typing.Optional[str]
    postTags:typing.Optional[str]
    postUrl:typing.Optional[str]

    def get_embedding(self):
        return []
```

In [1]:
!pip install beautifulsoup4



In [1]:
# Load environment variables (python-dotenv) before the os.getenv() lookups
# further down read the WordPress credentials and backend URL.
from dotenv import load_dotenv
load_dotenv()

True

In [64]:
from abc import ABC, abstractmethod
import strawberry
from datetime import datetime
import typing
import requests
from bs4 import BeautifulSoup
import os
import base64
import json
import httpx

In [30]:
class WeaviateCollection(ABC):
    """Interface for objects that are stored in a Weaviate collection.

    Concrete collections must provide dictionary (de)serialisation, an
    embedding vector, and a mapping between their own field names and the
    field names of the upstream data source.
    """

    @staticmethod
    @abstractmethod
    def from_dict(data: dict) -> typing.Optional[typing.Any]:
        """Build a collection object from a plain dictionary.

        Implementations return an instance (not a class), or ``None`` when
        ``data`` cannot be converted.
        """

    @staticmethod
    @abstractmethod
    def to_dict(data: typing.Any) -> dict:
        """Serialise a collection object into a plain dictionary."""

    @abstractmethod
    def get_embedding(self) -> typing.List[float]:
        """Return the embedding vector for this object."""

    @staticmethod
    @abstractmethod
    def get_field_mapping() -> dict:
        """Return a mapping of collection field name -> source field name."""
    

@strawberry.type
class Post(WeaviateCollection):
    """A WordPress post in the shape of the Weaviate ``Post`` collection."""

    uid:typing.Optional[str]
    postId:str
    postTitle:str
    postExcerpt:str
    postContent:str
    postDate:datetime
    postAuthor:str
    postCategories:typing.Optional[str]
    postTags:typing.Optional[str]
    postUrl:typing.Optional[str]

    @staticmethod
    def get_field_mapping()->dict:
        """Return the mapping of ``Post`` field name -> WordPress REST field name."""
        return {
            'postId': 'id',
            'postTitle': 'title',
            'postExcerpt': 'excerpt',
            'postContent': 'content',
            'postDate': 'date_gmt',
            'postAuthor': 'author',
            'postCategories': 'categories',
            'postTags': 'tags',
            'postUrl': 'link'
        }

    @staticmethod
    def to_dict(post):
        """Serialise ``post`` into a plain dictionary keyed by field name."""
        return {
            "uid": post.uid,
            "postId": post.postId,
            "postTitle": post.postTitle,
            "postExcerpt": post.postExcerpt,
            "postContent": post.postContent,
            "postDate": post.postDate,
            "postAuthor": post.postAuthor,
            "postCategories": post.postCategories,
            "postTags": post.postTags,
            "postUrl": post.postUrl
        }

    @staticmethod
    def from_dict(data:dict):
        """Build a ``Post`` from a dictionary keyed by ``Post`` field names.

        Returns ``None`` when ``data`` is empty or is not a dictionary.
        Missing keys default to an empty string.
        """
        # isinstance (rather than an exact type check) also accepts dict
        # subclasses such as OrderedDict.
        if not data or not isinstance(data, dict):
            return None

        return Post(
            uid=data.get("uid", ""),
            postId=data.get("postId", ""),
            postTitle=data.get("postTitle", ""),
            postExcerpt=data.get("postExcerpt", ""),
            postContent=data.get("postContent", ""),
            # NOTE(review): the default is "" although the field is declared as
            # datetime -- callers should always supply postDate.
            postDate=data.get("postDate", ""),
            postAuthor=data.get("postAuthor", ""),
            postCategories=data.get("postCategories", ""),
            postTags=data.get("postTags", ""),
            postUrl=data.get("postUrl", "")
        )

    def get_embedding(self):
        """Return the embedding vector; currently always an empty list."""
        return []


## 2.0 Wordpress Endpoints

### 2.1 Post Category

A post belongs to one root category and one or more sub-categories. A **root** category is the category that does not have a parent. Categories with parents are **sub-categories.** This system creates a navigation context for the posts.

**Current root categories:**

- Blogs

- Documentation

**Endpoints:**

- List categories: [https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/categories](https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/categories)

**Sample request:**
```
import requests

url = "https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/categories"

payload={}
headers = {
  'Authorization': 'Basic token'
}

response = requests.request("GET", url, headers=headers, data=payload)

print(response.text)
```

**Sample response:**

```
[
    {
        "id": 37,
        "count": 3,
        "description": "Documentation related to different components of the application layer",
        "link": "https://wpbackend.vip3rtech6069.com/category/documentation/docs-application-layer/",
        "name": "Application Layer",
        "slug": "docs-application-layer",
        "taxonomy": "category",
        "parent": 2,
        "meta": [],
        "_links": {
            "self": [
                {
                    "href": "https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/categories/37"
                }
            ]
        }
    },
    {
        "id": 21,
        "count": 4,
        "description": "Posts related to different blogs",
        "link": "https://wpbackend.vip3rtech6069.com/category/blogs/",
        "name": "Blogs",
        "slug": "blogs",
        "taxonomy": "category",
        "parent": 0,
        "meta": [],
        "_links": {
            "self": [
                {
                    "href": "https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/categories/21"
                }
            ]
        }
    },
    {
        "id": 2,
        "count": 5,
        "description": "Post related to documentation",
        "link": "https://wpbackend.vip3rtech6069.com/category/documentation/",
        "name": "Documentation",
        "slug": "documentation",
        "taxonomy": "category",
        "parent": 0,
        "meta": [],
        "_links": {
            "self": [
                {
                    "href": "https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/categories/2"
                }
            ]
        }
    }
]
```

- Category Detail: [https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/categories/{category_id}](https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/categories/2)

**Sample request:**

```
import requests
category_id=2
url = f"https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/categories/{category_id}"

payload={}
headers = {
  'Authorization': 'Basic token'
}

response = requests.request("GET", url, headers=headers, data=payload)

print(response.text)

```

**Sample response:**

```
{
    "id": 2,
    "count": 5,
    "description": "Post related to documentation",
    "link": "https://wpbackend.vip3rtech6069.com/category/documentation/",
    "name": "Documentation",
    "slug": "documentation",
    "taxonomy": "category",
    "parent": 0,
    "meta": [],
    "_links": {
        "self": [
            {
                "href": "https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/categories/2"
            }
        ]
    }
}
```
### 2.2 Post

A post is a wordpress post displayed on the website. The following parameters of a post will be used for the dataset:

- **ID:** The unique GUID of the post (String).

- **Title:** The title of the post (String - Plain text)

- **Excerpt:** The excerpt of the post (String - html)

- **Publish date:** The date when the post was published (String - UTC formatted string)

- **Author:** The ID of the author of the post (Int - Foreign key)

- **Content:** The content of the post (String - html)

- **Categories:** ID of categories that the post belongs to (List[int] - Foreign key)

- **Tags:** ID of tags that the post has (List[int] - Foreign key)


**Endpoints:**

- Post Detail: [https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/posts/{post_id}](https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/posts/2648)

**Sample request:**

```
import requests

post_id = 2648
url = f"https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/posts/{post_id}"

payload={}
headers = {
  'Authorization': 'Basic token'
}

response = requests.request("GET", url, headers=headers, data=payload)

print(response.text)
```

**Sample response:**

```
{
    "id": 2648,
    "date": "2024-11-17T16:15:52",
    "date_gmt": "2024-11-17T16:15:52",
    "guid": {
        "rendered": "https://wpbackend.vip3rtech6069.com/?p=2648"
    },
    "modified": "2024-11-17T16:19:35",
    "modified_gmt": "2024-11-17T16:19:35",
    "slug": "traversing-a-spiral-matrix",
    "status": "publish",
    "type": "post",
    "link": "https://wpbackend.vip3rtech6069.com/2024/11/17/traversing-a-spiral-matrix/",
    "title": {
        "rendered": "Traversing a spiral matrix"
    },
    "content": {
        "rendered": "<h1>Hello World</h1>",
        "protected": false
    },
    "excerpt": {
        "rendered": "<p>This is an excerpt</p>\n",
        "protected": false
    },
    "author": 3,
    "featured_media": 45,
    "comment_status": "open",
    "ping_status": "open",
    "sticky": false,
    "template": "elementor_canvas",
    "format": "standard",
    "meta": {
        "footnotes": ""
    },
    "categories": [
        15,
        21,
        3
    ],
    "tags": [
        67,
        71,
        14,
        68,
        72,
        70
    ],
    "class_list": [
        "post-2648",
        "post",
        "type-post",
        "status-publish",
        "format-standard",
        "has-post-thumbnail",
        "hentry",
        "category-data-structure-arrays",
        "category-blogs",
        "category-data-structure",
        "tag-array",
        "tag-competitive-programming",
        "tag-data-structure",
        "tag-matrix",
        "tag-spiral",
        "tag-traversal"
    ],
    "_links": {
        "self": [
            {
                "href": "https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/posts/2648"
            }
        ]
    }
}
```

### 2.3 Author

This endpoint will be used to get the author details for a post.

**Endpoints:**

- Author Details: [https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/users/{author_id}](https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/users/1)

**Sample request:**

```
import requests

author_id = 1
url = f"https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/users/{author_id}"

payload={}
headers = {
  'Authorization': 'Basic token'
}

response = requests.request("GET", url, headers=headers, data=payload)

print(response.text)

```

**Sample response:**

```
{
    "id": 1,
    "name": "john doe",
    "url": "https://wpbackend.vip3rtech6069.com",
    "description": "",
    "link": "https://wpbackend.vip3rtech6069.com/author/john/",
    "slug": "john",
    "avatar_urls": {
        "24": "https://secure.gravatar.com/avatar/3b72b2972fbca0251acf6677a0f01a78?s=24&r=g"
    },
    "meta": [],
    "_links": {
        "self": [
            {
                "href": "https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/users/1"
            }
        ],
        "collection": [
            {
                "href": "https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/users"
            }
        ]
    }
}
```

### 2.4 Tag

This endpoint will be used to fetch information related to different tags attached to the post.

**Endpoints:**

- Tag Detail: [https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/tags/{tag_id}](https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/tags/67)

**Sample request:**

```
import requests

tag_id = 67
url = f"https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/tags/{tag_id}"

payload={}
headers = {
  'Authorization': 'Basic token'
}

response = requests.request("GET", url, headers=headers, data=payload)

print(response.text)

```

**Sample response:**

```
{
    "id": 67,
    "count": 1,
    "description": "",
    "link": "https://wpbackend.vip3rtech6069.com/tag/array/",
    "name": "array",
    "slug": "array",
    "taxonomy": "post_tag",
    "meta": [],
    "_links": {
        "self": [
            {
                "href": "https://wpbackend.vip3rtech6069.com/wp-json/wp/v2/tags/67"
            }
        ]
    }
}
```

In [8]:
# Credentials used for HTTP Basic authentication against the WordPress REST API.
username = os.getenv("WORDPRESS_API_KEY_USERNAME", "admin")
password = os.getenv("WORDPRESS_API_KEY", "admin")
# b64encode returns bytes; decode it so that f"Basic {token}" produces a valid
# header value instead of the literal text "Basic b'...'".
token = base64.b64encode(f"{username}:{password}".encode("utf-8")).decode("ascii")

In [80]:
# Base URL of the WordPress REST API. Trailing slashes are stripped so that
# f"{backend_url}/{route}" never contains a double slash.
backend_url = os.getenv("WORDPRESS_BACKEND_API", "http://localhost:8080/wp-json/wp/v2").rstrip("/")

# Maps the resource names used in this notebook to REST route segments.
endpoints = {
    'post category': 'categories',
    'post': 'posts',
    'author': 'users',
    'tag': 'tags'
}
def fetch_list(endpoint:str, token:str, filters:typing.Optional[dict]=None, fields:typing.Optional[typing.List[str]]=None, page:int=1, per_page:int=10, timeout:float=30):
    """Fetch one page of a WordPress REST collection.

    Args:
        endpoint: Key of ``endpoints`` naming the collection to query.
        token: Base64 credentials placed after ``Basic`` in the Authorization header.
        filters: Extra query parameters merged into the request as-is.
        fields: Field names sent as ``_fields[i]`` to limit the response.
        page: 1-based page number to request.
        per_page: Number of items per page.
        timeout: Seconds to wait for the server before ``requests`` raises.

    Returns:
        A dict with keys ``data`` (list of items), ``total``, ``totalPages``,
        ``page`` and ``perPage``. ``data`` is empty when the request does not
        return HTTP 200 or the body is not a non-empty JSON list.
    """
    url = f"{backend_url}/{endpoints[endpoint]}"
    headers = {
        'Authorization': f"Basic {token}"
    }
    params = {
        "page": page,
        "per_page": per_page
    }
    if filters:
        params.update(filters)
    for index, field in enumerate(fields or []):
        params[f"_fields[{index}]"] = field

    result = {
        "data": [],
        "total": 0,
        "totalPages": 1,
        "page": page,
        "perPage": per_page
    }
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    if response.status_code != 200:
        return result

    try:
        data = response.json()
    except ValueError:  # body is not valid JSON
        return result
    if not isinstance(data, list) or not data:
        return result

    result["data"] = data
    # WordPress reports the collection size in these response headers.
    result["total"] = int(response.headers.get("X-WP-Total", len(data)))
    result["totalPages"] = int(response.headers.get("X-WP-TotalPages", 1))
    return result
    

def fetch_details(endpoint:str, token:str, uid:int, fields:typing.Optional[typing.List[str]]=None, timeout:float=30):
    """Fetch a single WordPress REST resource by id.

    Args:
        endpoint: Key of ``endpoints`` naming the resource type.
        token: Base64 credentials placed after ``Basic`` in the Authorization header.
        uid: Id of the resource to fetch.
        fields: Field names sent as ``_fields[i]`` to limit the response.
        timeout: Seconds to wait for the server before ``requests`` raises.

    Returns:
        The decoded JSON body, or ``None`` when the request does not return
        HTTP 200 or the body is not valid JSON.
    """
    url = f"{backend_url}/{endpoints[endpoint]}/{uid}"
    print("Fetching from " + url)
    headers = {
        'Authorization': f"Basic {token}"
    }
    params = {f"_fields[{index}]": field for index, field in enumerate(fields or [])}

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:  # body is not valid JSON
        return None


# Manual smoke tests for the two helpers above; the list call is left
# commented out and only the tag lookup (id 56, fields id and name) runs.
# print(fetch_list('post', token, fields=['id', 'title'], page=1, per_page=3))
print(fetch_details('tag', token=token, fields=['id', 'name'], uid=56))

{'data': [{'id': 2648, 'title': {'rendered': 'Traversing a spiral matrix'}}, {'id': 2555, 'title': {'rendered': 'Unit Testing with .NET II: Refactoring the code'}}, {'id': 2433, 'title': {'rendered': 'Unit Testing with .NET I: A brief overview'}}], 'total': 9, 'totalPages': 3, 'page': 1, 'perPage': 3}


## 3.0 Fetching posts

1. Fetch all categories available in the CMS. Fetch the category id and name. This is used to fetch category name from id.

2. Create an empty cache for tags. Each tag will have a name and an id. When we fetch a post, we will loop through each tag and check if the tag id exists in the cache. If it does, we will get the name from there. Otherwise, we will make a request to the tag detail endpoint to fetch the tag info. We will update the cache and return the data.


3. Create an empty cache for authors. Each author will have a name and an id. When we fetch a post, we will loop through each author and check if the author id exists in the cache. If it does, we will get the name from there. Otherwise, we will make a request to get the author detail and fetch the author info. We will update the cache and return the data.

4. Fetch id and title of all posts that were modified after (modified_gmt) a specific date. Fetch it in ascending order of create date (date_gmt).


5. Loop through each id and fetch the corresponding post detail. Fetch 'author', 'category names', and 'tag names' for each corresponding post.


6. Convert each post detail to post object and return.

In [47]:
# Fetch post categories
# The first request tells us how many pages exist; the remaining pages are
# then fetched in order and appended to a single list.
current_page = 1
per_page = 10
endpoint = 'post category'
fields = ['id', 'name', 'parent']
first_page = fetch_list(
    endpoint=endpoint,
    token=token,
    fields=fields,
    page=current_page,
    per_page=per_page
)
total_pages = first_page['totalPages']
print(f"Total pages: {total_pages}")

# Keep the accumulated list separate from the paginated result dict so that
# `post_categories` only ever holds a list of category records.
post_categories = list(first_page["data"])
for current_page in range(2, total_pages + 1):  # empty range when there is one page
    page_result = fetch_list(
        endpoint=endpoint,
        token=token,
        fields=fields,
        page=current_page,
        per_page=per_page
    )
    post_categories.extend(page_result["data"])

print(post_categories)


Total pages: 2
[{'id': 37, 'name': 'Application Layer', 'parent': 2}, {'id': 15, 'name': 'Arrays', 'parent': 3}, {'id': 63, 'name': 'Backend', 'parent': 21}, {'id': 21, 'name': 'Blogs', 'parent': 0}, {'id': 38, 'name': 'Data Layer', 'parent': 2}, {'id': 3, 'name': 'Data Structure', 'parent': 21}, {'id': 2, 'name': 'Documentation', 'parent': 0}, {'id': 22, 'name': 'Frontend', 'parent': 21}, {'id': 39, 'name': 'Infrastructure Management', 'parent': 2}, {'id': 16, 'name': 'Lists', 'parent': 3}, {'id': 36, 'name': 'Network Layer', 'parent': 2}, {'id': 35, 'name': 'Presentation Layer', 'parent': 2}, {'id': 40, 'name': 'System Architecture', 'parent': 2}, {'id': 17, 'name': 'Trees', 'parent': 3}, {'id': 1, 'name': 'Uncategorized', 'parent': 0}]


In [49]:
# Create an empty cache for tag and author. We will populate it eventually
tags = {}
authors = {}

# Re-key the categories by id. If this cell is run again, `post_categories`
# is already a dict; iterate its values so the cell stays re-runnable.
category_records = post_categories.values() if isinstance(post_categories, dict) else post_categories
post_categories = {category['id']: category for category in category_records}
del category_records
print(post_categories)

{37: {'id': 37, 'name': 'Application Layer', 'parent': 2}, 15: {'id': 15, 'name': 'Arrays', 'parent': 3}, 63: {'id': 63, 'name': 'Backend', 'parent': 21}, 21: {'id': 21, 'name': 'Blogs', 'parent': 0}, 38: {'id': 38, 'name': 'Data Layer', 'parent': 2}, 3: {'id': 3, 'name': 'Data Structure', 'parent': 21}, 2: {'id': 2, 'name': 'Documentation', 'parent': 0}, 22: {'id': 22, 'name': 'Frontend', 'parent': 21}, 39: {'id': 39, 'name': 'Infrastructure Management', 'parent': 2}, 16: {'id': 16, 'name': 'Lists', 'parent': 3}, 36: {'id': 36, 'name': 'Network Layer', 'parent': 2}, 35: {'id': 35, 'name': 'Presentation Layer', 'parent': 2}, 40: {'id': 40, 'name': 'System Architecture', 'parent': 2}, 17: {'id': 17, 'name': 'Trees', 'parent': 3}, 1: {'id': 1, 'name': 'Uncategorized', 'parent': 0}}


In [58]:
# Fetch info of all posts that were modified after a specific datetime
modified_date = datetime(2024, 1, 1, 23, 59).isoformat()
# Request only the WordPress fields that map onto the Post schema.
fields = list(Post.get_field_mapping().values())
current_page = 1
per_page = 3
endpoint = "post"
filters = {
    "orderby": "date",
    "order": "asc",
    "modified_after": modified_date
}

first_page = fetch_list(endpoint, token=token, page=current_page, per_page=per_page, fields=fields, filters=filters)
total_pages = first_page["totalPages"]

# Keep the accumulated list separate from the paginated result dict so that
# `posts` only ever holds a list of post records.
posts = list(first_page["data"])
for current_page in range(2, total_pages + 1):  # empty range when there is one page
    page_result = fetch_list(endpoint, token=token, page=current_page, per_page=per_page, fields=fields, filters=filters)
    posts.extend(page_result["data"])
print(len(posts))

9


In [79]:
# Convert each post to post object
def retrieve_post_categories(categories:typing.List[int])->typing.List[str]:
    """Translate category ids into category names.

    Ids that are not present in the module-level ``post_categories`` mapping
    are skipped. An empty or missing id list yields an empty list.
    """
    if not categories:
        return []
    return [
        post_categories[category_id]["name"]
        for category_id in categories
        if category_id in post_categories
    ]

def _retrieve_names(ids, cache:dict, endpoint:str)->typing.List[str]:
    """Resolve ids to names, consulting ``cache`` before calling the REST API.

    Records fetched from ``endpoint`` are stored in ``cache`` under their id.
    Ids whose record cannot be fetched, or has no name, are skipped.
    """
    if not ids:
        return []

    names = []
    for item_id in ids:
        record = cache.get(item_id)
        if record is None:  # cache miss: fetch the record
            record = fetch_details(endpoint, token=token, fields=["id", "name"], uid=item_id)
            if not record or not record.get("name"):
                continue
            cache[item_id] = record
        names.append(record["name"])
    return names


def retrieve_tags(tag_ids:typing.List[int])->typing.List[str]:
    """Return the names of the given tag ids, filling the ``tags`` cache."""
    return _retrieve_names(tag_ids, tags, "tag")


def retrieve_authors(author_ids:typing.List[int])->typing.List[str]:
    """Return the names of the given author ids, filling the ``authors`` cache."""
    return _retrieve_names(author_ids, authors, "author")
            
def wordpress_to_post_object(post):
    """Convert a WordPress REST post payload into a ``Post`` object.

    Category, tag and author ids are resolved to names; categories and tags
    become comma-separated strings. ``title``, ``excerpt`` and ``content``
    are read from their ``rendered`` member, ``date_gmt`` is parsed into a
    ``datetime`` and the post id is converted to ``str`` to match the schema.
    Empty or missing fields are left to the defaults of ``Post.from_dict``.
    The input dictionary is not modified.
    """
    from datetime import timezone  # local: the notebook only imports `datetime` at the top

    # Work on a shallow copy so the caller's payload keeps its original ids.
    resolved = dict(post)

    # Ids that cannot be resolved are dropped instead of leaking an int list
    # into a field that is declared as a string.
    category_names = retrieve_post_categories(post.get("categories") or [])
    resolved["categories"] = ",".join(category_names)  # comma-separated names

    tag_names = retrieve_tags(post.get("tags") or [])
    resolved["tags"] = ",".join(tag_names)  # comma-separated names

    author_id = post.get("author")
    if author_id:
        author_names = retrieve_authors([author_id])
        # Fall back to the id as text when the author cannot be resolved.
        resolved["author"] = author_names[0] if author_names else str(author_id)

    mapped_post = {}
    for system_key, wordpress_key in Post.get_field_mapping().items():
        value = resolved.get(wordpress_key)
        if not value:
            continue

        if wordpress_key in ("title", "excerpt", "content"):
            # The text of these fields lives inside "rendered".
            rendered = value.get("rendered") if isinstance(value, dict) else None
            mapped_post[system_key] = rendered if rendered else str(value)
        elif wordpress_key == "date_gmt":
            # Convert the ISO string to datetime; best effort: fall back to
            # the current UTC time (naive, like the parsed value).
            try:
                mapped_post[system_key] = datetime.fromisoformat(value)
            except (TypeError, ValueError):
                mapped_post[system_key] = datetime.now(timezone.utc).replace(tzinfo=None)
        elif wordpress_key == "id":
            mapped_post[system_key] = str(value)  # postId is declared as str
        else:
            mapped_post[system_key] = value

    return Post.from_dict(mapped_post)
            

# Only the first fetched post is converted (note the [:1] slice).
post_objects = [wordpress_to_post_object(post) for post in posts[:1]]
    



{'code': 'rest_no_route', 'message': 'No route was found matching the URL and request method.', 'data': {'status': 404}}
{'code': 'rest_no_route', 'message': 'No route was found matching the URL and request method.', 'data': {'status': 404}}
{'code': 'rest_no_route', 'message': 'No route was found matching the URL and request method.', 'data': {'status': 404}}
{'code': 'rest_no_route', 'message': 'No route was found matching the URL and request method.', 'data': {'status': 404}}
{'code': 'rest_no_route', 'message': 'No route was found matching the URL and request method.', 'data': {'status': 404}}
{'code': 'rest_no_route', 'message': 'No route was found matching the URL and request method.', 'data': {'status': 404}}
{'code': 'rest_no_route', 'message': 'No route was found matching the URL and request method.', 'data': {'status': 404}}
{'code': 'rest_no_route', 'message': 'No route was found matching the URL and request method.', 'data': {'status': 404}}
{'code': 'rest_no_route', 'messa

KeyboardInterrupt: 