In [None]:
import json
import jsonschema
import pandas as pd
import os
from packaging import version
import yaml
import numpy as np

def convert_ghost_export(import_file, export_json_schema_file):
    """Load a Ghost export file and validate it against a JSON schema.

    Args:
        import_file: Path to the Ghost export JSON file.
        export_json_schema_file: Path to the JSON schema the export must satisfy.

    Returns:
        The parsed contents of ``import_file``.

    Raises:
        Exception: If the export does not validate against the schema. The
            underlying ``jsonschema`` ValidationError is chained as the cause.
    """
    import_file_json = import_json_file(import_file)
    export_json_schema = import_json_file(export_json_schema_file)
    try:
        # validate() takes (instance, schema). Passing them by keyword keeps
        # the export as the instance and the schema file as the schema.
        jsonschema.validate(instance=import_file_json, schema=export_json_schema)
    except jsonschema.exceptions.ValidationError as err:
        print(err)
        raise Exception("The export file does not match the schema file.") from err
    return import_file_json

def import_json_file(file_path):
    """Read a JSON file and return its parsed contents.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The decoded JSON value.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's
    # locale default, which is not UTF-8 everywhere.
    with open(file_path, encoding="utf-8") as f:
        return json.load(f)


# Load the export and check it against the Ghost 4.37.0 schema file.
ghost_data = convert_ghost_export(
    "../data/export.json",
    "../data/ghost-4.37.0.schema",
)


In [None]:
# Work with the first entry of the export's "db" list.
ghost_db = ghost_data["db"][0]
ghost_db_version = ghost_db['meta']['version']

# Compare the export's version with the one this notebook targets.
parsed_db_version = version.parse(ghost_db_version)
supported_version = version.parse("4.37.0")
if parsed_db_version > supported_version:
    print("Ghost database version is greater than 4.37.0.  This script may not work.")
elif parsed_db_version < supported_version:
    print("Ghost database version is less than 4.37.0.  This script may not work.")
else:
    print("Ghost database version is 4.37.0.  This script should work.")

# NOTE: this rebinds ghost_data from the whole export to its "data" section.
# Re-running this cell without re-running the load cell will therefore look
# for "db" inside that section instead of inside the export.
ghost_data = ghost_db['data']

In [None]:
# Name the export tables that the rest of the notebook reads.
ghost_posts_and_pages = ghost_data["posts"]
ghost_post_authors = ghost_data["posts_authors"]
ghost_posts_meta = ghost_data["posts_meta"]
ghost_posts_tags = ghost_data["posts_tags"]
ghost_tags = ghost_data["tags"]
ghost_users = ghost_data["users"]
ghost_settings = ghost_data["settings"]

### Handle settings

In [None]:
# Settings that are exported; every other row of the settings table is ignored.
ghost_settings_we_care_about = [
    "title",
    "description",
    "cover_image",
    "icon",
    "lang",
    "timezone",
    "codeinjection_head",
    "codeinjection_foot",
    "facebook",
    "twitter",
    "navigation",
    "secondary_navigation",
]

# Build the settings frame indexed by key so rows can be selected by name.
df_settings = pd.DataFrame(ghost_settings, columns=["key", "value"]).set_index("key")

# Select the wanted rows, in the order listed above.
df_settings_we_care_about = df_settings.loc[ghost_settings_we_care_about]
df_settings_we_care_about

In [None]:
# Flatten the single "value" column into a plain {key: value} dictionary.
settings_we_care_about = df_settings_we_care_about["value"].to_dict()
# The two navigation settings hold serialized JSON strings; decode them.
for nav_key in ("navigation", "secondary_navigation"):
    settings_we_care_about[nav_key] = json.loads(settings_we_care_about[nav_key])

In [None]:
# I really really don't like YAML, but it's broadly human readable.
settings_yaml = yaml.dump(settings_we_care_about, default_flow_style=False)
with open("../data/settings.yaml", "w") as f:
    f.write(settings_yaml)


## Handle tags

In [None]:
# Tag table restricted to the columns that are exported.
tag_columns = ["id", "name", "slug", "description", "feature_image"]
tags_by_slug = pd.DataFrame(ghost_tags, columns=tag_columns).set_index("slug")

# tags.yaml is keyed by slug.
tags_by_slug_dict = tags_by_slug.to_dict(orient="index")
with open("../data/tags.yaml", "w") as f:
    yaml.dump(tags_by_slug_dict, f, default_flow_style=False)

# Re-key by id so tags can be looked up by tag id. Replacing the index
# discards the slug, so df_tags and dict_tags no longer carry it.
df_tags = tags_by_slug.set_index("id")
dict_tags = df_tags.to_dict(orient="index")

### Handle users/authors

In [None]:
# Export users to YAML, keyed by user id.
user_columns = [
    "id",
    "name",
    "slug",
    "email",
    "profile_image",
    "cover_image",
    "bio",
    "website",
    "location",
    "facebook",
    "twitter",
]
df_users = pd.DataFrame(ghost_users, columns=user_columns).set_index("id")
dict_users = df_users.to_dict(orient="index")
with open("../data/users.yaml", "w") as f:
    yaml.dump(dict_users, f, default_flow_style=False)

# Handle posts
This is the big one. It needs to get all the posts, collect each post's tags into an array, and apply post metadata and authors correctly.

### Apply page and post metadata

In [None]:
# ghost_posts_meta to dataframe, keyed by the post each row belongs to
df_posts_meta = (
    pd.DataFrame(ghost_posts_meta)
    .set_index("post_id")
    # The row id and email_only are not exported. errors="ignore" keeps
    # this working when an export does not carry one of these columns.
    .drop(columns=["id", "email_only"], errors="ignore")
    # drop columns that are empty for every post
    .dropna(axis=1, how="all")
)

# Convert to {post_id: {field: value}} and drop missing values. pandas can
# represent a missing value as NaN as well as None, so test with pd.notna
# rather than "is not None". Assumes every value is a scalar.
dict_cleaned_posts_meta = {
    post_id: {key: value for key, value in meta.items() if pd.notna(value)}
    for post_id, meta in df_posts_meta.to_dict(orient="index").items()
}

#### Split posts and pages

In [None]:
# Every metadata key that appears on at least one post.
unique_keys = {
    meta_key
    for post_meta in dict_cleaned_posts_meta.values()
    for meta_key in post_meta
}
unique_keys

In [None]:
# Merge each post's cleaned metadata (when it has any) into the post record.
for post in ghost_posts_and_pages:
    post_meta = dict_cleaned_posts_meta.get(post["id"])
    if post_meta is not None:
        post.update(post_meta)

In [None]:
# Split the combined list by its "type" field, keeping the original order.
# Entries of any other type end up in neither list.
ghost_pages = []
ghost_posts = []
for entry in ghost_posts_and_pages:
    entry_type = entry["type"]
    if entry_type == "page":
        ghost_pages.append(entry)
    elif entry_type == "post":
        ghost_posts.append(entry)

### Handle pages

In [None]:
# Columns carried over for pages. A column that is missing from the export
# comes back empty and is removed by the dropna below.
page_columns = [
    "title", "slug", "html", "feature_image", "featured", "page", "status",
    "locale", "visibility", "meta_title", "meta_description", "author_id",
    "created_at", "created_by", "updated_at", "updated_by", "published_at",
    "published_by", "custom_excerpt", "codeinjection_head",
    "codeinjection_foot", "og_image", "og_title", "og_description",
    "twitter_image", "twitter_title", "twitter_description",
    "custom_template", "canonical_url", "url",
]

# create a pandas dataframe from the ghost_pages, keyed by slug
df_pages = (
    pd.DataFrame(ghost_pages, columns=page_columns)
    .set_index("slug")
    .dropna(axis=1, how="all")
)
df_pages["author_name"] = df_pages["author_id"].map(df_users["name"])
# Cast to object before replacing missing values: on a numeric column
# pandas coerces the None replacement straight back to NaN.
df_pages = df_pages.astype(object).where(pd.notnull(df_pages), None)

# remove fields that are missing (turned into None above)
dict_pages = {
    page_slug: {key: value for key, value in page_fields.items() if value is not None}
    for page_slug, page_fields in df_pages.to_dict(orient="index").items()
}

# create pages directory if it doesn't exist
os.makedirs("../data/pages", exist_ok=True)

# export each page's html to its own file and remove the html key
for page_slug, page_fields in dict_pages.items():
    # A page without html has no "html" key after the filter above. Write an
    # empty file for it so every page in pages.yaml has a companion file.
    html = page_fields.pop("html", "")
    # Write as UTF-8 explicitly rather than with the locale's default encoding.
    with open("../data/pages/" + page_slug + ".html", "w", encoding="utf-8") as f:
        f.write(html)
# TODO: Replace __GHOST_URL__ with a base site URL

with open("../data/pages.yaml", "w") as f:
    yaml.dump(dict_pages, f, default_flow_style=False)

In [None]:
# Build the posts frame keyed by slug, without columns that are empty for
# every post.
df_posts = (
    pd.DataFrame(ghost_posts)
    .set_index("slug")
    .dropna(axis=1, how="all")
)
df_posts["author_name"] = df_posts["author_id"].map(df_users["name"])
# mobiledoc is not exported. errors="ignore" because the dropna above has
# already removed the column when it is empty for every post.
df_posts = df_posts.drop(columns=["mobiledoc"], errors="ignore")
# Cast to object before replacing missing values: on a numeric column
# pandas coerces the None replacement straight back to NaN.
df_posts = df_posts.astype(object).where(pd.notnull(df_posts), None)
df_posts.head()


#### Apply tags

In [None]:
# put df_posts back into dictionary
from array import array  # NOTE(review): not used by any cell visible here; candidate for removal

dict_posts = df_posts.to_dict(orient="index")

# Group tag ids by post in a single pass over posts_tags, instead of
# rescanning the whole list for every post. The order within each post
# follows the order of ghost_posts_tags.
tag_ids_by_post = {}
for post_tag in ghost_posts_tags:
    tag_ids_by_post.setdefault(post_tag["post_id"], []).append(post_tag["tag_id"])

# apply tags to posts; df_tags is indexed by tag id at this point
for post_fields in dict_posts.values():
    post_fields["tags"] = [
        df_tags.loc[tag_id, "name"]
        for tag_id in tag_ids_by_post.get(post_fields["id"], [])
    ]

# remove fields that are None or an empty string (an empty tags list is kept)
dict_posts = {
    post_slug: {
        key: value
        for key, value in post_fields.items()
        if value is not None and value != ""
    }
    for post_slug, post_fields in dict_posts.items()
}

#### Apply authors

#### Export to YAML

### Cleanup


In [None]:
def cleanup_export_files(data_dir="../data"):
    """Delete the settings, tags and users YAML exports if they exist.

    Args:
        data_dir: Directory holding the exported files. Defaults to the
            "../data" directory the notebook writes to.
    """
    # The file that is checked and the file that is removed must be the
    # same path, so build it once per file.
    for file_name in ("settings.yaml", "tags.yaml", "users.yaml"):
        file_path = os.path.join(data_dir, file_name)
        if os.path.exists(file_path):
            os.remove(file_path)