In [1]:
sp = set(string.punctuation)
wordCount = defaultdict(int)
for post in content:
    r = ''.join([c for c in post.lower() if not c in sp])
    ws = r.split()
    ws2 = [' '.join(x) for x in list(zip(ws[:-1], ws[1:]))]
    for w in ws2:
        wordCount[w] += 1
        
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [w[1] for w in counts[:1000]]
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)
# print(counts)

def featBigrams(data):
    feat = [0] * len(words)
    r = ''.join([c for c in data.lower() if not c in sp])
    ws = r.split()
    ws2 = [' '.join(x) for x in list(zip(ws[:-1], ws[1:]))]
    for w in ws2:
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1)
    return feat

NameError: name 'string' is not defined

## Metadata Feature Recommender
This recommendation model is meant to extract features from the metadata in each post and determine similarity between them. In particular, there are three metadata features that do influence the recommendation: `tags`, `contributor`, and `wordCount`. The tags would be a good recommendation for readers who are interested in works of the same genre. The contributors, also known as the writers and/or artists, could interest the reader and thus recommend similar works by the authors. Word count can also play a role in determining if the reader wants to enjoy works of a similar length. The current recommendation system weighs each of these features equally, although future models may change the weights so that recommended works do not constrain the reader to a specific interest bubble.

In [1]:
import requests
import json
from collections import defaultdict
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
res = requests.get('http://calla-lily-git-api-other-people.vercel.app/api/post-data.json')
response = json.loads(res.text)
postSlugs = response.keys()
# Convert the json data to a list for indexing
listRes = [k for k,v in response.items()]

In [3]:
# Remove irrelevant metadata to make the data faster and easier to work with
def cleanData(data, key):
    try:
        del data[key]
    except:
        return

In [4]:
# Converts array metadata by converting each element into a single word, also removes additional details
def reformatArray(arr, key):
    try:
        if key == 'contributor':
            arr = re.split(',|&', arr)
        reformatted = []
        for item in arr:
            if key == 'contributor':
                item = re.sub(r'\(.*\)', '', item)
            item = item.strip()
            words = item.split(' ')
            if key == 'tags' and len(words) > 1:
                for w in words:
                    reformatted.append(w)
            item = ''.join(words)
            reformatted.append(item)
        return reformatted
    except:
        return arr

In [5]:
soupDf = [] # feature dataset
onlyMeta = [] 
removeKeys = ['layout', 'date', 'noHTML', 'thumbnail', 'theme', 'collection', 'prevPost', 'nextPost', 'title', 'contentWarning']
for post in postSlugs:
    copyData = response[post]
    for k in removeKeys:
        cleanData(copyData, k)
        
    copyData['contributor'] = reformatArray(copyData['contributor'], 'contributor')
    copyData['tags'] = reformatArray(copyData['tags'], 'tags')
    
    # Force edit word count to the manually set word count on custom-built works
    if 'manualWC' in copyData:
        copyData['wordCount'] = copyData['manualWC']
        del copyData['manualWC']
    
    copyData['soup'] = ' '.join(copyData['contributor']) + ' ' + ' '.join(copyData['tags']) + ' ' + str(copyData['wordCount'])
    copyData['soup'] = copyData['soup'].lower()
    soupDf.append(copyData['soup'])
    onlyMeta.append(copyData)

We use cosine similarity to compare the works because their data can be of variable length and it is beneficial to check the frequency of terms. Therefore, the angle of difference between the feature vectors is a stronger metric to use as compared to an euclidean approach.

Recommender model sourced by [https://michael-fuchs-python.netlify.app/2020/10/05/recommendation-systems-metadata-based-recommender/](https://michael-fuchs-python.netlify.app/2020/10/05/recommendation-systems-metadata-based-recommender/)

In [6]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(soupDf)
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [7]:
# Recommendation model with k closest works
def metaRecommender(slug, k=10, showSim=False, cosine_sim=cosine_sim, df=soupDf):
    idx = listRes.index(slug)
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:k+1]
    if showSim:
        sim_slugs = [(listRes[item[0]], round(item[1],4)) for item in sim_scores] 
    else:
        sim_slugs = [listRes[item[0]] for item in sim_scores]
    return sim_slugs

In [8]:
k = 10
metaRecommender('/2023/hole', k)

['/2023/soliloquy',
 '/2023/missed-connections',
 '/2022/west-coast-elegies',
 '/2023/out-of-use',
 '/2023/planet',
 '/2023/the-greatest-author',
 '/2023/too-easily',
 '/2023/to-build-castles-in-the-sky',
 '/2022/shore-walking',
 '/4/para-mi-hermane']

In [9]:
k = 10
showSim = False
metaRecs = {}
for slug in listRes:
    recommendations = metaRecommender(slug, k, showSim)
    metaRecs[slug] = recommendations
    # print(slug, recommendations)

In [10]:
# Convert the result to JSON and write to file

with open('recommender.json', mode='w', encoding='utf8') as outfile:
    outfile.write(json.dumps(metaRecs))