# 4 Model Recommendation
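In this notebook we build the recommendation model from the cleaned article and product data: articles are indexed with TF-IDF, and for a given query text or product we rank the most similar articles.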

In [51]:
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import json
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity

In [53]:
# Load the cleaned article and product records from ./intermediate_data
with open("./intermediate_data/article-data-clean.json", "r", encoding="utf-8") as f:
    # `articles` is the name used by the rest of the notebook; `data` is the same object
    articles = data = json.load(f)

with open("./intermediate_data/product-data-clean.json", "r", encoding="utf-8") as f:
    products = json.load(f)

# Preprocess function
def preprocess(text):
    """Tokenize `text` with gensim's `simple_preprocess` and drop stopwords.

    Args:
        text: Raw string to tokenize.

    Returns:
        List of tokens, in their original order, that are not in gensim's
        `STOPWORDS` set.
    """
    kept_tokens = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue
        kept_tokens.append(word)
    return kept_tokens

# Tokenized body text of every article, in the same order as `articles`
texts = [preprocess(entry["content"]) for entry in articles]

# Vocabulary built from the tokenized articles, plus one bag-of-words vector per article
dictionary = Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

# Fit TF-IDF on the bag-of-words corpus, then apply it to that same corpus
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Similarity index over the TF-IDF vectors; feature count equals the vocabulary size
index = SparseMatrixSimilarity(corpus_tfidf, num_features=len(dictionary))

# Recommend articles similar to a query text
def recommend_similar_articles(query_text, top_n=5):
    """Return the indexed articles most similar to a free-text query.

    The query goes through the same steps as the indexed articles
    (`preprocess` -> bag-of-words via `dictionary` -> `tfidf`) and is then
    scored against every article in `index`.

    Args:
        query_text: Raw query string.
        top_n: Maximum number of results to return.

    Returns:
        List of `(article, score)` tuples, highest score first. Ties keep
        the order the articles have in `articles`.
    """
    query_vector = tfidf[dictionary.doc2bow(preprocess(query_text))]
    scored = list(enumerate(index[query_vector]))
    # list.sort is stable, so equal scores stay in index order even with reverse=True
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [(articles[doc_id], sim) for doc_id, sim in scored[:top_n]]

# Recommend articles related to a product
def recommend_articles_for_product(product, top_n=5):
    """Return the indexed articles most similar to a product's text.

    Args:
        product: Mapping describing one product. Its text is taken from
            'clean_text', falling back to 'content', and finally to an
            empty string when neither key holds a non-empty value.
        top_n: Maximum number of results to return.

    Returns:
        List of `(article, score)` tuples, highest score first, exactly as
        produced by `recommend_similar_articles` for the product's text.
    """
    product_text = product.get('clean_text') or product.get('content') or ""
    # A product is scored like any other text query, so delegate instead of
    # repeating the tokenize -> bag-of-words -> TF-IDF -> rank steps here.
    return recommend_similar_articles(product_text, top_n=top_n)

# Example usage:
# Article-to-article: query with the text of the article at index 1. That article
# is itself part of the index, so it is expected back as the top hit.
query_article_text = articles[1]["content"]
print("Articles similar to article 1:")
results = recommend_similar_articles(query_article_text)
for res_article, score in results:
    print(f"Score: {score:.3f} - Title: {res_article['title']}")

# Product-to-article: rank the indexed articles against the product at index 1.
product = products[1]
print(f"\nTop articles related to product: {product['name']}")
results = recommend_articles_for_product(product)  # rebinds `results` from the article query
for article, score in results:
    print(f"Score: {score:.3f} - Title: {article['title']}")


Articles similar to article 1:
Score: 1.000 - Title: Podcast Round Up: Tahnée Seagrave on Speedsuits, Ken Roczen To MTB, Mike Levy Talks Scorpions, & More
Score: 0.113 - Title: Seeding Results: 2025 Monster Energy Pro Downhill Series Round 1 - UCI Continental Series
Score: 0.097 - Title: The Complete Guide to the 2025 World Cup DH Teams
Score: 0.083 - Title: 7 Interviews from the iXS Cup: World Cup Racers on Off-Season Changes
Score: 0.077 - Title: Pro Riders Announced for the 2025 NZ MTB Rally

Top articles related to product: Hayduke Ti
Score: 0.278 - Title: First Ride: Polygon's $3,199 Collosus N9
Score: 0.270 - Title: Frameworks Launches 2025 DH Frame
Score: 0.247 - Title: First Ride: Rotwild R.EX Mid-Pivot eMTB
Score: 0.234 - Title: First Look: Revel Launches 3 New Bikes Including an Enduro & eMTB Model
Score: 0.223 - Title: The Tripping Balls is a Steel Gearbox-Equipped Trail Bike with a Fresh Canfield Kinematic - Taipei Cycle Show 2025


## Next step
After you have saved the recommendation model here, run the next step in the workflow, [05-PresentRec.py](./05-PresentRec.py), or go back to [00-Workflow.ipynb](./00-Workflow.ipynb).

---

**Authors:**
[Salah Mohamoud](mailto:salah.mohamoud.dev@gmail.com),
[Sai Keertana Lakku](mailto:saikeertana005@gmail.com),
[Zhen Zhuang](mailto:zhuangzhen17cs@gmail.com),
[Nick Capaldini](mailto:nick.capaldini@ridethenextwave.com), Ride The Next Wave, May 19, 2025

---