In [None]:
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client['looma_education']

# Fetch textbooks and resources.
# `find()` returns a lazy, single-pass cursor. Later cells both iterate over
# these collections and index into them by position, which a consumed cursor
# cannot support, so materialise the documents into lists once here.
textbooks = list(db.textbooks.find())
resources = list(db.resources.find())


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK data needed for tokenizing, stopword removal and
# lemmatizing. Recent NLTK releases load the 'punkt_tab' tables in
# word_tokenize instead of the pickled 'punkt' models; fetch both so the
# notebook runs on old and new installations alike.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# English stopwords as a set for O(1) membership tests.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalise raw text into a space-separated string of lemmatized tokens.

    The text is tokenized, then every token that is not purely alphabetic or
    that is an English stopword is dropped, and the remaining tokens are
    lowercased and lemmatized.

    Args:
        text: The raw string to clean.

    Returns:
        A single string of the kept tokens joined by spaces.
    """
    # Tokenize
    tokens = word_tokenize(text)

    cleaned = []
    for token in tokens:
        # Skip punctuation, numbers and mixed alphanumeric tokens.
        if not token.isalpha():
            continue
        # Lowercase BEFORE the stopword test: the stopword list is lowercase,
        # so capitalised words such as "The" would otherwise slip through.
        lowered = token.lower()
        if lowered in stop_words:
            continue
        cleaned.append(lemmatizer.lemmatize(lowered))
    return ' '.join(cleaned)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Clean the raw 'content' field of every document, keeping document order.
textbook_texts = list(map(preprocess, (book['content'] for book in textbooks)))
resource_texts = list(map(preprocess, (resource['content'] for resource in resources)))

# Build the TF-IDF vocabulary and weights from the textbooks only, then
# project the resources into that same feature space so the two sets of
# vectors are directly comparable.
vectorizer = TfidfVectorizer()
textbook_vectors = vectorizer.fit_transform(textbook_texts)
resource_vectors = vectorizer.transform(resource_texts)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity: one row per textbook vector, one column per
# resource vector.
similarity_matrix = cosine_similarity(
    X=textbook_vectors,
    Y=resource_vectors,
)


In [None]:
import numpy as np

# Pair every textbook with its single most similar resource.
# NOTE: `textbooks` and `resources` must support positional indexing in the
# same order their texts were vectorized (e.g. lists of documents).
matches = []
for idx, similarity_scores in enumerate(similarity_matrix):
    # Find the index of the most similar resource. Cast the NumPy integer to
    # a built-in int so it is a valid index for any sequence type.
    best_match_idx = int(np.argmax(similarity_scores))
    matches.append({
        'textbook_chapter_id': textbooks[idx]['_id'],
        'resource_id': resources[best_match_idx]['_id'],
        # Store a built-in float rather than a NumPy scalar so the value is
        # BSON-encodable regardless of the matrix dtype.
        'similarity_score': float(similarity_scores[best_match_idx])
    })

# Save matches to MongoDB.
# insert_many() rejects an empty document list, so skip the write when there
# is nothing to store.
# NOTE(review): re-running this cell inserts the same matches again — confirm
# whether db.matches should be cleared or upserted instead.
if matches:
    db.matches.insert_many(matches)
