In [9]:
%pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [10]:
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.utils import class_weight
import requests
import time

def web_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--verbose")
    options.add_argument('--no-sandbox')
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument("--window-size=1920, 1200")
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)
    return driver

URL = "https://libguides.ucmerced.edu/az/databases"
driver = web_driver()
driver.get(URL)
time.sleep(5)


In [12]:
while True:
    try:
        loadMore = driver.find_element(By.XPATH,"//button[contains(text(), 'Load More')]")
        loadMore.click()
        time.sleep(5)
    except:
        break

In [13]:
# Get title elements
products = driver.find_elements(By.CSS_SELECTOR, ".az-title")
titles = [product.get_attribute("innerText") for product in products]  # Works fine

# Get description elements
products = driver.find_elements(By.CSS_SELECTOR, ".az-description.az-description-max-height.az-description-short")
descriptions = [product.get_attribute("textContent") for product in products]  # Fixed typo & method usage



print(titles)
print(descriptions)


['Alaska Native Language Archive\nThis link opens in a new window', 'A-Z Maps Online (World Trade Press)\nThis link opens in a new window', 'A&AePortal (Yale)\nThis link opens in a new window', 'ABELL: Annual Bibliography of English Language and Literature\nThis link opens in a new window', 'Abraham Lincoln Papers at the Library of Congress\nThis link opens in a new window', 'Academic Conferences in China (Wangfang Data)\nThis link opens in a new window', 'Academic Search Complete \nThis link opens in a new window', 'Academic Video Online (AVON)\nThis link opens in a new window', 'Access Engineering\nThis link opens in a new window', 'ACLS Humanities Ebook Collection \nThis link opens in a new window', 'ACM Digital Library\nThis link opens in a new window', 'Acta Sanctorum\nThis link opens in a new window', 'Ad*Access Project\nThis link opens in a new window', 'Add Health: The National Longitudinal Study of Adolescent to Adult Health\nThis link opens in a new window', 'Africa Developme

In [18]:
data = {'Titles': titles, 'Descriptions': descriptions}
df = pd.DataFrame(data)
df['Combined_Text'] = df['Titles'] + " " + df['Descriptions']

# Combine user input with dataset for vectorization
texts = df['Combined_Text'].tolist()
user_input = input()
texts.append(user_input)

# Convert to TF-IDF vectors
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(texts)

# Compute cosine similarity (user input vs dataset)
cosine_similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])

# Add similarity scores to DataFrame
df['similarity'] = cosine_similarities.flatten()

# Get top 3 matches
top_matches = df.nlargest(3, 'similarity')
print(top_matches[['Titles', 'Descriptions']])

                                                Titles  \
2    A&AePortal (Yale)\nThis link opens in a new wi...   
245  Digital Archive of Latin American and Caribbea...   
84   Art History Resources on the Web\nThis link op...   

                                          Descriptions  
2    \n                    A collection of eBooks h...  
245  \n                    A collection of digitize...  
84   \n                    Links to extensive resou...  
