In [1]:
# Import files
import os # system
# you can import files google drive through colab, adjusting the csv file path appropriately
#from google.colab import drive
#drive.mount('/content/drive')
#from google.colab import files

# Manual
This notebook will retrain the model and export the appropriate files. Model retraining is only necessary if incorporating new datasets. Otherwise, the model and files in the application do not need to be updated.

Follow the in line comments for pointers and tips.

Setup:

Install requirements.txt or use !pip install code provided in cells.
Set up HuggingFace API Token in .env file or through notebook using os.environ["HUGGINGFACEHUB_API_TOKEN"] = 'your token'

Adjust the CSV file path and append data as necessary.

Manual tuning will be required. This occurs in the Spectral Clustering Tuning section, where the spectral clustering model is to be tuned based on the Silhouette Scores.

File output is at the end of the notebook. The clustered csv, embeddings, similarity matrix, and FAISS index will be saved to your directory. You do not need to save the clustering model. The maximum total file size is 100 MB.

This may take up to 30 minutes to initialize and an additional 15 minutes to tune and complete.

This notebook was made through and can be run through Google CoLab, for free access to computing resources in your browser. Formatting may require adjustments.

# Import Libraries
Define functions, import libraries and files.

In [2]:
# CSV File path, adjust as necessary based on your directory
csv_file ='data/isbndb-caribbean-books.csv'

In [3]:
# Provide HuggingFace API token
#os.environ["HUGGINGFACEHUB_API_TOKEN"] = 'your token'

In [4]:
# Install Libraries
import pandas as pd
import numpy as np
import pickle

In [7]:
# Model Libraries
from sklearn.cluster import SpectralClustering
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Metrics Libraries
!pip install matplotlib
import matplotlib.pyplot as plt
import timeit
from sklearn.metrics import silhouette_score, davies_bouldin_score

Collecting matplotlib
  Downloading matplotlib-3.9.2-cp39-cp39-macosx_10_12_x86_64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.55.0-cp39-cp39-macosx_10_9_x86_64.whl.metadata (164 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.7-cp39-cp39-macosx_10_9_x86_64.whl.metadata (6.3 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.2.0-py3-none-any.whl.metadata (5.0 kB)
Downloading matplotlib-3.9.2-cp39-cp39-macosx_10_12_x86_64.whl (7.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading contourpy-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl (265 kB)

Matplotlib is building the font cache; this may take a moment.


In [6]:
# Sentence Transformers and Langchain libraries
!pip install chromadb langchain tiktoken sentence-transformers faiss-cpu
!pip install langchain langchain_community
from langchain.llms import HuggingFaceHub
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma, FAISS
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document # Import Document class

model = SentenceTransformer('all-MiniLM-L6-v2')
llm = HuggingFaceHub(repo_id = "microsoft/Phi-3-mini-4k-instruct", model_kwargs={"temperature":0.6})



  llm = HuggingFaceHub(repo_id = "microsoft/Phi-3-mini-4k-instruct", model_kwargs={"temperature":0.6})


ValidationError: 1 validation error for HuggingFaceHub
  Value error, Did not find huggingfacehub_api_token, please add an environment variable `HUGGINGFACEHUB_API_TOKEN` which contains it, or pass `huggingfacehub_api_token` as a named parameter. [type=value_error, input_value={'repo_id': 'microsoft/Ph...acehub_api_token': None}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/value_error

In [None]:
# Language detection and translation functions
!pip install langdetect
!pip install deep_translator
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from deep_translator import GoogleTranslator

DetectorFactory.seed = 0

# Input language determinate
class LanguageDetector:
    def __init__(self, text=""):
        self.text = text

    def set_text(self, text):
        self.text = text

    def detect_language(self):
        try:
            language_code = detect(self.text)
            return language_code
        except LangDetectException:
            language_code = "en"
            return language_code

# Translate text to English
def translate_to_english(text):
    try:
        translated = GoogleTranslator(source='auto', target='en').translate(text)
        return translated
    except:
        return text

In [None]:
#Text Preprocessing
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
import sys
!{sys.executable} -m pip install contractions
import contractions

def preprocess(s):
    s=str(s)
    s = s.lower() # lowercase !
    s = contractions.fix(s) # expand contractions
    s = re.sub(r'\n', ' ', s) # remove \n
    s = re.sub(r'http\S+', '', s) # remove url
    s = re.sub(r'<.*?>', '', s) # remove html
    s = re.sub(r'\d+', '', s) # remove numbers
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'[^\w\s]', '', s) # remove punctuation and special characters
    s = word_tokenize(s) # tokenize
    s = [w for w in s if w not in set(stopwords.words('english'))] # stop words
    s = [stemmer.stem(w) for w in s] # stemming
    return " ".join(s) # white spaces

# Preprocess Data
Clean and translate

In [None]:
# Import book data CSV
df = pd.read_csv(csv_file, encoding= 'latin1')

In [None]:
# Clean and preprocess data
df_match = df.copy()
# Remove duplicates
df_match.drop_duplicates(subset = 'title', inplace=True)
df_match.drop_duplicates(subset = 'synopsis', inplace=True)
# Remove independently published books and books missing publishers
df_match = df_match[~df_match['publisher'].str.contains("ndependent", na=False)]
df_match.dropna(subset='publisher', inplace=True)
# Remove books missing both subject and synopsis
df_match.dropna(subset=['subjects', 'synopsis'], inplace=True, how='all')
# Drop irrelevant columns
df_match.drop(['title_long', 'binding', 'pages', 'image', 'isbn', 'isbn10', 'isbn13', 'authors', 'msrp', 'edition', 'date_published'], axis=1, inplace=True)
# Filling missing data with empty strings
df_match['subjects'].fillna(value='', inplace=True)
df_match['synopsis'].fillna(value='', inplace=True)

In [None]:
# Detect languages and translate to English
detector = LanguageDetector()
for idx, row in df_match.iterrows():
    detector.set_text(row['synopsis'])
    detected_language = detector.detect_language()
    df_match.at[idx, 'language'] = detected_language
    df_match.at[idx, 'synopsis'] = translate_to_english(row['synopsis'])
    df_match.at[idx, 'subjects'] = translate_to_english(row['subjects'])

In [None]:
# df_match will be used to create FAISS index
# Dataframe for model, does not include publisher or language
df_model = df_match.copy()
df_model.drop(['publisher', 'language'], axis=1, inplace=True)

In [None]:
# Dataframe for matching publisher, will be used to integrate data with clustering results
df_publisher = df_match.copy()
df_publisher.reset_index(drop=True, inplace=True)

In [None]:
# Apply preprocessing to modeling dataframe
df_model=df_model.map(lambda s:preprocess(s)).copy()
df_model.reset_index(drop=True, inplace=True)

# SentenceTransformer Embeddings

In [None]:
# Generate separate embeddings for each column, may be used for multiview that analyze separately
expanded_subjects = "This text discusses " + df_model['subjects']
subjects_embeddings = model.encode(expanded_subjects.tolist(), batch_size=32, show_progress_bar=True)
synopsis_embeddings = model.encode(df_model['synopsis'].tolist(), batch_size=32, show_progress_bar=True)

# Subject:Synopsis = 9:1
weighted_embeddings = []
for subject, synopsis, subject_text in zip(subjects_embeddings, synopsis_embeddings, expanded_subjects):
    if subject_text.strip() == "":  # If `subject` is empty
        subject_weight = 0
        synopsis_weight = 1
    else:
        subject_weight = 0.9
        synopsis_weight = 0.1

    weighted_embedding = subject_weight * subject + synopsis_weight * synopsis
    weighted_embeddings.append(weighted_embedding)

weighted_embeddings = np.array(weighted_embeddings)

# Cosine Similarity Matrix

In [None]:
# cosine similarity matrix
similarity_matrix = cosine_similarity(weighted_embeddings)
# threshold --> create sparse matrix
threshold = np.percentile(similarity_matrix, 95)
similarity_matrix[similarity_matrix < threshold] = 0

# Spectral Clustering Tuning
Select ideal cluster number based on Silhouette Scores.

In [None]:
# Visual analysis: Silhouette Score
# Graph Silhouette Scores. Closer to 1 is best.
# Determine ideal number of clusters at peak drop off
# Typically around 50 to 60
scores = []
# Adjust range to hone in on ideal clusters. range(start,stop,step)
cluster_range =range(50,100,5)
for n in cluster_range:
    start= timeit.default_timer()
    model = SpectralClustering(n_clusters=n, affinity='precomputed', assign_labels='discretize', random_state=42)
    cluster_labels = model.fit_predict(similarity_matrix)
    score = silhouette_score(similarity_matrix, cluster_labels, metric='cosine')
    scores.append(score)
    stop = timeit.default_timer()
    print(f"Silhouette Score for {n} clusters: {score} time: {stop - start}")

plt.plot(cluster_range, scores, marker='o')
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Score")
plt.title("Silhouette Score Analysis for Optimal n_clusters")
plt.show()

In [None]:
# Tune final cluster model
# Select n_clusters based on Silhouette Score Analysis
# Typically from 50 to 60
spectral = SpectralClustering(
    n_clusters= 60, # Adjust number of clusters according to Silhouette Score Analysis
    affinity='precomputed',
    assign_labels='discretize',
    random_state=42
)

In [None]:
# Fit  and Predict Spectral Model onto Data
df_publisher['cluster'] = spectral.fit_predict(similarity_matrix)
silhouette_avg = silhouette_score(similarity_matrix, df_publisher['cluster'], metric='cosine')
davies_bouldin = davies_bouldin_score(weighted_embeddings, df_publisher['cluster'])
print("Silhouette Score: ", silhouette_avg) # Closer to 1 is best
print("Davies-Bouldin Score: ", davies_bouldin) # Lower is best

# FAISS Index
This may produce a file index.csv, which can be discarded after.

In [None]:
# Load data
df_match.to_csv('index.csv',index=False)
loader = CSVLoader(file_path="index.csv")
data = loader.load()
# Transform data
text_splitter = CharacterTextSplitter(chunk_size=2500, chunk_overlap=0)
texts = text_splitter.split_documents(data)

In [None]:
#embeddings = HuggingFaceEmbeddings(model_name='bert-base-uncased')
embeddings = HuggingFaceEmbeddings(model_name='paraphrase-MiniLM-L6-v2')

In [None]:
# Fill Vector DB
docsearch = FAISS.from_documents(texts, embeddings)
retriever=docsearch.as_retriever()

# File Output
All required model files are saved below.

In [None]:
# The Spectral Clustering Model does not need to be saved for the purposes of the application
# Save Spectral Model with Pickle
#with open('spectral_model.pkl', 'wb') as file:
#    pickle.dump(spectral, file)

In [None]:
# Save csv with cluster assignments
df_publisher.to_csv('df_spec_modeling.csv',index=False)

In [None]:
# Save similarity matrix and embeddings of data
np.save('similarity_matrix.npy', similarity_matrix)
np.save('weighted_embeddings.npy', weighted_embeddings)

In [None]:
# Save FAISS index, system requires both faiss and  pkl file
docsearch.save_local("faiss_index")