In [1]:
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')
from google.colab import files

Mounted at /content/drive


In [91]:
import pandas as pd
import numpy as np

In [92]:
# Original CSV
df = pd.read_csv('/content/drive/MyDrive/BTT CARI-CONNECT /CariConnect Dataset /isbndb-caribbean-books.csv', encoding= 'latin1')

In [93]:
df_match = df.copy()
# Remove duplicates
df_match.drop_duplicates(subset = 'title', inplace=True)
df_match.drop_duplicates(subset = 'synopsis', inplace=True)
# Remove independently published books and books missing publishers
df_match = df_match[~df_match['publisher'].str.contains("ndependent", na=False)]
df_match.dropna(subset='publisher', inplace=True)
# Remove books missing both subject and synopsis
df_match.dropna(subset=['subjects', 'synopsis'], inplace=True, how='all')
# Drop irrelevant columns
df_match.drop(['title_long', 'binding', 'pages', 'image', 'isbn', 'isbn10', 'isbn13', 'authors', 'msrp', 'edition', 'date_published'], axis=1, inplace=True)
# Filling missing data with empty strings
df_match['subjects'].fillna(value='', inplace=True)
df_match['synopsis'].fillna(value='', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_match.dropna(subset='publisher', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_match.dropna(subset=['subjects', 'synopsis'], inplace=True, how='all')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_match.drop(['title_long', 'binding', 'pages', 'image', 'isbn', 'isbn10', 'isbn13', 'authors', 'msrp', 'edition', 'date_published'], axis=1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interme

In [94]:
!pip install langdetect
!pip install deep_translator
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from deep_translator import GoogleTranslator

DetectorFactory.seed = 0

# Input language determinate
class LanguageDetector:
    def __init__(self, text=""):
        self.text = text

    def set_text(self, text):
        self.text = text

    def detect_language(self):
        try:
            language_code = detect(self.text)
            return language_code
        except LangDetectException:
            language_code = "en"
            return language_code

# Get the Language Preferred Publisher
class LanguagePreferredPublisher:
    def __init__(self, dataset):
        self.dataset = dataset

    def get_publishers_by_language(self, language_code):
        """
        Filters and returns a list of unique publishers that accept the given language.
        """
        # Filter dataset for rows with the specified language
        filtered_df = self.dataset[self.dataset['language'] == language_code]

        # Get unique publishers who accept this language
        publishers = filtered_df['publisher'].unique().tolist()

        if publishers:
            print(f"Publishers that accept language '{language_code}': {publishers}")
        else:
            print(f"No publishers found for language '{language_code}'.")

        return publishers

def translate_to_english(text):
    try:
        translated = GoogleTranslator(source='auto', target='en').translate(text)
        return translated
    except:
        return text



In [95]:
# Detect languages and translate synopsis to English
detector = LanguageDetector()
for idx, row in df_match.iterrows():
  detector.set_text(row['synopsis'])
  detected_language = detector.detect_language()
  df_match.at[idx, 'language'] = detected_language
  df_match.at[idx, 'synopsis'] = translate_to_english(row['synopsis'])
  #publisher_filter = LanguagePreferredPublisher(df)
  #publishers_for_french = publisher_filter.get_publishers_by_language('fr')

In [96]:
# Model dataframe
df_model = df_match.copy()
df_model.drop(['publisher', 'language'], axis=1, inplace=True)

In [97]:
# Text Preprocessing
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
import sys
!{sys.executable} -m pip install contractions
import contractions

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




In [98]:
def preprocess(s):
    s=str(s)
    s = s.lower() # lowercase
    s = contractions.fix(s) # expand contractions
    s = re.sub(r'\\n', ' ', s) # remove \n
    s = re.sub(r'http\S+', '', s) # remove url
    s = re.sub(r'<.*?>', '', s) # remove html
    s = re.sub(r'\d+', '', s) # remove numbers
    s = re.sub(r'[^\w\s]', ' ', s) # remove punctuation and special characters
    s = word_tokenize(s) # tokenize
    s = [w for w in s if w not in set(stopwords.words('english'))] # stop words
    s = [stemmer.stem(w) for w in s] # stemming
    return " ".join(s) # white spaces

In [99]:
df_model=df_model.map(lambda s:preprocess(s)).copy()

In [100]:
# TF IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=5, max_df=.99)

In [101]:
# merged TF IDF
df_model['merged'] = df_model['title'] + ' ' + df_model['subjects'] + ' ' + df_model['synopsis']
tfidf.fit(df_model['merged'])

In [102]:
# column weights
title_weight=.05
subject_weight=.8
synopsis_weight=.15

In [103]:
# Apply TF-IDF to each column
def tfidfer(df_col):
  matrix = tfidf.transform(df_col)
  return pd.DataFrame(matrix.toarray(), columns=tfidf.get_feature_names_out(), index = df_model.index)
title_tfidf = title_weight*tfidfer(df_model['title'])
subject_tfidf = subject_weight*tfidfer(df_model['subjects'])
synopsis_tfidf = synopsis_weight*tfidfer(df_model['synopsis'])
tfidf_df = pd.concat([title_tfidf,subject_tfidf,synopsis_tfidf], axis=1)

In [104]:
vector_df = tfidf_df.copy()

In [105]:
# Scaling
from sklearn.preprocessing import MaxAbsScaler
max_scaler = MaxAbsScaler()
max_df = max_scaler.fit(vector_df)
max_df = pd.DataFrame(max_scaler.transform(vector_df), columns=vector_df.columns, index=vector_df.index)

In [106]:
scaled_df = max_df.copy()

In [107]:
# SVD dimension reduction
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50, algorithm='arpack')
svd.fit(scaled_df)

In [108]:
svd_df = pd.DataFrame(svd.transform(scaled_df), index=vector_df.index)

In [109]:
reduced_df=svd_df.copy()

In [110]:
# Birch clustering
from sklearn.cluster import Birch

In [163]:
birch_model = Birch(threshold=0.25, n_clusters=75)

In [164]:
cmodel = birch_model

In [165]:
fit = cmodel.fit(reduced_df)
labels = fit.labels_
labels

array([ 4, 68, 15, ..., 67, 71, 56])

In [195]:
df_model['label'] = labels # get labels

In [167]:
predictions = cmodel.predict(reduced_df)

In [119]:
# Clean and vectorize input
title_input = "Changó, el gran putas"
subject_input = "Fiction Novels Spanish Literature"
synopsis_input = "Manuel Zapata Olivella nació en Lorica, Córdoba, 1920, y murió en Bogotá en 2004. Muy niño llegó a Cartagena. Fue médico, antropólogo, folclorista y escritor. En los años sesenta y setenta dirigió la revista Letras Nacionales. Durante veinte años investigó para su novela central Changó, el gran putas, cuya solución poética encontró luego de pasar una noche desnudo en una de las oscuras y sofocantes bóvedas de la fortaleza de la isla Goré, prisión de Senegal en la cual eran recluidos los africanos cazados, antes de su traslado en barcos al Nuevo Mundo.  Esta novela y gran epopeya es un inmenso fresco que cubre quinientos años de historia, para la cual Zapata recurrió a lo que denominó realismo mítico. Da cuenta de los dioses tutelares y cosmovisión de la religión yoruba, incorpora proverbios, trabalenguas, cuentos de hadas y canciones de la tradición africana. Recorre las hazañas de los héroes negros en las revoluciones americanas. Zapata demuestra que los negros nunca impusieron nada a nadie, más bien contagiaron su baile, sensualidad, comida, lenguaje.  El profesor Darío Henao Restrepo, en el prólogo, explica la concepción subyacente a esta obra: El principio filosófico del muntu, que rige su elaboración poética, implica una connotación del hombre que incluye a los vivos y difuentos, así como animales, vegetales, minerales y cosas que le sirven. Se trata de una fuerza espiritual que une en un solo nudo al hombre con su ascendencia y descendencia, inmersos en el universo presente, pasado y futuro."
detector.set_text(synopsis_input)
detect_input = detector.detect_language()
if detect_input!='en':
  synopsis_input = translate_to_english(synopsis_input)
lang_input = detect_input # detect language, or selected language

In [120]:
def preprocess_transform(s):
  preprocessed = preprocess(s)
  transformed = tfidf.transform([preprocessed]).toarray()
  return transformed

In [121]:
clean_title_input = title_weight*preprocess_transform(title_input)
clean_subject_input = subject_weight*preprocess_transform(subject_input)
clean_synopsis_input = synopsis_weight*preprocess_transform(synopsis_input)
input_df = pd.DataFrame(np.concatenate([clean_title_input, clean_subject_input, clean_synopsis_input], axis=1))
input_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38108,38109,38110,38111,38112,38113,38114,38115,38116,38117
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [122]:
# scale input
input_df = max_scaler.transform(input_df)



In [123]:
# reduce dimensions of input
svd_input = svd.transform(input_df)



In [124]:
red_input = svd_input

In [125]:
# predict test input
input_pred = loaded_model.predict(red_input)[0]

57

In [180]:
# vector subset where label matches
subset = df_model[df_model['label'] == input_pred]

In [181]:
# get subset
sim_df = reduced_df.copy()
sim_subset = sim_df.loc[subset.index]

In [182]:
# calculate cosine similarity within that cluster
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
simscore = cosine_similarity
similarity = simscore(red_input, sim_subset).T #input
similarity

array([[0.60186061],
       [0.61948364],
       [0.83731933],
       [0.90354853],
       [0.63898383],
       [0.84223134],
       [0.60961041],
       [0.82195159],
       [0.85957658],
       [0.86004361],
       [0.75445518],
       [0.87181897],
       [0.70108879],
       [0.8280588 ],
       [0.59184307],
       [0.69854145],
       [0.73214878],
       [0.7754082 ],
       [0.79889669],
       [0.80042097],
       [0.7133256 ],
       [0.67326644],
       [0.6996498 ],
       [0.90468258],
       [0.58304482],
       [0.83231561],
       [0.84386228],
       [0.70960232],
       [0.86693958],
       [0.51083797],
       [0.82608431],
       [0.53053202],
       [0.72785904],
       [0.74185257],
       [0.87976517],
       [0.71136139],
       [0.61405027],
       [0.80674986],
       [0.83817818],
       [0.64267066],
       [0.80234558],
       [0.83539664],
       [0.74997067],
       [0.69744604],
       [0.84646792],
       [0.85538655],
       [0.79370404],
       [0.806

In [227]:
# append similarity to dataframe
subset['similarity'] = similarity

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['similarity'] = similarity


In [234]:
subset.loc[df_match['language'] == lang_input, 'similarity']+=1 # prioritize same language

In [235]:
# top 5 similar
top_subset = subset.sort_values(by='similarity', ascending=False).head(5)
top_subset

Unnamed: 0,title,subjects,synopsis,merged,label,similarity
3606,caribbean lipstick spanish edit,literatur fiction genr fiction contemporari,manuel de la fuent cuban origin emigr thrive v...,caribbean lipstick spanish edit literatur fict...,57,2.803664
5619,la isla de las mil fuent seri del carib jamaic...,literatur fiction genr fiction famili saga his...,author white cloud trilog come grip evoc new f...,la isla de las mil fuent seri del carib jamaic...,57,2.787571
932,caribbean passag stori young woman surviv,literatur fiction contemporari,set british guiana trinidad caribbean passag t...,caribbean passag stori young woman surviv lite...,57,0.904683
311,caribbean caribbean whatev,literatur fiction contemporari,scarier humong humpback nuzzl hull hurrican ma...,caribbean caribbean whatev literatur fiction c...,57,0.903549
3918,tree life novel caribbean,literatur fiction contemporari,imposs read novel come away sadder exhilar und...,tree life novel caribbean literatur fiction co...,57,0.898829


In [185]:
top_index = top_subset.index
top_index

Index([932, 311, 3918, 5938, 3098], dtype='int64')

In [186]:
# get information of top 5
top_titles = []
top_subjects = []
top_synopsis = []
top_publishers = []
top_languages = []

In [133]:
def book_info(index):
  title = df_match.loc[index]['title']
  subjects = df_match.loc[index]['subjects']
  synopsis = df_match.loc[index]['synopsis']
  publisher = df_match.loc[index]['publisher']
  language = df_match.loc[index]['language']
  return title, subjects, synopsis, publisher, language

In [209]:
for i in range(len(top_index)):
  top_titles.append(book_info(top_index[i])[0])
  top_subjects.append(book_info(top_index[i])[1])
  top_synopsis.append(book_info(top_index[i])[2])
  top_publishers.append(book_info(top_index[i])[3])
  top_languages.append(book_info(top_index[i])[4])
  print(book_info(top_index[i])[0])

Caribbean Passage: The Story of a Young Woman's Will to Survive
Caribbean or Caribbean . . . Whatever
Tree of Life: A Novel of the Caribbean
The Caribbean Killers
Pearl: A Caribbean Story


In [135]:
# GROQ API to prompt LLAMA
!pip install groq



In [136]:
%env GROQ_API_KEY=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

env: GROQ_API_KEY=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


In [137]:
from groq import Groq

In [138]:
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

In [210]:
chat_completion = client.chat.completions.create(
    #
    # Required parameters
    #
    messages=[
        # Set an optional system message. This sets the behavior of the
        # assistant and can be used to provide specific instructions for
        # how it should behave throughout the conversation.
        {
            "role": "system",
            "content": f"You are a knowledgable and straightforward assistant with experience in literature and literary analysis, and knowledge of the publishing industry. You are explaining to authors which books are most similar to theirs using criteria such as themes, plot, character, setting, and tone. Your responses are concise and academic, strictly providing lists and explanations. Structure the response in a list, where each entry is formatted, with elaboration as necessary. (Do not mention the books' alphabetical labels A B C D E) : \n (Rank number). (Publisher) \n Published in (Language)\n Explanation: (Publisher) published (Title), a (Subject) book that is similar to your book because (Explanation, 3 sentence justification of the book similarities). Therefore, your book appeals to (Publisher)'s sector of the market."
        },
        # Set a user message for the assistant to respond to.
        {
            "role": "user",
            "content": f"Using the input book information as a reference, compare and rank the similarity of books A, B, C, D, and E to the reference. Use literary analysis to evaluate each book based on thematic alignment, such as central conflict, character focus, and setting, and stylistic features such as narrative tone and diction. \n Provide a list of the respective publishers of the books in descending order of book similarity to the reference book with a 3 sentence justification of the book similarities and ranking, citing specific aspects of the book information and synopsis that supports your assessment. Structure the response in a list, where each entry is formatted, with elaboration as necessary. (Do not mention the books' alphabetical labels A B C D E) : \n (Rank number). (Publisher) \n Published in (Language)\n Explanation: (Publisher) published (Title), a (Subject) book that is similar to your book because (Explanation, 3 sentence justification of the book similarities). Therefore, your book appeals to (Publisher)'s sector of the market. \n \n Book Reference: \n Title: {title_input}. \n Subject: {subject_input} \n Synopsis: {synopsis_input} \n \n Book A: \n Title: {top_titles[0]} \n Subject: {top_subjects[0]} \n Synopsis: {top_synopsis[0]} \n Publisher: {top_publishers[0]} \n Language: {top_languages[0]} \n \n Book B: \n Title: {top_titles[1]} \n Subject: {top_subjects[1]} \n Synopsis: {top_synopsis[1]} \n Publisher: {top_publishers[1]} \n Language: {top_languages[1]} \n \n Book C: \n Title: {top_titles[2]} \n Subject: {top_subjects[2]} \n Synopsis: {top_synopsis[2]} \n Publisher: {top_publishers[2]} \n Language: {top_languages[2]} \n \n Book D: \n Title: {top_titles[3]} \n Subject: {top_subjects[3]} \n Synopsis: {top_synopsis[3]} \n Publisher: {top_publishers[3]} \n Language: {top_languages[3]} \n \n Book E: \n Title: {top_titles[4]} \n Subject: {top_subjects[4]} \n Synopsis: {top_synopsis[4]} \n Publisher: {top_publishers[4]} \n Language: {top_languages[4]}"

        }

    ],

    # The language model which will generate the completion.
    model= "llama3-70b-8192",#"llama3-70b-8192", "llama3-8b-8192", "llama-3.1-70b-versatile", "llama-3.1-8b-instant"

    #
    # Optional parameters
    #

    # Controls randomness: lowering results in less random completions.
    # As the temperature approaches zero, the model will become deterministic
    # and repetitive.
    temperature=0.3,

    # The maximum number of tokens to generate. Requests can use up to
    # 32,768 tokens shared between prompt and completion.
    max_tokens=8000,

    # Controls diversity via nucleus sampling: 0.5 means half of all
    # likelihood-weighted options are considered.
    top_p=1,

    # A stop sequence is a predefined or user-specified text string that
    # signals an AI to stop generating content, ensuring its responses
    # remain focused and concise. Examples include punctuation marks and
    # markers like "[end]".
    stop=None,

    # If set, partial message deltas will be sent.
    stream=False,
)

In [211]:
# Print the completion returned by the LLM.
output = chat_completion.choices[0].message.content

In [212]:
output = GoogleTranslator(source='auto', target=lang_input).translate(output)

In [213]:
print(output)

Basándonos en la información del libro proporcionada, aquí está la lista de similitudes del libro con el libro de referencia "Changó, el gran putas" de Manuel Zapata Olivella:

1. Ballantine Books
Publicado en inglés
Explicación: Ballantine Books publicó "Tree of Life: A Novel of the Caribbean", un libro de ficción contemporánea que es similar a "Changó, el gran putas" porque explora el rico patrimonio cultural del Caribe, profundizando en la historia y las tradiciones de la región. El enfoque del libro en la diáspora caribeña y las luchas de su gente resuena con los temas de "Changó, el gran putas", que también explora las experiencias de los esclavos africanos y sus descendientes en el Caribe. Por lo tanto, su libro atrae al sector del mercado de Ballantine Books, que se centra en la ficción contemporánea y las obras literarias que exploran temas de identidad, cultura y justicia social.

2. AuthorHouse
Publicado en inglés
Explicación: AuthorHouse publicó "Caribbean Passage: The Story