In [1]:
import pandas as pd

# Load the books CSV into `df`, the DataFrame the following cells read from.
books_csv_path = 'C:\\Users\\Fukushima\\Documents\\GitHub\\ProyectoFinalVV\\data\\raw\\librosyafiltrados.csv'
df = pd.read_csv(books_csv_path)

In [16]:
# Display the column labels of the loaded DataFrame.
df.columns

Index(['Name', 'pagesNumber', 'RatingDist4', 'RatingDistTotal', 'Authors',
       'Rating', 'RatingDist5', 'RatingDist3', 'is_collection',
       'book_category', 'Name_encoded', 'Authors_encoded',
       'RatingDistTotal_numeric'],
      dtype='object')

La diferencia entre este código y el anterior es que en este modelo restrinjo mucho más las variables para poder dar una recomendación más exacta, ya que el modelo anterior proporcionaba recomendaciones casi idénticas para libros muy populares, como Harry Potter o El conde de Montecristo. El modelo actual es mucho más preciso: aunque para Harry Potter sigue recomendando libros de la misma saga, al menos ya no son los mismos libros en diferentes idiomas.

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy.sparse import hstack

# Columns that feed the feature matrix: two numeric ones plus the author label.
numeric_cols = ['Rating', 'pagesNumber']
author_cols = ['Authors']
features = df[numeric_cols + author_cols]

# Standardise the numeric columns (Rating and pagesNumber).
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features[numeric_cols])

# One-hot encode the 'Authors' column.
encoder = OneHotEncoder()
authors_encoded = encoder.fit_transform(features[author_cols])

# Stack the scaled numeric block and the author encoding side by side.
matrix_features = hstack((features_scaled, authors_encoded))

# Show the dimensions of the combined matrix as a sanity check.
print(matrix_features.shape)

(1060441, 362084)


In [3]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Numeric columns used to build the similarity features.
numerical_columns = ['pagesNumber', 'Rating', 'RatingDistTotal_numeric']

# Collect any expected column that the DataFrame does not have.
missing_columns = [col for col in numerical_columns if col not in df.columns]

if missing_columns:
    print("Algunas columnas numéricas faltan en el DataFrame.")
    numerical_features = np.array([])
else:
    # Replace NaNs with 0 so no missing value reaches the scaler.
    filled_values = df[numerical_columns].fillna(0).values

    # Standardise the columns so that no single range dominates the others.
    scaler = StandardScaler()
    numerical_features = scaler.fit_transform(filled_values)

In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from difflib import SequenceMatcher

# Keywords used to detect alternative versions of a book from its title.
keywords = [
    "guide",
    "version",
    "edition",
    "abridged",
    "unabridged",
    "adaptation",
    "annotated",
    "bloom's",
    "summary",
]

def similar(a, b):
    """Return the ``difflib.SequenceMatcher`` similarity ratio of ``a`` and ``b``.

    The result is a float between 0.0 (nothing in common) and 1.0
    (identical sequences). No junk heuristic function is supplied.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def has_common_keywords(title1, title2):
    """Tell whether both titles contain the same entry of the global ``keywords`` list.

    Both titles are lowercased first; a keyword counts when it appears as a
    substring of each title. Returns ``True`` on the first shared keyword,
    ``False`` when there is none.
    """
    lowered_first, lowered_second = title1.lower(), title2.lower()
    return any(
        word in lowered_first and word in lowered_second
        for word in keywords
    )

def get_title_similarity(input_title, all_titles):
    """Score ``input_title`` against every entry of ``all_titles``.

    Returns a NumPy array holding one ``similar(input_title, title)`` ratio
    per title, in the same order as ``all_titles``.
    """
    ratios = []
    for candidate in all_titles:
        ratios.append(similar(input_title, candidate))
    return np.array(ratios)

def get_book_recommendations(input_book, num_recommendations=20):
    """Recommend books whose title and numeric profile resemble ``input_book``.

    Every row of the global ``df`` gets a combined score of
    ``0.7 * title similarity + 0.3 * cosine similarity`` (the latter computed
    on the rows of the global ``numerical_features`` array). Candidates are
    visited from the highest to the lowest score and are skipped when they
    look like another edition of the input book.

    Args:
        input_book: Exact value of the ``Name`` column to look up in ``df``.
        num_recommendations: Maximum number of recommendations to return.
            Zero or a negative value yields an empty result.

    Returns:
        A ``(recommendations, filtered_scores)`` tuple. ``recommendations`` is
        a DataFrame with the ``Name`` and ``Authors`` columns of the selected
        rows; ``filtered_scores`` is a list of ``(row_position, score)`` pairs
        in the same order. When the book is not found a message is printed and
        ``(pd.DataFrame(), [])`` is returned.
    """
    title_weight = 0.7       # share of the score taken from title similarity
    numeric_weight = 0.3     # share of the score taken from numeric features
    same_book_ratio = 0.8    # base-title ratio at or above which a candidate is skipped
    score_ceiling = 0.7      # candidates scoring this much or more are skipped

    names = df['Name'].values

    # Work with row *positions* throughout: `.iloc` and `numerical_features`
    # are positional, so an index label would only be correct by accident
    # (i.e. when the DataFrame has a default RangeIndex).
    matches = np.flatnonzero((df['Name'] == input_book).values)
    if matches.size == 0:
        print(f"Libro '{input_book}' no encontrado en el DataFrame.")
        return pd.DataFrame(), []
    idx = int(matches[0])

    input_title = names[idx]
    title_similarities = get_title_similarity(input_title, names)

    if numerical_features.size == 0:
        # The numeric features may be an empty array when the expected columns
        # were missing; fall back to title similarity only instead of crashing.
        numerical_similarities = np.zeros(len(names))
    else:
        input_features = numerical_features[idx].reshape(1, -1)
        numerical_similarities = cosine_similarity(input_features, numerical_features)[0]

    total_similarity = (title_weight * title_similarities) + (numeric_weight * numerical_similarities)

    # Stable descending order: ties keep their original row order.
    ranked_positions = np.argsort(-total_similarity, kind="stable")

    input_title_lower = input_title.lower()  # loop-invariant
    filtered_scores = []
    for position in ranked_positions:
        if len(filtered_scores) >= num_recommendations:
            break
        position = int(position)
        if position == idx:
            # Exclude the input book itself explicitly rather than assuming
            # it is always the top-ranked entry.
            continue

        candidate_title = names[position]
        base_title = candidate_title.split(":")[0].strip().lower()
        if similar(base_title, input_title_lower) >= same_book_ratio:
            continue
        if has_common_keywords(input_title, candidate_title):
            continue

        score = total_similarity[position]
        # NOTE(review): scores at or above the ceiling are dropped, presumably
        # to discard near-identical editions -- confirm this is intended.
        if score < score_ceiling:
            filtered_scores.append((position, score))

    book_indices = [position for position, _ in filtered_scores]
    recommendations = df[['Name', 'Authors']].iloc[book_indices]
    return recommendations, filtered_scores

def remove_duplicate_titles_global(recommendations, scores=None):
    """Keep only the first recommendation for each base title.

    The base title is the text before the first ``":"`` in ``Name``,
    stripped and lowercased.

    Args:
        recommendations: DataFrame with a ``Name`` column, one row per
            recommendation.
        scores: Sequence of ``(row_position, score)`` pairs aligned
            positionally with the rows of ``recommendations``. When omitted,
            the notebook-level variable ``scores`` is used, which keeps
            existing one-argument calls working.

    Returns:
        A ``(unique_recommendations_df, scores_dict)`` tuple: the rows kept,
        as a DataFrame, and a dict mapping each base title to the score of
        its first occurrence.

    Raises:
        NameError: If a score is needed but neither the ``scores`` argument
            nor a notebook-level ``scores`` variable is available.
    """
    if scores is None:
        # Backward compatibility: older callers rely on a global `scores`.
        scores = globals().get("scores")

    seen_titles = set()
    unique_rows = []
    scores_dict = {}

    # Use the running row position instead of `index.get_loc(...)`, which
    # does not return a single integer when index labels are duplicated.
    for position, (_, row) in enumerate(recommendations.iterrows()):
        base_title = row['Name'].split(":")[0].strip().lower()
        if base_title in seen_titles:
            continue
        if scores is None:
            raise NameError(
                "name 'scores' is not defined: pass scores=... to "
                "remove_duplicate_titles_global"
            )
        seen_titles.add(base_title)
        unique_rows.append(row)
        scores_dict[base_title] = scores[position][1]

    # Convert the list of rows back into a DataFrame.
    unique_recommendations_df = pd.DataFrame(unique_rows)
    return unique_recommendations_df, scores_dict



In [5]:
input_book = "Harry Potter and the Prisoner of Azkaban"
recommendations, scores = get_book_recommendations(input_book)

# Collapse entries that share the same base title across the whole result set.
unique_recommendations, scores_dict = remove_duplicate_titles_global(recommendations)

print("Book recommendations for '{}':".format(input_book))
line_template = "- Book: {}, Author: {}, Similarity Score: {:.4f}"
for _, rec in unique_recommendations.iterrows():
    # Same base-title key that the de-duplication step uses for its score dict.
    title_key = rec['Name'].split(":")[0].strip().lower()
    print(line_template.format(rec['Name'], rec['Authors'], scores_dict.get(title_key, 0)))

Book recommendations for 'Harry Potter and the Prisoner of Azkaban':
- Book: Harry Potter and the Chamber of Secrets (Harry Potter, #2), Author: J.K. Rowling, Similarity Score: 0.7000
- Book: Harri Potter maen yr Athronydd, Author: J.K. Rowling, Similarity Score: 0.6999
- Book: Harry Potter und die Heiligtümer des Todes, Author: J.K. Rowling, Similarity Score: 0.6927
- Book: Harry Potter e la Pietra Filosofale, Author: J.K. Rowling, Similarity Score: 0.6919
- Book: Harry Potter and the Deathly Hallows (Harry Potter, #7), Author: J.K. Rowling, Similarity Score: 0.6832


In [6]:
input_book = "The Hound of the Baskervilles"
recommendations, scores = get_book_recommendations(input_book)

# Collapse entries that share the same base title across the whole result set.
unique_recommendations, scores_dict = remove_duplicate_titles_global(recommendations)

print("Book recommendations for '{}':".format(input_book))
line_template = "- Book: {}, Author: {}, Similarity Score: {:.4f}"
for _, rec in unique_recommendations.iterrows():
    # Same base-title key that the de-duplication step uses for its score dict.
    title_key = rec['Name'].split(":")[0].strip().lower()
    print(line_template.format(rec['Name'], rec['Authors'], scores_dict.get(title_key, 0)))


Book recommendations for 'The Hound of the Baskervilles':
- Book: The Wind In The Willows, Author: Kenneth Grahame, Similarity Score: 0.7000
- Book: The Name of the Rose, Author: Umberto Eco, Similarity Score: 0.6996
- Book: The Day of the Jackal, Author: Frederick Forsyth, Similarity Score: 0.6992
- Book: The Gift of the Magi, Author: O. Henry, Similarity Score: 0.6978
- Book: The Hunchback of Notre Dame, Author: Victor Hugo, Similarity Score: 0.6971


In [7]:
input_book = "The Count of Monte Cristo"
recommendations, scores = get_book_recommendations(input_book)

# Collapse entries that share the same base title across the whole result set.
unique_recommendations, scores_dict = remove_duplicate_titles_global(recommendations)

print("Book recommendations for '{}':".format(input_book))
line_template = "- Book: {}, Author: {}, Similarity Score: {:.4f}"
for _, rec in unique_recommendations.iterrows():
    # Same base-title key that the de-duplication step uses for its score dict.
    title_key = rec['Name'].split(":")[0].strip().lower()
    print(line_template.format(rec['Name'], rec['Authors'], scores_dict.get(title_key, 0)))

Book recommendations for 'The Count of Monte Cristo':
- Book: The Call of the Wild, Author: Jack London, Similarity Score: 0.6984
- Book: The tale of Peter Rabbit, Author: Beatrix Potter, Similarity Score: 0.6984
- Book: The House Of The Spirits, Author: Isabel Allende, Similarity Score: 0.6961
- Book: The Phantom of the Opera, Author: Gaston Leroux, Similarity Score: 0.6954


In [8]:
input_book = "The Catcher in the Rye"
recommendations, scores = get_book_recommendations(input_book)

# Collapse entries that share the same base title across the whole result set.
unique_recommendations, scores_dict = remove_duplicate_titles_global(recommendations)

print("Book recommendations for '{}':".format(input_book))
line_template = "- Book: {}, Author: {}, Similarity Score: {:.4f}"
for _, rec in unique_recommendations.iterrows():
    # Same base-title key that the de-duplication step uses for its score dict.
    title_key = rec['Name'].split(":")[0].strip().lower()
    print(line_template.format(rec['Name'], rec['Authors'], scores_dict.get(title_key, 0)))

Book recommendations for 'The Catcher in the Rye':
- Book: The Devil in the White City, Author: Erik Larson, Similarity Score: 0.7000
- Book: The Power and the Glory, Author: Graham Greene, Similarity Score: 0.6987
- Book: The Call of the Wild, Author: Jack London, Similarity Score: 0.6982
- Book: The Phantom of the Opera, Author: Gaston Leroux, Similarity Score: 0.6946


In [9]:
input_book = "The Hobbit"
recommendations, scores = get_book_recommendations(input_book)

# Collapse entries that share the same base title across the whole result set.
unique_recommendations, scores_dict = remove_duplicate_titles_global(recommendations)

print("Book recommendations for '{}':".format(input_book))
line_template = "- Book: {}, Author: {}, Similarity Score: {:.4f}"
for _, rec in unique_recommendations.iterrows():
    # Same base-title key that the de-duplication step uses for its score dict.
    title_key = rec['Name'].split(":")[0].strip().lower()
    print(line_template.format(rec['Name'], rec['Authors'], scores_dict.get(title_key, 0)))

Book recommendations for 'The Hobbit':
- Book: The Witches, Author: Roald Dahl, Similarity Score: 0.6990
- Book: The Prophet, Author: Kahlil Gibran, Similarity Score: 0.6965
- Book: Bilbo le hobbit, Author: J.R.R. Tolkien, Similarity Score: 0.6920
- Book: The Help, Author: Kathryn Stockett, Similarity Score: 0.6888
- Book: The Firm, Author: Robin Waterfield, Similarity Score: 0.6887
- Book: The Road, Author: Cormac McCarthy, Similarity Score: 0.6885


In [10]:
input_book = "1984"
recommendations, scores = get_book_recommendations(input_book)

# Collapse entries that share the same base title across the whole result set.
unique_recommendations, scores_dict = remove_duplicate_titles_global(recommendations)

print("Book recommendations for '{}':".format(input_book))
line_template = "- Book: {}, Author: {}, Similarity Score: {:.4f}"
for _, rec in unique_recommendations.iterrows():
    # Same base-title key that the de-duplication step uses for its score dict.
    title_key = rec['Name'].split(":")[0].strip().lower()
    print(line_template.format(rec['Name'], rec['Authors'], scores_dict.get(title_key, 0)))

Book recommendations for '1984':
- Book: Animal Farm / 1984, Author: George Orwell, Similarity Score: 0.5365
- Book: 1974, Author: David Peace, Similarity Score: 0.5171
- Book: 1945, Author: Newt Gingrich, Similarity Score: 0.5078
- Book: 1934, Author: Alberto Moravia, Similarity Score: 0.5047
- Book: 1988, Author: Andrew McGahan, Similarity Score: 0.5026
- Book: Fever 1793, Author: Laurie Halse Anderson, Similarity Score: 0.4833
- Book: 1942, Author: Robert Conroy, Similarity Score: 0.4833
- Book: 1776, Author: David McCullough, Similarity Score: 0.4749
- Book: 1968, Author: Ed Sanders, Similarity Score: 0.4724
- Book: Fiebre 1793, Author: Laurie Halse Anderson, Similarity Score: 0.4698


In [11]:
input_book = "Pride and Prejudice"
recommendations, scores = get_book_recommendations(input_book)

# Collapse entries that share the same base title across the whole result set.
unique_recommendations, scores_dict = remove_duplicate_titles_global(recommendations)

print("Book recommendations for '{}':".format(input_book))
line_template = "- Book: {}, Author: {}, Similarity Score: {:.4f}"
for _, rec in unique_recommendations.iterrows():
    # Same base-title key that the de-duplication step uses for its score dict.
    title_key = rec['Name'].split(":")[0].strip().lower()
    print(line_template.format(rec['Name'], rec['Authors'], scores_dict.get(title_key, 0)))


Book recommendations for 'Pride and Prejudice':
- Book: Le Poids De La Preuve, Author: Scott Turow, Similarity Score: 0.6989
- Book: War And Peace, Author: Leo Tolstoy, Similarity Score: 0.6932
- Book: Wall and Piece, Author: Banksy, Similarity Score: 0.6711
- Book: Orgullo y Prejuicio, Author: Jane Austen, Similarity Score: 0.6684
- Book: Crime and Punishment, Author: Fyodor Dostoyevsky, Similarity Score: 0.6586
- Book: Prince and the Pauper, Author: Mark Twain, Similarity Score: 0.6541
- Book: El Principe / the Prince, Author: Niccolò Machiavelli, Similarity Score: 0.6495
- Book: Rue de La Sardine, Author: John Steinbeck, Similarity Score: 0.6495


In [19]:
import pandas as pd

# Load the dataset.
books_csv_path = 'C:\\Users\\Fukushima\\Documents\\GitHub\\ProyectoFinalVV\\data\\raw\\librosyafiltrados.csv'
df = pd.read_csv(books_csv_path)

# Define X (predictor columns) and y (the 'Rating' target).
feature_columns = ['Name_encoded', 'Authors_encoded', 'pagesNumber', 'RatingDistTotal_numeric']
X = df[feature_columns]
y = df['Rating']


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
import joblib

# Hold out 20% of the rows for testing; the seed is fixed so the split repeats.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the linear regression on the training split.
modelo = LinearRegression().fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = modelo.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
print(f"RMSE: {rmse}")

# Persist the fitted model to disk.
joblib.dump(modelo, 'modelo_libros_entrenado.pkl')


RMSE: 0.05183820679448788


['modelo_libros_entrenado.pkl']

In [22]:
from sklearn.metrics import mean_squared_error
import numpy as np

# `y_pred` holds the model predictions and `y_test` the true values.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")


RMSE: 0.05183820679448788


La métrica de accuracy se usa generalmente para problemas de clasificación, no de regresión. Como estoy trabajando con un modelo de regresión lineal y Rating es una variable continua, no categórica, no puedo usar accuracy para evaluar mi modelo.

In [26]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Compute the metrics on the held-out split.
y_pred = modelo.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the metrics.
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R²: {r2}")


RMSE: 0.05183820679448788
MAE: 0.013889439572962027
MSE: 0.0026871996836680895
R²: 0.9819942434252295


- RMSE (Root Mean Square Error): 0.0518. La raíz del error cuadrático medio, expresada en la misma escala que la variable objetivo, es baja, lo que indica que las predicciones están cerca de los valores reales.

- MAE (Mean Absolute Error): 0.0139 El error absoluto promedio es también bajo, sugiriendo que en promedio, las predicciones están muy cerca de los valores reales.

- MSE (Mean Squared Error): 0.0027 El error cuadrático medio es pequeño, lo que indica que el modelo está haciendo buenas predicciones sin grandes errores.

- R² (Coeficiente de Determinación): 0.982. Un R² cercano a 1 significa que el modelo explica aproximadamente el 98.2 % de la variación en la variable objetivo, lo que es excelente.