In [41]:
import pandas as pd
import pyodbc

# Connect to SQL Server
conn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER=Sudhakar\\SQLEXPRESS01;DATABASE=Local_database;UID=sa;PWD=123;Timeout=60')

# query
query = """
SELECT 
    a.book_id, 
    a.title, 
    a.isbn13, 
    a.publication_date, 
    b.language_name,    
    c.publisher_name,   
    d.customer_id,
    e.order_id,    
	h.author_name    
FROM 
    book a
JOIN 
    book_language b ON a.language_id = b.language_id
JOIN 
    publisher c  ON a.publisher_id = c.publisher_id
JOIN 
    order_line f ON a.book_id = f.book_id        
JOIN 
    cust_order e ON e.order_id = f.order_id      
JOIN 
    customer d ON d.customer_id = e.customer_id
JOIN
	book_author g ON a.book_id = g.book_id
JOIN 
	author h on g.author_id = h.author_id order by a.book_id asc;
"""

# Execute 
result = pd.read_sql_query(query, conn)
books_df = pd.DataFrame(result)
# Close connection
conn.close()

books_df


  result = pd.read_sql_query(query, conn)


Unnamed: 0,book_id,title,isbn13,publication_date,language_name,publisher_name,customer_id,order_id,author_name
0,1,The World's First Love: Mary Mother of God,8987059752,1996-09-01,United States English,Ignatius Press,408,816,Fulton J. Sheen
1,2,The Illuminati,20049130001,2004-10-04,English,Thomas Nelson,532,1064,Larry Burkett
2,5,Cliffs Notes on Aristophanes' Lysistrata The ...,49086007763,1983-12-29,English,Cliffs Notes,382,763,W. John Campbell
3,5,Cliffs Notes on Aristophanes' Lysistrata The ...,49086007763,1983-12-29,English,Cliffs Notes,505,3510,W. John Campbell
4,6,Life Is a Dream and Other Spanish Classics (Er...,73999140774,2000-04-01,English,Applause Theatre & Cinema Book Publishers,386,772,Eric Bentley
...,...,...,...,...,...,...,...,...,...
23417,11123,Asfixia,9789875661707,2006-09-01,English,Debolsillo,687,1373,Chuck Palahniuk
23418,11123,Asfixia,9789875661707,2006-09-01,English,Debolsillo,11,7121,Chuck Palahniuk
23419,11124,El Dia Que Nietzsche Lloró,9789875801448,2006-10-24,Spanish,Planeta,111,221,Irvin D. Yalom
23420,11124,El Dia Que Nietzsche Lloró,9789875801448,2006-10-24,Spanish,Planeta,69,138,Irvin D. Yalom


In [42]:
books_df.columns

Index(['book_id', 'title', 'isbn13', 'publication_date', 'language_name',
       'publisher_name', 'customer_id', 'order_id', 'author_name'],
      dtype='object')

In [43]:
books_df.reset_index(inplace=True)

In [44]:
# convert date time formate
books_df["publication_date"] = pd.to_datetime(books_df["publication_date"])
books_df["published_year"] = books_df["publication_date"].dt.year

In [45]:
# drop publication_date Column
books_df.drop("publication_date",axis=1,inplace=True)

In [46]:
books_df.head()

Unnamed: 0,index,book_id,title,isbn13,language_name,publisher_name,customer_id,order_id,author_name,published_year
0,0,1,The World's First Love: Mary Mother of God,8987059752,United States English,Ignatius Press,408,816,Fulton J. Sheen,1996
1,1,2,The Illuminati,20049130001,English,Thomas Nelson,532,1064,Larry Burkett,2004
2,2,5,Cliffs Notes on Aristophanes' Lysistrata The ...,49086007763,English,Cliffs Notes,382,763,W. John Campbell,1983
3,3,5,Cliffs Notes on Aristophanes' Lysistrata The ...,49086007763,English,Cliffs Notes,505,3510,W. John Campbell,1983
4,4,6,Life Is a Dream and Other Spanish Classics (Er...,73999140774,English,Applause Theatre & Cinema Book Publishers,386,772,Eric Bentley,2000


In [47]:
books_df.isnull().sum()

index             0
book_id           0
title             0
isbn13            0
language_name     0
publisher_name    0
customer_id       0
order_id          0
author_name       0
published_year    0
dtype: int64

In [48]:
books_df.columns

Index(['index', 'book_id', 'title', 'isbn13', 'language_name',
       'publisher_name', 'customer_id', 'order_id', 'author_name',
       'published_year'],
      dtype='object')

# **Genres**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Combine metadata
books_df['combined_metadata'] = books_df['title'] + " " + books_df['author_name'] + " " + books_df['publisher_name']

# Vectorize text
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(books_df['combined_metadata'])

# Apply KMeans clustering
kmeans = KMeans(n_clusters=10, random_state=42)  
books_df['genre_cluster'] = kmeans.fit_predict(X)

# Inspect clusters to infer genres
for cluster_id in range(10):
    print(f"Cluster {cluster_id}:")
    print(books_df[books_df['genre_cluster'] == cluster_id]['title'].head(10))


In [50]:
def infer_genre(metadata):
    metadata = metadata.lower()

    # Expanded and more detailed keywords for each genre
    genre_keywords = {
        "Romance": [
            "romance", "love", "affair", "relationship", "valentine", "passion", "heart", "emotion", "couple", 
            "romantic", "love story", "wedding", "marriage", "dating", "charming", "kiss", "intimacy", "desire"
        ],
        "Science Fiction": [
            "sci-fi", "science fiction", "space", "alien", "robot", "futuristic", "time travel", "space travel", 
            "virtual reality", "dystopia", "cyberpunk", "technology", "artificial intelligence", "quantum", "parallel universe"
        ],
        "Fantasy": [
            "fantasy", "magic", "wizard", "dragon", "myth", "sorcery", "elves", "fairy", "mystical", "creature", 
            "supernatural", "kingdom", "quest", "enchanted", "witch", "spell", "fairy tale", "hero"
        ],
        "Mystery/Thriller": [
            "mystery", "thriller", "detective", "crime", "investigation", "murder", "suspense", "secret", "conspiracy", 
            "puzzle", "detective story", "whodunit", "investigative", "chase", "clue", "intrigue", "plot twist", "tension"
        ],
        "Historical": [
            "history", "historical", "past", "war", "empire", "ancient", "renaissance", "civil war", "world war", 
            "medieval", "victorian", "classical", "legend", "conquest", "revolution", "colonial", "historical fiction"
        ],
        "Horror": [
            "horror", "ghost", "supernatural", "haunted", "fear", "zombie", "vampire", "monster", "creature", 
            "dark", "nightmare", "terror", "paranormal", "occult", "haunting", "dread", "blood", "gore", "spooky"
        ],
        "Biography": [
            "biography", "memoir", "life story", "autobiography", "personal", "inspiration", "life history", "success", 
            "legacy", "true story", "journey", "story of", "self-made", "entrepreneur", "famous person", "real life"
        ],
        "Self-Help": [
            "self-help", "motivation", "personal development", "improvement", "success", "habit", "mindset", 
            "empowerment", "life coaching", "productivity", "positive thinking", "confidence", "leadership", 
            "inspiration", "well-being", "personal growth", "mental health"
        ],
        "Children's Literature": [
            "children", "kids", "juvenile", "fairy tale", "nursery", "storybook", "picture book", "bedtime story", 
            "family", "play", "imagination", "fun", "adventure", "learning", "animals", "magical", "cartoon"
        ],
        "Adventure": [
            "adventure", "exploration", "journey", "expedition", "quest", "survival", "adventure story", "discovery", 
            "wild", "trek", "expedition", "travel", "outdoors", "danger", "brave", "heroic", "action"
        ],
        "Non-Fiction": [
            "non-fiction", "true story", "real life", "documentary", "facts", "reality", "history", "actual", "case study", 
            "biography", "memoir", "essays", "research", "journalism", "report", "analysis", "opinion", "reference"
        ],
        "Fiction": [
            "fiction", "novel", "story", "literature", "imagination", "creative writing", "narrative", "characters", 
            "plot", "setting", "dramatic", "prose", "novelistic", "fictional", "adventure", "fantasy"
        ],
        "Poetry": [
            "poetry", "poem", "verse", "lyric", "haiku", "ballad", "ode", "rhyme", "meter", "stanza", "poetic", 
            "rhyme scheme", "prose poetry", "lyrical", "sonnets", "verses"
        ],
        "Young Adult": [
            "young adult", "teen", "high school", "coming of age", "adolescence", "youth", "teenage", "teen fiction", 
            "teen romance", "teen drama", "puberty", "teen adventure", "coming-of-age", "self-discovery"
        ],
        "Spiritual/Religion": [
            "spiritual", "religion", "faith", "philosophy", "bible", "belief", "god", "heaven", "soul", "meditation", 
            "prayer", "religious", "divine", "enlightenment", "spirituality", "faith-based", "scripture"
        ],
        "Comedy/Humor": [
            "comedy", "humor", "funny", "satire", "parody", "joke", "laugh", "comedic", "ridiculous", "stand-up", 
            "funny story", "jokes", "laughter", "funny book", "hilarious", "absurd"
        ],
        "Business/Economics": [
            "business", "economics", "management", "finance", "entrepreneurship", "leadership", "marketing", 
            "strategy", "economy", "corporate", "startup", "success", "investment", "financial", "stocks", 
            "financial planning", "business growth"
        ],
        "Technology": [
            "technology", "programming", "software", "ai", "artificial intelligence", "computing", "robotics", 
            "machine learning", "coding", "development", "innovation", "tech", "gadgets", "engineering", "internet", 
            "hardware"
        ],
        "Education/Academic": [
            "education", "academic", "study", "research", "textbook", "learning", "teaching", "school", "university", 
            "research paper", "scholar", "curriculum", "theory", "classroom", "academic writing", "education system"
        ],
        "Health/Fitness": [
            "health", "fitness", "wellness", "diet", "exercise", "nutrition", "mental health", "workout", "body", 
            "strength", "cardio", "weight loss", "healthy lifestyle", "well-being", "meditation", "wellness journey"
        ]
    }

    # Search for keywords in metadata
    for genre, keywords in genre_keywords.items():
        for keyword in keywords:
            if keyword in metadata:
                return genre

    # Default genre if no match is found
    return "General"


# Apply the function to the combined metadata column
books_df['genres'] = books_df['combined_metadata'].apply(infer_genre)


In [51]:
books_df

Unnamed: 0,index,book_id,title,isbn13,language_name,publisher_name,customer_id,order_id,author_name,published_year,combined_metadata,genre_cluster,genres
0,0,1,The World's First Love: Mary Mother of God,8987059752,United States English,Ignatius Press,408,816,Fulton J. Sheen,1996,The World's First Love: Mary Mother of God Fu...,7,Romance
1,1,2,The Illuminati,20049130001,English,Thomas Nelson,532,1064,Larry Burkett,2004,The Illuminati Larry Burkett Thomas Nelson,2,General
2,2,5,Cliffs Notes on Aristophanes' Lysistrata The ...,49086007763,English,Cliffs Notes,382,763,W. John Campbell,1983,Cliffs Notes on Aristophanes' Lysistrata The ...,2,General
3,3,5,Cliffs Notes on Aristophanes' Lysistrata The ...,49086007763,English,Cliffs Notes,505,3510,W. John Campbell,1983,Cliffs Notes on Aristophanes' Lysistrata The ...,2,General
4,4,6,Life Is a Dream and Other Spanish Classics (Er...,73999140774,English,Applause Theatre & Cinema Book Publishers,386,772,Eric Bentley,2000,Life Is a Dream and Other Spanish Classics (Er...,1,Fiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23417,23417,11123,Asfixia,9789875661707,English,Debolsillo,687,1373,Chuck Palahniuk,2006,Asfixia Chuck Palahniuk Debolsillo,2,General
23418,23418,11123,Asfixia,9789875661707,English,Debolsillo,11,7121,Chuck Palahniuk,2006,Asfixia Chuck Palahniuk Debolsillo,2,General
23419,23419,11124,El Dia Que Nietzsche Lloró,9789875801448,Spanish,Planeta,111,221,Irvin D. Yalom,2006,El Dia Que Nietzsche Lloró Irvin D. Yalom Planeta,2,General
23420,23420,11124,El Dia Que Nietzsche Lloró,9789875801448,Spanish,Planeta,69,138,Irvin D. Yalom,2006,El Dia Que Nietzsche Lloró Irvin D. Yalom Planeta,2,General


In [52]:
# drop publication_date Column
books_df.drop(["combined_metadata","genre_cluster"],axis=1,inplace=True)

In [38]:
# Save data farme

#books_df.to_csv("Final_data.csv",index=False)

# **Insert the Final Data Farme in to SQL Table**

In [None]:
import pandas as pd
import pyodbc

# Read Excel
df = pd.read_csv("Final_data.csv")

# Connect to SQL Server
conn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER=Sudhakar\\SQLEXPRESS01;DATABASE=Local_database;UID=sa;PWD=123')
cursor = conn.cursor()


# Insert data into SQL Server
for index, row in df.iterrows():
    cursor.execute(
        """
        INSERT INTO Book_Data (book_id, title, isbn13, language_name, publisher_name, customer_id, order_id, author_name, published_year, genres)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        row['book_id'], row['title'], row['isbn13'], row['language_name'], row['publisher_name'],
        row['customer_id'], row['order_id'], row['author_name'], row['published_year'], row['genres']
    )

# Commit the transaction
conn.commit()


In [14]:
books_df.columns

Index(['index', 'book_id', 'title', 'isbn13', 'language_name',
       'publisher_name', 'customer_id', 'order_id', 'author_name',
       'cover_image_url', 'published_year', 'genres'],
      dtype='object')

In [40]:
books_df["genres"].unique()

array(['Romance', 'General', 'Mystery/Thriller', 'Technology',
       'Historical', 'Fantasy', "Children's Literature", 'Biography',
       'Poetry', 'Business/Economics', 'Fiction', 'Young Adult',
       'Non-Fiction', 'Horror', 'Adventure', 'Science Fiction',
       'Self-Help', 'Health/Fitness', 'Spiritual/Religion',
       'Education/Academic', 'Comedy/Humor'], dtype=object)

In [53]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

books_df['customer_id'] = encoder.fit_transform(books_df['customer_id'])
books_df['title'] = encoder.fit_transform(books_df['title'])
books_df['isbn13'] = encoder.fit_transform(books_df['isbn13'])
books_df['language_name'] = encoder.fit_transform(books_df['language_name'])
books_df['publisher_name'] = encoder.fit_transform(books_df['publisher_name'])
books_df['author_name'] = encoder.fit_transform(books_df['author_name'])
books_df['published_year'] = encoder.fit_transform(books_df['published_year'])
books_df['genres'] = encoder.fit_transform(books_df['genres'])


In [54]:
books_df

Unnamed: 0,index,book_id,title,isbn13,language_name,publisher_name,customer_id,order_id,author_name,published_year,genres
0,0,1,5183,17,20,690,407,816,1903,54,15
1,1,2,4391,0,6,1335,531,1064,3441,62,8
2,2,5,824,1,6,302,381,763,6078,41,8
3,3,5,824,1,6,302,504,3510,6078,41,8
4,4,6,2302,6,6,67,385,772,1713,58,7
...,...,...,...,...,...,...,...,...,...,...,...
23417,23417,11123,430,5719,6,372,686,1373,983,64,8
23418,23418,11123,430,5719,6,372,10,7121,983,64,8
23419,23419,11124,1261,5720,18,1048,110,221,2383,64,8
23420,23420,11124,1261,5720,18,1048,68,138,2383,64,8


In [55]:
x=books_df.drop("genres",axis=1)
#y=books_df["genres"]

In [56]:
# One-hot encoding for target labels (if target is multi-class classification)
num_classes = len(books_df['genres'].unique())
y = pd.get_dummies(books_df['genres']).values

In [57]:
from sklearn.model_selection import train_test_split

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [58]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((17566, 10), (5856, 10), (17566, 21), (5856, 21))

In [21]:
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# x_train = sc.fit_transform(x_train)
# x_test = sc.transform(x_test)

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import tensorflow as tf

In [59]:
# Model architecture
model = Sequential()

# Input layer: The number of features in `X`
model.add(Dense(128, input_dim=X_train.shape[1], activation="relu"))  # Input layer

# Hidden layers
model.add(Dense(64, activation="relu"))  # HL1
model.add(Dense(64, activation="relu"))  # HL2
model.add(Dense(32, activation="relu"))  # HL3
model.add(Dense(16, activation="relu"))  # HL4

# Output layer
model.add(Dense(num_classes, activation="softmax")) 

# Compile model
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=[
        tf.keras.metrics.F1Score(), 
        "accuracy",
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ],
)

model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [62]:
# Train the model
model.fit(X_train,y_train,epochs=100,batch_size=32,validation_data=(X_test, y_test))

Epoch 1/100
[1m549/549[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6698 - f1_score: 0.0773 - loss: 1.3521 - precision_1: 0.6697 - recall_1: 0.6697 - val_accuracy: 0.6851 - val_f1_score: 0.0788 - val_loss: 1.3173 - val_precision_1: 0.6852 - val_recall_1: 0.6851
Epoch 2/100
[1m549/549[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6682 - f1_score: 0.0773 - loss: 1.3592 - precision_1: 0.6681 - recall_1: 0.6681 - val_accuracy: 0.6834 - val_f1_score: 0.0786 - val_loss: 1.3218 - val_precision_1: 0.6833 - val_recall_1: 0.6831
Epoch 3/100
[1m549/549[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6712 - f1_score: 0.0775 - loss: 1.3477 - precision_1: 0.6712 - recall_1: 0.6711 - val_accuracy: 0.6851 - val_f1_score: 0.0788 - val_loss: 1.3166 - val_precision_1: 0.6851 - val_recall_1: 0.6849
Epoch 4/100
[1m549/549[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6771 - f1_scor

<keras.src.callbacks.history.History at 0x1eb283d31d0>

In [63]:
# Evaluate the model
loss, f1_score, accuracy, precision, recall = model.evaluate(X_test, y_test)

print(f"Loss: {loss}")
print(f"F1 Score: {f1_score}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")


[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 889us/step - accuracy: 0.6993 - f1_score: 0.0798 - loss: 1.2793 - precision_1: 0.6993 - recall_1: 0.6993
Loss: 1.3170162439346313
F1 Score: [0.         0.         0.         0.         0.         0.
 0.         0.         0.69348395 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.9610778  0.        ]
Accuracy: 0.6851093173027039
Precision: 0.6851093173027039
Recall: 0.6851093173027039


# **Model New**

In [1]:
import pandas as pd
import pyodbc

# Connect to SQL Server
conn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER=Sudhakar\\SQLEXPRESS01;DATABASE=Local_database;UID=sa;PWD=123;Timeout=60')

# query
query = """
SELECT     
    a.title,     
    a.publication_date,      
    c.publisher_name,      
	h.author_name    
FROM 
    book a
JOIN 
    book_language b ON a.language_id = b.language_id
JOIN 
    publisher c  ON a.publisher_id = c.publisher_id
JOIN 
    order_line f ON a.book_id = f.book_id        
JOIN 
    cust_order e ON e.order_id = f.order_id      
JOIN 
    customer d ON d.customer_id = e.customer_id
JOIN
	book_author g ON a.book_id = g.book_id
JOIN 
	author h on g.author_id = h.author_id order by a.book_id asc;
"""

# Execute 
result = pd.read_sql_query(query, conn)
df = pd.DataFrame(result)
# Close connection
conn.close()

df


  result = pd.read_sql_query(query, conn)


Unnamed: 0,title,publication_date,publisher_name,author_name
0,The World's First Love: Mary Mother of God,1996-09-01,Ignatius Press,Fulton J. Sheen
1,The Illuminati,2004-10-04,Thomas Nelson,Larry Burkett
2,Cliffs Notes on Aristophanes' Lysistrata The ...,1983-12-29,Cliffs Notes,W. John Campbell
3,Cliffs Notes on Aristophanes' Lysistrata The ...,1983-12-29,Cliffs Notes,W. John Campbell
4,Life Is a Dream and Other Spanish Classics (Er...,2000-04-01,Applause Theatre & Cinema Book Publishers,Eric Bentley
...,...,...,...,...
23417,Asfixia,2006-09-01,Debolsillo,Chuck Palahniuk
23418,Asfixia,2006-09-01,Debolsillo,Chuck Palahniuk
23419,El Dia Que Nietzsche Lloró,2006-10-24,Planeta,Irvin D. Yalom
23420,El Dia Que Nietzsche Lloró,2006-10-24,Planeta,Irvin D. Yalom


In [2]:
# convert date time formate
df["publication_date"] = pd.to_datetime(df["publication_date"])
df["published_year"] = df["publication_date"].dt.year

# drop publication_date Column
df.drop("publication_date",axis=1,inplace=True)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Combine metadata
df['combined_metadata'] = df['title'] + " " + df['author_name'] + " " + df['publisher_name'] + " " + df['published_year'].astype(str)

# Vectorize text
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['combined_metadata'])

# Apply KMeans clustering
kmeans = KMeans(n_clusters=10, random_state=42)  
df['genre_cluster'] = kmeans.fit_predict(X)

# Inspect clusters to infer genres
for cluster_id in range(10):
    print(f"Cluster {cluster_id}:")
    print(df[df['genre_cluster'] == cluster_id]['title'].head(10))


In [4]:
def infer_genre(metadata):
    metadata = metadata.lower()

    # Expanded and more detailed keywords for each genre
    genre_keywords = {
        "Romance": [
            "romance", "love", "affair", "relationship", "valentine", "passion", "heart", "emotion", "couple", 
            "romantic", "love story", "wedding", "marriage", "dating", "charming", "kiss", "intimacy", "desire"
        ],
        "Science Fiction": [
            "sci-fi", "science fiction", "space", "alien", "robot", "futuristic", "time travel", "space travel", 
            "virtual reality", "dystopia", "cyberpunk", "technology", "artificial intelligence", "quantum", "parallel universe"
        ],
        "Fantasy": [
            "fantasy", "magic", "wizard", "dragon", "myth", "sorcery", "elves", "fairy", "mystical", "creature", 
            "supernatural", "kingdom", "quest", "enchanted", "witch", "spell", "fairy tale", "hero"
        ],
        "Mystery/Thriller": [
            "mystery", "thriller", "detective", "crime", "investigation", "murder", "suspense", "secret", "conspiracy", 
            "puzzle", "detective story", "whodunit", "investigative", "chase", "clue", "intrigue", "plot twist", "tension"
        ],
        "Historical": [
            "history", "historical", "past", "war", "empire", "ancient", "renaissance", "civil war", "world war", 
            "medieval", "victorian", "classical", "legend", "conquest", "revolution", "colonial", "historical fiction"
        ],
        "Horror": [
            "horror", "ghost", "supernatural", "haunted", "fear", "zombie", "vampire", "monster", "creature", 
            "dark", "nightmare", "terror", "paranormal", "occult", "haunting", "dread", "blood", "gore", "spooky"
        ],
        "Biography": [
            "biography", "memoir", "life story", "autobiography", "personal", "inspiration", "life history", "success", 
            "legacy", "true story", "journey", "story of", "self-made", "entrepreneur", "famous person", "real life"
        ],
        "Self-Help": [
            "self-help", "motivation", "personal development", "improvement", "success", "habit", "mindset", 
            "empowerment", "life coaching", "productivity", "positive thinking", "confidence", "leadership", 
            "inspiration", "well-being", "personal growth", "mental health"
        ],
        "Children's Literature": [
            "children", "kids", "juvenile", "fairy tale", "nursery", "storybook", "picture book", "bedtime story", 
            "family", "play", "imagination", "fun", "adventure", "learning", "animals", "magical", "cartoon"
        ],
        "Adventure": [
            "adventure", "exploration", "journey", "expedition", "quest", "survival", "adventure story", "discovery", 
            "wild", "trek", "expedition", "travel", "outdoors", "danger", "brave", "heroic", "action"
        ],
        "Non-Fiction": [
            "non-fiction", "true story", "real life", "documentary", "facts", "reality", "history", "actual", "case study", 
            "biography", "memoir", "essays", "research", "journalism", "report", "analysis", "opinion", "reference"
        ],
        "Fiction": [
            "fiction", "novel", "story", "literature", "imagination", "creative writing", "narrative", "characters", 
            "plot", "setting", "dramatic", "prose", "novelistic", "fictional", "adventure", "fantasy"
        ],
        "Poetry": [
            "poetry", "poem", "verse", "lyric", "haiku", "ballad", "ode", "rhyme", "meter", "stanza", "poetic", 
            "rhyme scheme", "prose poetry", "lyrical", "sonnets", "verses"
        ],
        "Young Adult": [
            "young adult", "teen", "high school", "coming of age", "adolescence", "youth", "teenage", "teen fiction", 
            "teen romance", "teen drama", "puberty", "teen adventure", "coming-of-age", "self-discovery"
        ],
        "Spiritual/Religion": [
            "spiritual", "religion", "faith", "philosophy", "bible", "belief", "god", "heaven", "soul", "meditation", 
            "prayer", "religious", "divine", "enlightenment", "spirituality", "faith-based", "scripture"
        ],
        "Comedy/Humor": [
            "comedy", "humor", "funny", "satire", "parody", "joke", "laugh", "comedic", "ridiculous", "stand-up", 
            "funny story", "jokes", "laughter", "funny book", "hilarious", "absurd"
        ],
        "Business/Economics": [
            "business", "economics", "management", "finance", "entrepreneurship", "leadership", "marketing", 
            "strategy", "economy", "corporate", "startup", "success", "investment", "financial", "stocks", 
            "financial planning", "business growth"
        ],
        "Technology": [
            "technology", "programming", "software", "ai", "artificial intelligence", "computing", "robotics", 
            "machine learning", "coding", "development", "innovation", "tech", "gadgets", "engineering", "internet", 
            "hardware"
        ],
        "Education/Academic": [
            "education", "academic", "study", "research", "textbook", "learning", "teaching", "school", "university", 
            "research paper", "scholar", "curriculum", "theory", "classroom", "academic writing", "education system"
        ],
        "Health/Fitness": [
            "health", "fitness", "wellness", "diet", "exercise", "nutrition", "mental health", "workout", "body", 
            "strength", "cardio", "weight loss", "healthy lifestyle", "well-being", "meditation", "wellness journey"
        ]
    }

    # Search for keywords in metadata
    for genre, keywords in genre_keywords.items():
        for keyword in keywords:
            if keyword in metadata:
                return genre

    # Default genre if no match is found
    return "General"


# Apply the function to the combined metadata column
df['genres'] = df['combined_metadata'].apply(infer_genre)


In [5]:
df

Unnamed: 0,title,publisher_name,author_name,published_year,combined_metadata,genre_cluster,genres
0,The World's First Love: Mary Mother of God,Ignatius Press,Fulton J. Sheen,1996,The World's First Love: Mary Mother of God Fu...,9,Romance
1,The Illuminati,Thomas Nelson,Larry Burkett,2004,The Illuminati Larry Burkett Thomas Nelson 2004,5,General
2,Cliffs Notes on Aristophanes' Lysistrata The ...,Cliffs Notes,W. John Campbell,1983,Cliffs Notes on Aristophanes' Lysistrata The ...,1,General
3,Cliffs Notes on Aristophanes' Lysistrata The ...,Cliffs Notes,W. John Campbell,1983,Cliffs Notes on Aristophanes' Lysistrata The ...,1,General
4,Life Is a Dream and Other Spanish Classics (Er...,Applause Theatre & Cinema Book Publishers,Eric Bentley,2000,Life Is a Dream and Other Spanish Classics (Er...,7,Fiction
...,...,...,...,...,...,...,...
23417,Asfixia,Debolsillo,Chuck Palahniuk,2006,Asfixia Chuck Palahniuk Debolsillo 2006,4,General
23418,Asfixia,Debolsillo,Chuck Palahniuk,2006,Asfixia Chuck Palahniuk Debolsillo 2006,4,General
23419,El Dia Que Nietzsche Lloró,Planeta,Irvin D. Yalom,2006,El Dia Que Nietzsche Lloró Irvin D. Yalom Plan...,4,General
23420,El Dia Que Nietzsche Lloró,Planeta,Irvin D. Yalom,2006,El Dia Que Nietzsche Lloró Irvin D. Yalom Plan...,4,General


In [6]:
# Drop Columns
df.drop(["title","genre_cluster","publisher_name","author_name","published_year"],axis=1,inplace=True)

In [7]:
df

Unnamed: 0,combined_metadata,genres
0,The World's First Love: Mary Mother of God Fu...,Romance
1,The Illuminati Larry Burkett Thomas Nelson 2004,General
2,Cliffs Notes on Aristophanes' Lysistrata The ...,General
3,Cliffs Notes on Aristophanes' Lysistrata The ...,General
4,Life Is a Dream and Other Spanish Classics (Er...,Fiction
...,...,...
23417,Asfixia Chuck Palahniuk Debolsillo 2006,General
23418,Asfixia Chuck Palahniuk Debolsillo 2006,General
23419,El Dia Que Nietzsche Lloró Irvin D. Yalom Plan...,General
23420,El Dia Que Nietzsche Lloró Irvin D. Yalom Plan...,General


In [8]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding

In [9]:
X = df["combined_metadata"]
y = df["genres"]

In [10]:
# Text processing (TF-IDF example)
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X).toarray()

In [11]:
# Label encoding or multi-label binarization
label_binarizer = LabelBinarizer()
y_encoded = label_binarizer.fit_transform(y)

In [12]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_encoded, test_size=0.2, random_state=42)


In [13]:
# Model
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_tfidf.shape[1],)),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(y_encoded.shape[1], activation='sigmoid') 
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [14]:
# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Compile model
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=[
        tf.keras.metrics.F1Score(), 
        "accuracy",
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ],
)

model.summary()

In [15]:
# Training
model.fit(X_train, y_train, epochs=50, validation_data=(X_test, y_test), batch_size=32)

Epoch 1/50
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 58ms/step - accuracy: 0.7088 - f1_score: 0.1955 - loss: 1.2883 - precision: 0.2002 - recall: 0.8498 - val_accuracy: 0.9381 - val_f1_score: 0.6768 - val_loss: 0.2775 - val_precision: 0.4592 - val_recall: 0.9714
Epoch 2/50
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 51ms/step - accuracy: 0.9570 - f1_score: 0.7437 - loss: 0.1802 - precision: 0.4094 - recall: 0.9835 - val_accuracy: 0.9624 - val_f1_score: 0.8919 - val_loss: 0.1701 - val_precision: 0.4974 - val_recall: 0.9855
Epoch 3/50
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 52ms/step - accuracy: 0.9855 - f1_score: 0.9360 - loss: 0.0575 - precision: 0.4499 - recall: 0.9985 - val_accuracy: 0.9626 - val_f1_score: 0.9243 - val_loss: 0.1665 - val_precision: 0.4654 - val_recall: 0.9868
Epoch 4/50
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 51ms/step - accuracy: 0.9926 - f1_score: 0.9796 - loss: 0.

<keras.src.callbacks.history.History at 0x174c1e43560>

In [16]:
# Evaluate the model
loss, f1_score, accuracy, precision, recall = model.evaluate(X_test, y_test)

print(f"Loss: {loss}")
print(f"F1 Score: {f1_score}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")


[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9648 - f1_score: 0.9053 - loss: 0.5165 - precision: 0.9458 - recall: 0.9659
Loss: 0.4203730821609497
F1 Score: [0.93749994 0.9272727  0.88888884 0.94883716 1.         0.93220335
 0.9629629  0.97435886 0.96471983 0.8571428  0.9401709  0.9789473
 0.9621621  0.96       0.9356223  0.9137055  0.9473684  0.8
 0.9103448  0.98277897 0.9333333 ]
Accuracy: 0.9649946689605713
Precision: 0.9449541568756104
Recall: 0.9673425555229187


In [None]:
# Save the model
model.save("book_genre_model.keras")

In [None]:
import pickle

with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)
with open("label_binarizer.pkl", "wb") as f:
    pickle.dump(label_binarizer, f)


# **Test the Model with Some Sample Data (Manual Testing)**

In [17]:
import numpy as np
from tensorflow.keras.models import load_model # type: ignore
import pickle

# Load the trained model, vectorizer, and label binarizer
model = load_model("book_genre_model.keras")
with open("vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)
with open("label_binarizer.pkl", "rb") as f:
    label_binarizer = pickle.load(f)

# Sample input for testing
title = "The Great Adventure"
author = "John Doe"
publisher_name = "Adventure Press"
published_year = "2023"

# Prepare the input data by combining the fields
def prepare_input(title=None, author=None, publisher=None, year=None):
    title = title if title else ""
    author = author if author else ""
    publisher = publisher if publisher else ""
    year = year if year else "Unknown"
    return f"{title} {author} {publisher} {year}"

# Combine the fields
single_input = prepare_input(title=title, author=author, publisher=publisher_name, year=published_year)

# Vectorize the input using the same vectorizer as used during training
single_input_tfidf = vectorizer.transform([single_input]).toarray()

# Predict the genre
prediction = model.predict(single_input_tfidf)

# Decode the prediction output using the label_binarizer
predicted_genre = label_binarizer.inverse_transform((prediction > 0.5).astype(int))

# Print the predicted genre(s)
print(f"Predicted Genres: {', '.join(predicted_genre)}" if len(predicted_genre) > 0 else "No Genre Predicted")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
Predicted Genres: Children's Literature
