In [23]:
import pandas as pd

In [24]:
df = pd.read_csv("books.csv", on_bad_lines='skip', engine='python')


In [25]:
df['language_code'].unique()

array(['eng', 'en-US', 'fre', 'spa', 'en-GB', 'mul', 'grc', 'enm',
       'en-CA', 'ger', 'jpn', 'ara', 'nl', 'zho', 'lat', 'por', 'srp',
       'ita', 'rus', 'msa', 'glg', 'wel', 'swe', 'nor', 'tur', 'gla',
       'ale'], dtype=object)

In [26]:
df['language_code'] = df['language_code'].apply(
    lambda x: 'eng' if str(x).startswith('en') or str(x) == 'eng' else x
)


In [27]:
df['language_code'].unique()

array(['eng', 'fre', 'spa', 'mul', 'grc', 'ger', 'jpn', 'ara', 'nl',
       'zho', 'lat', 'por', 'srp', 'ita', 'rus', 'msa', 'glg', 'wel',
       'swe', 'nor', 'tur', 'gla', 'ale'], dtype=object)

In [28]:
df.shape

(11119, 12)

In [29]:
print(df.columns)

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', '  num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher'],
      dtype='object')


In [30]:
df.columns = df.columns.str.strip()


In [31]:
print(df.columns)

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', 'num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher'],
      dtype='object')


In [32]:
df['title'] = df['title'].fillna("")
df['authors'] = df['authors'].fillna("")
df['publisher'] = df['publisher'].fillna("")
df['language_code'] = df['language_code'].fillna("")


In [33]:
df['content'] = (
    df['title'] + " " +
    df['authors'] + " " +
    df['publisher'] + " " +
    df['language_code']
)


In [34]:
df['content'].sample(5)

7617           Wind Child Nancy   Harding Pocket Books eng
10773    The Church in Emerging Culture: Five Perspecti...
8160     Blood And Gold (The Vampire Chronicles  #8) An...
2        Harry Potter and the Chamber of Secrets (Harry...
771      Baltasar and Blimunda JosÃ© Saramago/Giovanni P...
Name: content, dtype: object

In [35]:
df = df.dropna(subset=['content'])


In [36]:
df = df[df['content'].str.len() > 50]


In [37]:
df.shape

(9449, 13)

In [38]:
non_eng_count = (df['language_code'] != 'eng').sum()
print("Non-English rows:", non_eng_count)


Non-English rows: 440


In [39]:
df = df[df['language_code'] == 'eng']


In [40]:
df.shape

(9009, 13)

In [41]:
df['language_code'].value_counts()


language_code
eng    9009
Name: count, dtype: int64

In [42]:
df = df.dropna(subset=['content'])


In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000
)

tfidf_matrix = vectorizer.fit_transform(df['content'])


In [46]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(tfidf_matrix)


In [49]:
def recommend_books(book_title, df, similarity_matrix, top_n=5):
    # Find matches (case-insensitive, partial match)
    matches = df[df['title'].str.contains(book_title, case=False, na=False)]
    
    if matches.empty:
        return f"No book found matching '{book_title}'"
    
    idx = matches.index[0]  # take first match
    
    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    top_books = sim_scores[1:top_n+1]
    
    return df.iloc[[i[0] for i in top_books]][['title', 'authors', 'average_rating']]


In [50]:
recommend_books("hobbit", df, similarity_matrix)


Unnamed: 0,title,authors,average_rating
4282,The Lord of the Rings Millennium Edition Boxed...,J.R.R. Tolkien,4.5
4271,The Lord of the Rings / The Hobbit,J.R.R. Tolkien,4.59
724,The Lord of the Rings- 3 volumes set (The Lord...,J.R.R. Tolkien,4.5
1694,The Lord of the Rings (The Lord of the Rings ...,J.R.R. Tolkien,4.5
2041,The Lord of the Rings (The Lord of the Rings ...,J.R.R. Tolkien/Rob Inglis,4.5


In [52]:
df.to_csv("books_cleaned.csv", index=False)
