# Book Data Processing


In [1]:
  import pandas as pd
  import re
  import numpy as np
  import nltk
  from nltk.stem import WordNetLemmatizer
  nltk.download('wordnet')
  lemmatizer = WordNetLemmatizer()


  file_path = '/content/bookData.csv'
  df = pd.read_csv(file_path, encoding='iso-8859-1')


  synopses = df['Synopsis'].fillna('')


  def format(text):
      text = text.replace("'", "").replace("-", " ")
      words = re.findall(r'\b\w+\b', text.lower())
      return [lemmatizer.lemmatize(word) for word in words]



  word_set = set()
  for doc in synopses:
      words = format(doc)
      word_set.update(words)

  unique_words = sorted(word_set)

  word_index = {word: idx for idx, word in enumerate(unique_words)}

  num_docs = len(synopses)
  num_words = len(unique_words)
  matrix = [[0] * num_words for i in range(num_docs)]

  for doc_idx, doc in enumerate(synopses):
      words = format(doc)
      for word in words:
          col_idx = word_index[word]
          matrix[doc_idx][col_idx] = 1

  term_matrix_df = pd.DataFrame(matrix, columns=unique_words)
  print(term_matrix_df.shape)
  if 'isbn' in df.columns:
      term_matrix_df.insert(0, 'ISBN', df['isbn'])
  else:
      print("Column 'isbn' not found in the DataFrame")

  term_matrix_df.head()



[nltk_data] Downloading package wordnet to /root/nltk_data...


(114, 2197)


Unnamed: 0,ISBN,10,103,12,14th,150,16,1896,1920s,1925,...,york,yorker,you,young,younger,your,zellweger,zilpah,zoo,zurer
0,440234743,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,971880107,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,345417623,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,446310786,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,671027360,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# User Data Profiling with Binary Feature

In [2]:

user_history_file_path = '/content/UserHistoricalView.csv'
user_history_df = pd.read_csv(user_history_file_path)
user_history_df.head()

result = {}
for userid in user_history_df['userid'].unique():
    user_isbns = user_history_df[user_history_df['userid'] == userid]['isbn'].tolist()
    corresponding_rows = []
    for isbn in user_isbns:
        row = term_matrix_df[term_matrix_df['ISBN'] == isbn]
        if not row.empty:
            corresponding_rows.append(row)
    if corresponding_rows:
        combined_row = pd.concat(corresponding_rows).sum()

        result[userid] = {'isbns': user_isbns, 'combined_row': combined_row}

# Create a list to store combined rows for each user, then create the DataFrame
output_rows = []
for userid, data in result.items():
    combined_row = data['combined_row']
    del combined_row['ISBN']
    for i in range(len(combined_row)):

      if int(combined_row[i]) > 0:
        combined_row[i] = 1
      else:
        combined_row[i] = 0
    combined_row['userid'] = userid
    output_rows.append(combined_row)

output_df = pd.DataFrame(output_rows)


# Reorder the columns
output_df = output_df[['userid'] + [col for col in output_df.columns if col != 'userid']]

#print(output_df)

# Save the output DataFrame to a CSV file
output_file_path = '/content/Part1_File1_Profile_Group7.csv'
output_df.to_csv(output_file_path, index=False)

  if int(combined_row[i]) > 0:
  combined_row[i] = 0
  combined_row[i] = 1
  if int(combined_row[i]) > 0:
  combined_row[i] = 0
  combined_row[i] = 1
  if int(combined_row[i]) > 0:
  combined_row[i] = 0
  combined_row[i] = 1
  if int(combined_row[i]) > 0:
  combined_row[i] = 0
  combined_row[i] = 1
  if int(combined_row[i]) > 0:
  combined_row[i] = 0
  combined_row[i] = 1
  if int(combined_row[i]) > 0:
  combined_row[i] = 0
  combined_row[i] = 1
  if int(combined_row[i]) > 0:
  combined_row[i] = 0
  combined_row[i] = 1
  if int(combined_row[i]) > 0:
  combined_row[i] = 0
  combined_row[i] = 1
  if int(combined_row[i]) > 0:
  combined_row[i] = 0
  combined_row[i] = 1


In [3]:
output_df

Unnamed: 0,userid,10,103,12,14th,150,16,1896,1920s,1925,...,york,yorker,you,young,younger,your,zellweger,zilpah,zoo,zurer
0,11676,0,0,0,1,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
1,16795,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22625,0,1,1,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
3,35859,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
4,95359,0,1,1,0,0,0,0,1,1,...,0,1,0,1,0,0,0,0,0,0
5,104636,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
6,110912,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
7,204864,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,1,1,0
8,271448,0,0,1,0,0,0,0,0,0,...,1,0,0,1,0,1,0,0,0,0


# Jaccard Similarity Calculation

In [4]:
import numpy as np
import pandas as pd

# def jaccard_similarity(v1, v2):
#     intersection = np.logical_and(v1, v2)
#     union = np.logical_or(v1, v2)
#     similarity = np.sum(intersection) / np.sum(union)
#     return similarity


def jaccard_similarity(v1, v2):
    # Initialize intersection and union counts
    intersection_count = 0
    union_count = 0

    # Iterate over both vectors simultaneously
    for i in range(len(v1)):
        # Convert to boolean to ensure exact comparison as done by np.logical_and and np.logical_or
        v1_bool = bool(v1[i])
        v2_bool = bool(v2[i])

        # Check for intersection (both are True/1)
        if v1_bool and v2_bool:
            intersection_count += 1

        # Check for union (at least one is True/1)
        if v1_bool or v2_bool:
            union_count += 1

    # Calculate Jaccard similarity
    if union_count == 0:
        return 0.0  # Handle edge case where both vectors are all zeros
    similarity = intersection_count / union_count
    return similarity





# Extract user profiles and item vectors
user_profiles = output_df.set_index('userid').values
item_vectors = term_matrix_df.drop(columns=['ISBN']).values

# Create a mask to filter out already read books
user_ids = output_df['userid'].values
item_ids = term_matrix_df['ISBN'].values

# Initialize the similarity matrix
num_users = user_profiles.shape[0]
num_items = item_vectors.shape[0]
similarity_matrix = np.zeros((num_users, num_items))

for i in range(num_users):
    user_id = user_ids[i]
    # Get the list of books already read by the current user
    read_books = user_history_df[user_history_df['userid'] == user_id]['isbn'].values
    # print(read_books)

    for j in range(num_items):
        book_id = item_ids[j]
        # Check if the book has already been read by the user
        if book_id not in read_books:
            similarity_matrix[i, j] = jaccard_similarity(user_profiles[i], item_vectors[j])
        else:
            similarity_matrix[i, j] = 0  # or np.nan or some other marker

# Create DataFrame for similarity matrix with user and item IDs
similarity_df = pd.DataFrame(similarity_matrix,
                             index=output_df['userid'],
                             columns=term_matrix_df['ISBN'])

# Ensure userid is treated as a string before appending it
similarity_df['userid'] = similarity_df.index.astype(str)

# Reorder the columns
similarity_df = similarity_df[['userid'] + [col for col in similarity_df.columns if col != 'userid']]

# Display the similarity matrix
#print(similarity_df)

# Save the output DataFrame to a CSV file
output_file_path = '/content/Part1_File2_SimMatrix_Group7.csv'
similarity_df.to_csv(output_file_path, index=False)


In [5]:
similarity_df

ISBN,userid,440234743,971880107,345417623,446310786,671027360,60976845,044021145X,60938455,446672211,...,375727345,312924585,684872153,316601950,439139597,439064864,043935806X,440220602,671001795,440222656
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11676,11676,0.038664,0.024648,0.0,0.038062,0.030822,0.021201,0.046552,0.0,0.0,...,0.021314,0.046312,0.035524,0.028021,0.026502,0.0,0.051926,0.023132,0.0,0.02669
16795,16795,0.053398,0.050505,0.044444,0.097087,0.045872,0.051546,0.093897,0.030151,0.053097,...,0.063492,0.048889,0.07732,0.070352,0.040201,0.063725,0.076271,0.046875,0.050505,0.051813
22625,22625,0.04336,0.041551,0.044041,0.045093,0.036649,0.030471,0.06366,0.021918,0.041026,...,0.033613,0.057592,0.047222,0.0,0.03022,0.0,0.0,0.030726,0.030137,0.042135
35859,35859,0.062731,0.048872,0.051546,0.064516,0.038194,0.045627,0.0,0.029851,0.047458,...,0.042146,0.069686,0.072797,0.044118,0.037313,0.066667,0.101695,0.042146,0.044944,0.045802
95359,95359,0.050595,0.036145,0.050992,0.0,0.028249,0.036585,0.063401,0.020958,0.047619,...,0.040123,0.059829,0.055046,0.0,0.033133,0.044379,0.0,0.033742,0.042424,0.027273
104636,104636,0.048,0.037838,0.045802,0.041451,0.035897,0.027027,0.0,0.021448,0.0,...,0.027248,0.05641,0.040541,0.037333,0.040761,0.065041,0.096939,0.032877,0.026738,0.038356
110912,110912,0.050992,0.037249,0.045699,0.058496,0.046575,0.040698,0.054496,0.031609,0.045333,...,0.028986,0.048387,0.052174,0.036723,0.0,0.0,0.070866,0.031977,0.037249,0.037791
204864,204864,0.052632,0.037135,0.034653,0.048718,0.035264,0.029255,0.061224,0.021053,0.034398,...,0.03504,0.071611,0.065217,0.028571,0.034483,0.049869,0.050481,0.026738,0.039894,0.029333
271448,271448,0.047486,0.045714,0.036939,0.032258,0.032086,0.028409,0.077135,0.019663,0.0,...,0.04058,0.045093,0.048571,0.039216,0.042857,0.062323,0.0,0.037572,0.019499,0.037356


# Recommending Top 5 books for each user

In [6]:
import pandas as pd

# Load the similarity matrix and book data
similarity_df = pd.read_csv('/content/Part1_File2_SimMatrix_Group7.csv')
book_data_df = pd.read_csv('/content/bookData.csv', encoding='iso-8859-1')
user_history_df = pd.read_csv('/content/UserHistoricalView.csv')

# Store recommendations for each user
recommendations = []
recommendations2 = {}

for user in similarity_df['userid'].unique():
    # Get all books and their similarity scores for the current user
    user_similarity_scores = similarity_df[similarity_df['userid'] == user].drop(columns=['userid']).iloc[0]

    # Get the list of books the user has already read
    read_books = user_history_df[user_history_df['userid'] == user]['isbn'].tolist()

    # Filter out the books the user has already read
    unread_books = user_similarity_scores[~user_similarity_scores.index.isin(read_books)]

    # Sort the unread books by their similarity score in descending order
    top_books = unread_books.sort_values(ascending=False).head(5)
    recommendations2[user] = top_books.index.tolist()

    # Get book titles for the top 5 books and store the recommendation
    for isbn, similarity in top_books.items():
        book_title = book_data_df[book_data_df['isbn'] == isbn]['booktitle'].values[0]
        recommendations.append({
            'User ID': user,
            'Book\'s ISBN': isbn,
            'Book\'s Title': book_title,
            'Similarity Value': similarity
        })

# Convert the recommendations list into a DataFrame
recommendations_df = pd.DataFrame(recommendations)

# Save the DataFrame to a CSV file with the required headers
output_file_path = '/content/Part1_File3_Recommendation_Group7.csv'
recommendations_df.to_csv(output_file_path, index=False)

# Display the DataFrame
print(recommendations_df.head())


   User ID Book's ISBN                                       Book's Title  \
0    11676   590353403     Harry Potter and the Sorcerer s Stone (Book 1)   
1    11676   345337662                         Interview with the Vampire   
2    11676   439136369  Harry Potter and the Prisoner of Azkaban (Book 3)   
3    11676   440211727                                     A Time to Kill   
4    11676   439064872   Harry Potter and the Chamber of Secrets (Book 2)   

   Similarity Value  
0          0.114545  
1          0.064784  
2          0.061750  
3          0.056604  
4          0.054386  


In [7]:
recommendations_df

Unnamed: 0,User ID,Book's ISBN,Book's Title,Similarity Value
0,11676,590353403,Harry Potter and the Sorcerer s Stone (Book 1),0.114545
1,11676,345337662,Interview with the Vampire,0.064784
2,11676,439136369,Harry Potter and the Prisoner of Azkaban (Book 3),0.06175
3,11676,440211727,A Time to Kill,0.056604
4,11676,439064872,Harry Potter and the Chamber of Secrets (Book 2),0.054386
5,16795,446310786,To Kill a Mockingbird,0.097087
6,16795,014028009X,Bridget Jones s Diary,0.094787
7,16795,345337662,Interview with the Vampire,0.094262
8,16795,044021145X,The Firm,0.093897
9,16795,068484477X,STONES FROM THE RIVER,0.090452
