In [None]:
  import pandas as pd
  import re
  import numpy as np
  import nltk
  from nltk.stem import WordNetLemmatizer
  # WordNet data is required by the lemmatizer used in `format` below.
  nltk.download('wordnet')
  lemmatizer = WordNetLemmatizer()


  # Load the book catalogue; the CSV is read as ISO-8859-1.
  file_path = '/content/drive/MyDrive/Recommender/bookData.csv'
  df = pd.read_csv(file_path, encoding='iso-8859-1')

  # Each document is the synopsis followed by the publisher name.
  # Both columns are NaN-filled: a single missing publisher would otherwise turn
  # the whole concatenated document into NaN and make `format(doc)` fail on a float.
  synopses = df['Synopsis'].fillna('')
  publishers = df['publisher'].fillna('')
  synopses = synopses + ' ' + publishers


  def format(text):
      """Turn raw text into a list of lemmatized, lower-case word tokens.

      Apostrophes are removed and hyphens become spaces before the text is
      lower-cased and split into ``\\w+`` runs; every token is then passed
      through the module-level WordNet ``lemmatizer``.

      NOTE(review): this name shadows the builtin ``format``; it is kept because
      other cells call it by this name.
      """
      cleaned = text.replace("'", "").replace("-", " ").lower()
      tokens = re.findall(r'\b\w+\b', cleaned)
      return list(map(lemmatizer.lemmatize, tokens))

  word_set = set()
  for doc in synopses:
      words = format(doc)
      word_set.update(words)

  unique_words = sorted(word_set)

  word_index = {word: idx for idx, word in enumerate(unique_words)}

  num_docs = len(synopses)
  num_words = len(unique_words)
  matrix = [[0] * num_words for i in range(num_docs)]

  for doc_idx, doc in enumerate(synopses):
      words = format(doc)
      for word in words:
          col_idx = word_index[word]
          matrix[doc_idx][col_idx] += 1

  term_matrix_df = pd.DataFrame(matrix, columns=unique_words)
  print(term_matrix_df.shape)
  if 'isbn' in df.columns:
      term_matrix_df.insert(0, 'ISBN', df['isbn'])
  else:
      print("Column 'isbn' not found in the DataFrame")

term_matrix_df.head()


[nltk_data] Downloading package wordnet to /root/nltk_data...


(114, 2243)


Unnamed: 0,ISBN,10,103,12,14th,150,16,1896,1920s,1925,...,york,yorker,you,young,younger,your,zellweger,zilpah,zoo,zurer
0,440234743,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,971880107,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,345417623,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,446310786,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,671027360,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Mount Google Drive so the '/content/drive/...' paths used by this notebook resolve.
# NOTE(review): the first cell already reads bookData.csv from /content/drive, so on a
# fresh runtime this cell has to be executed before it — consider moving it to the top.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Raw term counts as an array (one row per book, one column per vocabulary word).
matrix_np = np.array(matrix)

# Term frequency: each row of counts divided by that row's total.
# Rows whose total is zero are left as all zeros.
matrixTF = np.zeros((num_docs, num_words), dtype=float)
for doc_pos, counts in enumerate(matrix_np):
    total = counts.sum()
    if total != 0:
        matrixTF[doc_pos] = counts / total

# Document frequency: number of books in which each word occurs at least once.
df_count = (matrix_np > 0).sum(axis=0)

# Smoothed inverse document frequency, then the TF-IDF weights.
idf = 1 + np.log((num_docs + 1) / (df_count + 1))
tfidf = matrixTF * idf
tfidf_df = pd.DataFrame(tfidf, columns=unique_words)

# Prepend the ISBN so each TF-IDF row can be matched to a book.
if 'isbn' not in df.columns:
    print("Column 'isbn' not found in the DataFrame")
else:
    tfidf_df.insert(0, 'ISBN', df['isbn'])

tfidf_df.head()


Unnamed: 0,ISBN,10,103,12,14th,150,16,1896,1920s,1925,...,york,yorker,you,young,younger,your,zellweger,zilpah,zoo,zurer
0,440234743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,971880107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.080334,0.0,0.0,0.0,0.0,0.0,0.0
2,345417623,0.0,0.0,0.0,0.066471,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,446310786,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,671027360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074291


In [None]:
user_history_file_path = '/content/drive/MyDrive/Recommender/UserHistoricalView.csv'
user_history_df = pd.read_csv(user_history_file_path)

# Build one profile per user by summing the TF-IDF rows of every book in that
# user's viewing history. `result` keeps the user's ISBN list next to the summed
# row; `output_rows` collects the same summed rows for the final DataFrame.
result = {}
output_rows = []
for userid in user_history_df['userid'].unique():
    user_isbns = user_history_df['isbn'][user_history_df['userid'] == userid].tolist()

    # One lookup per history entry, in history order; ISBNs that have no row in
    # tfidf_df produce an empty frame and are skipped.
    lookups = (tfidf_df[tfidf_df['ISBN'] == isbn] for isbn in user_isbns)
    corresponding_rows = [match for match in lookups if not match.empty]
    if not corresponding_rows:
        # None of this user's books is in the catalogue: no profile is created.
        continue

    # The ISBN column takes part in the sum as well; it is dropped further down.
    combined_row = pd.concat(corresponding_rows).sum()
    combined_row['userid'] = userid
    result[userid] = {'isbns': user_isbns, 'combined_row': combined_row}
    output_rows.append(combined_row)

# One row per user, with 'userid' first and the vocabulary columns after it.
output_df = pd.DataFrame(output_rows).drop(columns=['ISBN'])
word_columns = [col for col in output_df.columns if col != 'userid']
output_df = output_df[['userid'] + word_columns]

output_df

# Save the output DataFrame to a CSV file
# output_file_path = '/content/user_profiles1.csv'
# output_df.to_csv(output_file_path, index=False)

Unnamed: 0,userid,10,103,12,14th,150,16,1896,1920s,1925,...,york,yorker,you,young,younger,your,zellweger,zilpah,zoo,zurer
0,11676,0.0,0.0,0.0,0.066471,0.114813,0.0,0.0,0.0,0.0,...,0.094321,0.0,0.0,0.0,0.0,0.0,0.0,0.142304,0.0,0.0
1,16795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,22625,0.0,0.099055,0.046347,0.0,0.0,0.0,0.0,0.0,0.0,...,0.070352,0.0,0.0,0.027269,0.0,0.0,0.0,0.0,0.0,0.0
3,35859,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.036248,0.0,0.0,0.0,0.142304,0.0,0.0
4,95359,0.0,0.099055,0.046347,0.0,0.0,0.0,0.0,0.074291,0.074291,...,0.0,0.074291,0.0,0.07098,0.0,0.0,0.0,0.0,0.0,0.0
5,104636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.036248,0.0,0.0,0.0,0.0,0.0,0.0
6,110912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.09715,0.0,0.0,0.0,0.059568,0.0,0.0,0.0
7,204864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.14341,0.0,0.0,0.034562,0.0,0.0,0.054027,0.142304,0.054911,0.0
8,271448,0.0,0.0,0.046347,0.0,0.0,0.0,0.0,0.0,0.0,...,0.050654,0.0,0.0,0.089193,0.0,0.120281,0.0,0.0,0.0,0.0


In [None]:
import numpy as np
import pandas as pd

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors ``v1`` and ``v2``.

    The result is the dot product divided by the product of the two Euclidean
    norms. If either vector has zero norm the similarity is defined as ``0``
    instead of dividing by zero.
    """
    numerator = np.dot(v1, v2)
    length_1, length_2 = np.linalg.norm(v1), np.linalg.norm(v2)
    if not (length_1 != 0 and length_2 != 0):
        return 0
    return numerator / (length_1 * length_2)

# Plain arrays for the similarity loop: one row per user profile and one row
# per catalogue book, both without their identifier column.
user_profiles = output_df.set_index('userid').values
item_vectors = tfidf_df.drop(columns=['ISBN']).values

# Identifiers in the same row order as the two arrays above.
user_ids = output_df['userid'].values
item_ids = tfidf_df['ISBN'].values

# similarity_matrix[i, j] = cosine similarity between user i and book j.
# It starts at zero, and entries for books the user has already viewed are
# never filled in, so those books keep a score of 0.
num_users = user_profiles.shape[0]
num_items = item_vectors.shape[0]
similarity_matrix = np.zeros((num_users, num_items))

for i, user_id in enumerate(user_ids):
    read_books = user_history_df['isbn'][user_history_df['userid'] == user_id].values

    for j, book_id in enumerate(item_ids):
        if book_id in read_books:
            continue
        similarity_matrix[i, j] = cosine_similarity(user_profiles[i], item_vectors[j])


# Label rows by user id and columns by ISBN.
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=output_df['userid'],
    columns=tfidf_df['ISBN'],
)


# similarity_df.head()
# Save the output DataFrame to a CSV file
# output_file_path = '/content/similarity_matrix1.csv'
# similarity_df.to_csv(output_file_path, index=False)


In [None]:
# Number of books exported per user.
num_recommendations = 10

recommendations_output = []
for user in similarity_df.index:
    # Highest-scoring books for this user. Books the user has already viewed were
    # given a score of 0 when the similarity matrix was built, so the best match is
    # a genuine recommendation and must not be skipped (the previous
    # `head(11)...[1:]` dropped it and exported ranks 2-11 instead).
    top_books = similarity_df.loc[user].sort_values(ascending=False).head(num_recommendations)

    for book, similarity_value in top_books.items():
        # Title of the first catalogue row with this ISBN; empty string if none.
        titles = df[df['isbn'] == book]['booktitle']
        book_title = titles.values[0] if not titles.empty else ''

        recommendations_output.append([user, book, book_title, similarity_value])

# Create a DataFrame from the recommendations output
recommendations_df = pd.DataFrame(recommendations_output, columns=['User ID', 'Book’s ISBN', 'Book’s Title', 'similarity values'])

# Save the recommendations DataFrame to a CSV file
output_file_path = '/content/Part2_File3_Recommendation_Group7.csv'
recommendations_df.to_csv(output_file_path, index=False)

# Display the recommendations DataFrame
# print(recommendations_df)


In [None]:
# How many books to recommend per user.
top_n = 10

# user id -> ISBNs of that user's `top_n` highest-scoring books, best first.
recommendations = {
    user: similarity_df.loc[user].sort_values(ascending=False).head(top_n).index.tolist()
    for user in similarity_df.index
}

# Display recommendations
for user, books in recommendations.items():
    print(f"Top {top_n} recommendations for user {user}: {books}")


Top 10 recommendations for user 11676: ['590353403', '385722206', '345337662', '1400034779', '014028009X', '316769487', '375726403', '439064872', '60987103', '60959037']
Top 10 recommendations for user 16795: ['043935806X', '014028009X', '316569321', '440214041', '385335482', '316096199', '446310786', '316769487', '345337662', '61009059']
Top 10 recommendations for user 22625: ['590353403', '439064872', '385722206', '014028009X', '385335482', '439139600', '440213525', '316569321', '61009059', '142001740']
Top 10 recommendations for user 35859: ['590353403', '440213525', '440224764', '316769487', '439064864', '345337662', '439136350', '1400034779', '014028009X', '385722206']
Top 10 recommendations for user 95359: ['60987103', '380789035', '385722206', '345337662', '1400034779', '014028009X', '059035342X', '60915544', '312195516', '449005615']
Top 10 recommendations for user 104636: ['590353403', '439136350', '043935806X', '439064864', '440213525', '440224764', '446608955', '385722206', 

In [13]:
# Ground truth: the books each test user actually chose.
test_answers_df = pd.read_csv('/content/drive/MyDrive/Recommender/TestUserAnswers.csv')

# userid -> list of that user's ISBNs
grouped_isbns = test_answers_df.groupby('userid')['isbn'].apply(list).to_dict()

for user, isbns in grouped_isbns.items():
    print(f"User {user}: {isbns}")

# Split every user's recommendations into hits (present in the ground truth)
# and misses. Users without any ground-truth entry are left out of both dicts.
correctly_predicted = {}
incorrectly_predicted = {}

for user, recommended in recommendations.items():
    relevant = grouped_isbns.get(user)
    if relevant is None:
        continue

    hits, misses = [], []
    for book in recommended:
        (hits if book in relevant else misses).append(book)

    correctly_predicted[user] = hits
    incorrectly_predicted[user] = misses

# Print results
print("Correctly Predicted Books:")
for user, books in correctly_predicted.items():
    print(f"User {user}: {books}")

print("\nIncorrectly Predicted Books:")
for user, books in incorrectly_predicted.items():
    print(f"User {user}: {books}")


User 11676: ['60502258', '60987103', '61009059', '014028009X', '142000205', '156027321', '312278586', '312291639', '312305060', '316776963', '345313860', '345337662', '345370775', '375706771', '375707972', '375726403', '375727345', '380789019', '385484518', '385504209', '385722206', '439064872', '439136369', '439139597', '440221471', '440225701', '440226430', '440234743', '440236673', '044023722X', '440241073', '446364193', '044651652X', '446605239', '446608955', '553572997', '590353403', '671510053', '684872153', '684874350', '743237188', '743418174', '786868716', '804114986', '812550706', '842329129', '971880107']
User 16795: ['60976845', '61009059', '142000205', '312195516', '312291639', '312305060', '316569321', '316769487', '345339703', '345342968', '375706771', '375727345', '440214041', '440222656', '440226430', '446364193', '044651652X', '446608955', '446610038', '446672211', '671001795', '068484477X', '684874350', '743237188', '743418174', '786868716', '842329129']
User 22625: 

In [14]:
# Overall (micro-averaged) precision / recall / F1 across all users.
total_correct = sum(len(books) for books in correctly_predicted.values())
total_relevant_items = sum(len(isbns) for isbns in grouped_isbns.values())
# Derived from the recommendations themselves instead of a hard-coded 90, so the
# figure stays correct if the number of users or `top_n` changes.
total_recommended_items = sum(len(books) for books in recommendations.values())

# Precision: share of recommended books that are in the ground truth.
precision = total_correct / total_recommended_items if total_recommended_items else 0.0

# Recall: share of ground-truth books that were recommended.
recall = total_correct / total_relevant_items if total_relevant_items else 0.0

# F1: harmonic mean of the two; defined as 0 when both are 0 to avoid dividing by zero.
if precision + recall:
    f1_score = 2 * (precision * recall) / (precision + recall)
else:
    f1_score = 0.0

# Print metrics
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)

# Store the results
# metrics_df = pd.DataFrame([['Overall', precision, recall, fmeasure]], columns=['Metric', 'Precision', 'Recall', 'F-measure'])
# metrics_df.to_csv('Part2_File4_Evaluation_Group6.csv', index=False)
# print("Overall metrics saved to 'Part2_File4_Evaluation_Group6.csv'")

Precision: 0.26666666666666666
Recall: 0.14906832298136646
F1-Score: 0.19123505976095617
