In [1]:
import pandas as pd
import re
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:

file_path = 'bookData.csv'
df = pd.read_csv(file_path, encoding='iso-8859-1')


synopses = df['Synopsis'].fillna('')
publishers = df['publisher']
synopses += ' ' + publishers


def format(text):
    text = text.replace("'", "").replace("-", " ")
    words = re.findall(r'\b\w+\b', text.lower())
    return [lemmatizer.lemmatize(word) for word in words]


word_set = set()
for doc in synopses:
    words = format(doc)
    word_set.update(words)

unique_words = sorted(word_set)

word_index = {word: idx for idx, word in enumerate(unique_words)}

num_docs = len(synopses)
num_words = len(unique_words)
matrix = [[0] * num_words for _ in range(num_docs)]

for doc_idx, doc in enumerate(synopses):
    words = format(doc)
    for word in words:
        col_idx = word_index[word]
        matrix[doc_idx][col_idx] += 1

term_matrix_df = pd.DataFrame(matrix, columns=unique_words)


if 'isbn' in df.columns:
    term_matrix_df.insert(0, 'ISBN', df['isbn'])
else:
    print("Column 'isbn' not found in the DataFrame")

term_matrix_df

Unnamed: 0,ISBN,10,103,12,14th,150,16,1896,1920s,1925,...,york,yorker,you,young,younger,your,zellweger,zilpah,zoo,zurer
0,440234743,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,971880107,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,345417623,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,446310786,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,671027360,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,439064864,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
110,043935806X,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
111,440220602,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
112,671001795,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Calculate term frequencies (TF)
tf_matrix = []
for i in range(num_docs):
    row_sum = sum(matrix[i])
    if row_sum == 0:
        tf_matrix.append([0] * num_words)
    else:
        tf_matrix.append([count / row_sum for count in matrix[i]])

# Calculate document frequencies (DF)
df_count = [0] * num_words
for j in range(num_words):
    for i in range(num_docs):
        if matrix[i][j] > 0:
            df_count[j] += 1

# Calculate inverse document frequency (IDF)
idf = [0] * num_words
for j in range(num_words):
    idf_value = (1 + num_docs) / (1 + df_count[j])

    # Custom logarithm function (natural log)
    log_value = 0
    x = (idf_value - 1) / (idf_value + 1)
    iteration_count = 20  # Number of terms to sum in the series (higher is more accurate)

    for n in range(1, iteration_count * 2, 2):
        log_value += (x**n) / n

    idf[j] = 1 + 2 * log_value  # 2 * log(x) approximates the natural log of (1 + x)

# Calculate TF-IDF manually
tfidf_matrix = []
for i in range(num_docs):
    tfidf_row = []
    for j in range(num_words):
        tfidf_value = tf_matrix[i][j] * idf[j]
        tfidf_row.append(tfidf_value)
    tfidf_matrix.append(tfidf_row)

tfidf_df = pd.DataFrame(tfidf_matrix, columns=unique_words)

if 'isbn' in df.columns:
    tfidf_df.insert(0, 'ISBN', df['isbn'])
else:
    print("Column 'isbn' not found in the DataFrame")

# output_file_path = 'tfidf_matrix1.csv'
# tfidf_df.to_csv(output_file_path, index=False)
tfidf_df



Unnamed: 0,ISBN,10,103,12,14th,150,16,1896,1920s,1925,...,york,yorker,you,young,younger,your,zellweger,zilpah,zoo,zurer
0,440234743,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000
1,971880107,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.080334,0.0,0.0,0.0,0.0,0.0,0.00000
2,345417623,0.0,0.0,0.000000,0.064922,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000
3,446310786,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000
4,671027360,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.07256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,439064864,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000
110,043935806X,0.0,0.0,0.045267,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.027269,0.0,0.0,0.0,0.0,0.0,0.00000
111,440220602,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000
112,671001795,0.0,0.0,0.000000,0.000000,0.112139,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00000


In [4]:
user_history_file_path = '/content/UserHistoricalView.csv'
user_history_df = pd.read_csv(user_history_file_path)
user_history_df.head()

result = {}
for userid in user_history_df['userid'].unique():
    user_isbns = user_history_df[user_history_df['userid'] == userid]['isbn'].tolist()
    corresponding_rows = []
    for isbn in user_isbns:
        row = tfidf_df[tfidf_df['ISBN'] == isbn]
        if not row.empty:
            corresponding_rows.append(row)
    if corresponding_rows:
        combined_row = pd.concat(corresponding_rows).sum()
        result[userid] = {'isbns': user_isbns, 'combined_row': combined_row}

output_rows = []
for userid, data in result.items():
    combined_row = data['combined_row']
    combined_row['userid'] = userid
    output_rows.append(combined_row)

output_df = pd.DataFrame(output_rows)
output_df.drop(columns=['ISBN'], inplace=True)

# Reorder the columns
output_df = output_df[['userid'] + [col for col in output_df.columns if col != 'userid']]

output_df.head()



# Save the output DataFrame to a CSV file
output_file_path = '/content/Part2_File1_Profile_Group7.csv'
output_df.to_csv(output_file_path, index=True)


In [5]:
import pandas as pd

def dot_product(v1, v2):
    return sum(x * y for x, y in zip(v1, v2))

def vector_norm(v):
    return sum(x ** 2 for x in v) ** 0.5

def cosine_similarity(v1, v2):
    dot_prod = dot_product(v1, v2)
    norm_v1 = vector_norm(v1)
    norm_v2 = vector_norm(v2)
    if norm_v1 == 0 or norm_v2 == 0:
        return 0  # Handle cases where a vector is all zeros
    return dot_prod / (norm_v1 * norm_v2)

# Extract user profiles and item vectors
user_profiles = output_df.set_index('userid').values.tolist()
item_vectors = tfidf_df.drop(columns=['ISBN']).values.tolist()

# Create a mask to filter out already read books
user_ids = output_df['userid'].values.tolist()
item_ids = tfidf_df['ISBN'].values.tolist()

num_users = len(user_profiles)
num_items = len(item_vectors)
similarity_matrix = [[0] * num_items for _ in range(num_users)]

for i in range(num_users):
    user_id = user_ids[i]
    read_books = user_history_df[user_history_df['userid'] == user_id]['isbn'].values.tolist()

    for j in range(num_items):
        book_id = item_ids[j]
        if book_id not in read_books:
            similarity_matrix[i][j] = cosine_similarity(user_profiles[i], item_vectors[j])
        else:
            similarity_matrix[i][j] = 0

similarity_df = pd.DataFrame(similarity_matrix,
                             index=output_df['userid'],
                             columns=tfidf_df['ISBN'])

#output_file_path = '/content/similarity_matrix1.csv'
output_file_path = '/content/Part2_File2_Model_Group7.csv'

similarity_df.to_csv(output_file_path, index=True)


In [6]:
similarity_df

ISBN,440234743,971880107,345417623,446310786,671027360,60976845,044021145X,60938455,446672211,142000205,...,375727345,312924585,684872153,316601950,439139597,439064864,043935806X,440220602,671001795,440222656
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11676,0.191029,0.147187,0.0,0.231365,0.182759,0.213336,0.229177,0.0,0.0,0.200096,...,0.125387,0.177176,0.188097,0.230109,0.183984,0.0,0.216841,0.156314,0.0,0.13297
16795,0.036566,0.029928,0.029657,0.080619,0.039451,0.052325,0.067684,0.025753,0.048731,0.065836,...,0.044937,0.028451,0.050374,0.068136,0.033444,0.055241,0.107201,0.038164,0.045155,0.039864
22625,0.164726,0.132584,0.140318,0.163478,0.16948,0.199054,0.221548,0.132304,0.135666,0.198274,...,0.150458,0.135189,0.159141,0.0,0.161149,0.0,0.0,0.127984,0.121075,0.141479
35859,0.209272,0.145236,0.115314,0.179073,0.138981,0.198614,0.0,0.103294,0.121453,0.153736,...,0.099983,0.149533,0.203659,0.169316,0.143435,0.23009,0.209448,0.151425,0.14733,0.135275
95359,0.162871,0.12848,0.155836,0.0,0.123489,0.187253,0.189103,0.138695,0.143043,0.169302,...,0.10611,0.142143,0.154014,0.0,0.150702,0.165955,0.0,0.11179,0.169509,0.086999
104636,0.190746,0.127961,0.13245,0.147872,0.136943,0.158015,0.0,0.087683,0.0,0.113197,...,0.087494,0.143553,0.152878,0.129308,0.187263,0.264889,0.319328,0.151518,0.114389,0.126748
110912,0.172254,0.125092,0.150686,0.228842,0.175565,0.199868,0.209219,0.177325,0.127353,0.189644,...,0.108448,0.134832,0.141726,0.181765,0.0,0.0,0.205134,0.13866,0.191973,0.124349
204864,0.214402,0.152313,0.134376,0.237336,0.173566,0.202612,0.239934,0.152741,0.125101,0.202731,...,0.139971,0.177573,0.272118,0.167832,0.169598,0.205415,0.175368,0.136612,0.175619,0.119558
271448,0.1561,0.133632,0.093455,0.13595,0.106772,0.208407,0.224568,0.080481,0.0,0.176722,...,0.159947,0.147887,0.179116,0.162921,0.155479,0.195538,0.0,0.127341,0.087911,0.137151


In [7]:
top_n = 10
recommendations = []
recommendations2 = {}

book_data_df = pd.read_csv('/content/bookData.csv', encoding='iso-8859-1')

for user in similarity_df.index:
    top_items = similarity_df.loc[user].sort_values(ascending=False).head(top_n)
    recommendations2[user] = top_items.index.tolist()

    for isbn, similarity in top_items.items():
        book_title = book_data_df[book_data_df['isbn'] == isbn]['booktitle'].values[0]
        recommendations.append({
            'User ID': user,
            'Book\'s ISBN': isbn,
            'Book\'s Title': book_title,
            'Model\'s Calculated Value': similarity
        })

recommendations_df = pd.DataFrame(recommendations)

output_file_path = '/content/Part2_File3_Recommendation_Group7.csv'
recommendations_df.to_csv(output_file_path, index=False)


recommendations_df


Unnamed: 0,User ID,Book's ISBN,Book's Title,Model's Calculated Value
0,11676,590353403,Harry Potter and the Sorcerer s Stone (Book 1),0.346404
1,11676,385722206,Balzac and the Little Chinese Seamstress,0.314497
2,11676,345337662,Interview with the Vampire,0.301223
3,11676,1400034779,The No. 1 Ladies Detective Agency (Today Show...,0.291726
4,11676,014028009X,Bridget Jones s Diary,0.286780
...,...,...,...,...
85,271448,044021145X,The Firm,0.224568
86,271448,014028009X,Bridget Jones s Diary,0.221735
87,271448,059035342X,Harry Potter and the Sorcerer s Stone (Harry P...,0.220563
88,271448,590353403,Harry Potter and the Sorcerer s Stone (Book 1),0.219570


In [8]:
test_answers_df = pd.read_csv('/content/TestUserAnswers.csv')

grouped_isbns = test_answers_df.groupby('userid')['isbn'].apply(list).to_dict()

# Evaluate recommendations
correctly_predicted = {}
incorrectly_predicted = {}

for user in recommendations2:
  if user in grouped_isbns:
    correct_predictions = [book for book in recommendations2[user] if book in grouped_isbns[user]]
    incorrect_predictions = [book for book in recommendations2[user] if book not in grouped_isbns[user]]

    correctly_predicted[user] = correct_predictions
    incorrectly_predicted[user] = incorrect_predictions

print("Correctly Predicted Books:")
for user, books in correctly_predicted.items():
  print(f"User {user}: {books}")

print("\nIncorrectly Predicted Books:")
for user, books in incorrectly_predicted.items():
  print(f"User {user}: {books}")


Correctly Predicted Books:
User 11676: ['590353403', '385722206', '345337662', '014028009X', '375726403', '439064872', '60987103']
User 16795: ['316569321', '440214041', '316769487', '61009059']
User 22625: ['385722206', '142001740']
User 35859: ['385722206']
User 95359: ['60915544', '312195516']
User 104636: ['440213525', '440224764', '014028009X']
User 110912: ['60959037']
User 204864: ['385722206', '375726403']
User 271448: ['316569321', '312195516']

Incorrectly Predicted Books:
User 11676: ['1400034779', '316769487', '60959037']
User 16795: ['043935806X', '014028009X', '385335482', '316096199', '446310786', '345337662']
User 22625: ['590353403', '439064872', '014028009X', '385335482', '439139600', '440213525', '316569321', '61009059']
User 35859: ['590353403', '440213525', '440224764', '316769487', '345337662', '439064864', '439136350', '1400034779', '014028009X']
User 95359: ['60987103', '380789035', '385722206', '345337662', '1400034779', '014028009X', '059035342X', '449005615']

In [14]:
# Calculate metrics
total_relevant_items = sum(len(v) for v in grouped_isbns.values())
total_recommended_items = 90  # 10 recommendations per user

# Calculate precision
precision = sum(len(v) for v in correctly_predicted.values()) / total_recommended_items

# Calculate recall
recall = sum(len(v) for v in correctly_predicted.values()) / total_relevant_items

# Calculate F1-score
f1_score = 2 * (precision * recall) / (precision + recall)

 # Round the metrics to 4 decimal places
precision = round(precision, 4)
recall = round(recall, 4)
f1_score = round(f1_score, 4)

# Print metrics
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)

# Store the results
metrics_df = pd.DataFrame([['Overall', precision, recall, f1_score]], columns=['Metric', 'Precision', 'Recall', 'F-measure'])
metrics_df.to_csv('Part2_File4_Evaluation_Group7.csv', index=False)
print("Overall metrics saved to 'Part2_File4_Evaluation_Group6.csv'")

Precision: 0.2667
Recall: 0.1491
F1-Score: 0.1912
Overall metrics saved to 'Part2_File4_Evaluation_Group6.csv'
