In [None]:
import json
from collections import Counter

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the dataset path used below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Tab-separated file with no header row, so the column names are supplied here.
file_path = '/content/drive/My Drive/266/booksummaries.txt'

df = (
    pd.read_csv(file_path, sep='\t', encoding='utf-8', header=None)
    .set_axis(['ID', 'MID', 'Title', 'Author', 'Date', 'Genres', 'Summary'], axis=1)
)

# Function to parse JSON safely
def parse_json_safe(json_str):
    """Decode a JSON string, falling back to an empty dict.

    Returns the decoded value when `json_str` is a string holding valid JSON.
    Non-string inputs (for example a missing cell) and strings that fail to
    decode both yield `{}`.
    """
    # Anything that is not text cannot be JSON; bail out before decoding.
    if not isinstance(json_str, str):
        return {}
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        # Malformed JSON is treated the same as a missing value.
        return {}

# Decode every raw 'Genres' cell in place (undecodable cells become {})
df['Genres'] = df['Genres'].map(parse_json_safe)

# Function to extract genres from the parsed JSON
def extract_genres(genre_dict):
    """Return the values of `genre_dict` as a list, or [] when it is not a dict."""
    return list(genre_dict.values()) if isinstance(genre_dict, dict) else []

# Turn each decoded 'Genres' dict into a plain list of its values
df['genre_list'] = df['Genres'].map(extract_genres)

# Drop the raw 'Genres' column and the 'MID' column
df = df.drop(['Genres', 'MID'], axis=1)

df.head(4)


Unnamed: 0,ID,Title,Author,Date,Summary,genre_list
0,620,Animal Farm,George Orwell,1945-08-17,"Old Major, the old boar on the Manor Farm, ca...","[Roman à clef, Satire, Children's literature, ..."
1,843,A Clockwork Orange,Anthony Burgess,1962,"Alex, a teenager living in near-future Englan...","[Science Fiction, Novella, Speculative fiction..."
2,986,The Plague,Albert Camus,1947,The text of The Plague is divided into five p...,"[Existentialism, Fiction, Absurdist fiction, N..."
3,1756,An Enquiry Concerning Human Understanding,David Hume,,The argument of the Enquiry proceeds by a ser...,[]


In [None]:
# Exploratory Analysis
df.shape[0]

16559

In [None]:
# Unique genres
# Flatten the per-book genre lists into one long list (reused later in the notebook).
flattened_genres = [genre for sublist in df['genre_list'] for genre in sublist]

# Counter tallies every genre in a single pass; calling list.count() once per
# distinct genre would rescan the whole list each time.
genre_counts = dict(Counter(flattened_genres))

# Convert to a DataFrame
genre_counts_df = pd.DataFrame(list(genre_counts.items()), columns=['Genre', 'Count'])

# Sort by count, most frequent first. The stable sort keeps genres with equal
# counts in first-seen order, so tied rows do not shuffle between runs.
sorted_genre_counts_df = (
    genre_counts_df
    .sort_values(by='Count', ascending=False, kind='stable')
    .reset_index(drop=True)
)

# Print the sorted DataFrame
print(sorted_genre_counts_df)

                   Genre  Count
0                Fiction   4747
1    Speculative fiction   4314
2        Science Fiction   2870
3                  Novel   2463
4                Fantasy   2413
..                   ...    ...
222     Indian chick lit      1
223  Fictional crossover      1
224      Utopian fiction      1
225             Pastiche      1
226          Field guide      1

[227 rows x 2 columns]


In [None]:
# Keep only the books whose genre list is non-empty
df = df[df['genre_list'].str.len() != 0]

In [None]:
df.shape[0]

12841

### Sample 5,000 records so that the 20% test split yields 1,000 predictions.

In [None]:
random_seed = 42

# Draw a 5000-row subset; the fixed seed makes the draw repeatable
df = df.sample(n=5000, random_state=random_seed)

## **1.0 Bag of Words Baseline**

In [None]:
import nltk
import re
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


In [None]:
# function for text cleaning
def clean_text(text):
    """Normalise raw text to lowercase, space-separated alphabetic words.

    Apostrophes are deleted (so "don't" becomes "dont"), every other
    non-letter character becomes a space, whitespace runs collapse to a single
    space, and the result is lowercased.
    """
    # Delete apostrophes first so contractions are not split into two tokens.
    apostrophe_free = re.sub("\'", "", text)
    # Anything outside A-Z / a-z is replaced by a space.
    letters_only = re.sub("[^a-zA-Z]", " ", apostrophe_free)
    # split()/join collapses whitespace runs and trims both ends.
    return ' '.join(letters_only.split()).lower()

In [None]:
# clean_text already takes a single string, so it is passed to apply directly
df['clean_summary'] = df['Summary'].apply(clean_text)

In [None]:
# Seeded so the same three rows are shown on every run (random_seed is set in the sampling cell above)
df[['Summary', 'clean_summary']].sample(3, random_state=random_seed)

Unnamed: 0,Summary,clean_summary
12110,"After the resurrection of Thomas Covenant, Co...",after the resurrection of thomas covenant cove...
8108,The story opens about a year after the events...,the story opens about a year after the events ...
12538,Though The End of Oil is not a chronological ...,though the end of oil is not a chronological h...


## **2.0 Drop stopwords, which don't contribute to signal**

In [None]:
# Download the NLTK 'stopwords' corpus that stopwords.words('english') reads in the next cell
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords

# Held as a set so the membership test in remove_stopwords is a hash lookup
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` without the whitespace-separated tokens found in `stop_words`."""
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept_tokens)


## TOGGLE THIS FOR THE STOPWORD OPTION

df['clean_summary'] = df['clean_summary'].apply(remove_stopwords)

### Converting Text to Features

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the set of genre labels; fit() hands back the fitted binarizer itself
multilabel_binarizer = MultiLabelBinarizer().fit(df['genre_list'])

# Encode every book's genre list as one 0/1 indicator row
y = multilabel_binarizer.transform(df['genre_list'])

In [None]:
# Hold out 20% of the cleaned summaries, with their label rows, for evaluation
xtrain, xtest, ytrain, ytest = train_test_split(
    df['clean_summary'],
    y,
    test_size=0.2,
    random_state=9,
)

In [None]:
# create TF-IDF features
vectorizer = TfidfVectorizer()
# Fit on the training split only, then reuse the fitted vectorizer for the test split
X_train_tfidf = vectorizer.fit_transform(xtrain)
X_test_tfidf = vectorizer.transform(xtest)

In [None]:
from sklearn.linear_model import LogisticRegression

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier

# Performance metric
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# The one-vs-rest wrapper fits a separate copy of the base estimator for each label column
lr = LogisticRegression()
clf = OneVsRestClassifier(estimator=lr)

In [None]:
# fit model on train data, using training X and Y (Plot summaries and genre vectors)
# X_train_tfidf: TF-IDF features of the training summaries; ytrain: the matching binarized genre rows
clf.fit(X_train_tfidf, ytrain)



In [None]:
# make predictions for validation set
# predict() yields hard 0/1 labels per genre; the thresholded variant further down uses predict_proba instead
y_pred = clf.predict(X_test_tfidf)

In [None]:
# evaluate performance

# Score the default predictions; precision, recall and F1 are micro-averaged
accuracy = accuracy_score(ytest, y_pred)
precision = precision_score(ytest, y_pred, average='micro')
recall = recall_score(ytest, y_pred, average='micro')
F1_score = f1_score(ytest, y_pred, average="micro")

# Report all four scores together
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {F1_score}')

Accuracy: 0.039
Precision: 0.7707182320441989
Recall: 0.11984536082474227
F1 Score: 0.20743494423791822


In [None]:
# Inspect the raw 0/1 indicator row predicted for test record 100
y_pred[100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
# Decode only the row of interest instead of the whole prediction matrix
multilabel_binarizer.inverse_transform(y_pred[100:101])[0]

()

## **3.0 Including Threshold**

Many of the default predictions contain 0 genres. Lowering the threshold to 0.2 (the value of `t` below) dramatically improved the F1 score and the practical outputs (every prediction now has at least one genre).

In [None]:
t = 0.2 # threshold value

# Per-genre probabilities from the fitted one-vs-rest classifier
y_pred_prob = clf.predict_proba(X_test_tfidf)

# Tag a genre whenever its probability reaches the threshold
y_pred_new = np.greater_equal(y_pred_prob, t).astype(int)

In [None]:
# evaluate performance

# Score the thresholded predictions; precision, recall and F1 are micro-averaged
accuracy = accuracy_score(ytest, y_pred_new)
precision = precision_score(ytest, y_pred_new, average='micro')
recall = recall_score(ytest, y_pred_new, average='micro')
F1_score = f1_score(ytest, y_pred_new, average="micro")

# Report all four scores together
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {F1_score}')

Accuracy: 0.041
Precision: 0.4312916534433513
Recall: 0.5837628865979382
F1 Score: 0.49607592626391667


In [None]:
# Inspect the thresholded 0/1 indicator row for the first test record
y_pred_new[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
# Decode only the first row instead of the whole prediction matrix
multilabel_binarizer.inverse_transform(y_pred_new[0:1])[0]

('Fiction', 'Speculative fiction')

In [None]:
decoded_labels = multilabel_binarizer.inverse_transform(y_pred_new)

# Tally the records whose decoded genre tuple came back empty
count_empty = sum(1 for labels in decoded_labels if len(labels) == 0)
print(count_empty)

0


# **Calculate Resulting Cosine Similarities**

### Cosine Similarities

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained BERT tokenizer and model from one shared checkpoint name
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Function to get the embedding of a single genre
def get_genre_embedding(genre):
    """Embed one genre string as a NumPy vector.

    The genre is tokenized, run through `model` with gradients disabled, and
    the last hidden state is averaged along dim 1 (presumably the token axis
    of a (batch, tokens, hidden) tensor; confirm against the model docs). The
    result is squeezed and converted to a NumPy array.
    """
    encoded = tokenizer(genre, return_tensors='pt')
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# De-duplicated genre names; built from a set, so the order of this list is arbitrary
all_genres = list(set(flattened_genres))

# Get embeddings for all unique genres
# NOTE(review): the disabled call below names `get_genre_embeddings`, which is not defined in
# the visible cells (the helper defined below is `get_genres_embeddings`); fix before re-enabling.
# genre_embeddings = get_genre_embeddings(all_genres)

def get_genres_embeddings(genres_list):
    """Embed every genre in `genres_list`, returning the vectors as a list in input order."""
    embeddings = []
    for genre in genres_list:
        embeddings.append(get_genre_embedding(genre))
    return embeddings


# inverse_transform returns a list of variable-length genre tuples. Pad them to a common
# length so they can be stacked into a 2-D array.
def listpadding(list_o_tuples):
    """Right-pad each row with '' to a common length and stack the rows into an array.

    Args:
        list_o_tuples: Iterable of genre rows (tuples or lists of strings),
            possibly of different lengths.

    Returns:
        A NumPy array with one row per input row, every row padded with empty
        strings up to the length of the longest row. An empty input returns an
        empty array of shape (0, 0) rather than raising.
    """
    # Normalise every row to a tuple so list rows can be padded as well.
    rows = [tuple(row) for row in list_o_tuples]

    # max() raises on an empty sequence, so handle "no rows" explicitly.
    if not rows:
        return np.empty((0, 0), dtype=str)

    # Determine the maximum row length
    max_len = max(len(row) for row in rows)

    # Pad the rows to have the same length
    padded_genres = [row + ('',) * (max_len - len(row)) for row in rows]

    # Convert to a 2-D numpy array
    return np.array(padded_genres)


#### Cosine Similarities w/ Actual Examples

In [None]:
# Compute cosine similarities for each pair of actual and predicted genres
def calculate_cosine_similarities(y_actual, y_pred):
    """Score each document by how similar its predicted genres are to its actual ones.

    Args:
        y_actual: Iterable with one collection of genre strings per document.
        y_pred: Iterable of predicted genre collections, paired positionally
            with `y_actual` (pairing stops at the shorter input).

    Returns:
        A list with one value per document: the mean of the pairwise cosine
        similarities between the actual and predicted genre embeddings, or 0.0
        when either side has no genres.

    Note:
        Each element must itself be a collection of genres; a bare string is
        iterated character by character.
    """
    # Materialise the pairs so they can be walked twice (embed, then score).
    documents = [(list(actual), list(pred)) for actual, pred in zip(y_actual, y_pred)]

    # Run the encoder once per distinct genre instead of once per occurrence;
    # popular genres recur across many documents.
    embedding_by_genre = {}
    for actual_genres, pred_genres in documents:
        for genre in actual_genres + pred_genres:
            if genre not in embedding_by_genre:
                embedding_by_genre[genre] = get_genre_embedding(genre)

    similarities = []
    for actual_genres, pred_genres in documents:
        actual_embeddings = [embedding_by_genre[genre] for genre in actual_genres]
        pred_embeddings = [embedding_by_genre[genre] for genre in pred_genres]

        # Calculate pairwise cosine similarity
        if actual_embeddings and pred_embeddings:
            similarity_matrix = cosine_similarity(actual_embeddings, pred_embeddings)
            avg_similarity = np.mean(similarity_matrix)
        else:
            # Nothing to compare on at least one side.
            avg_similarity = 0.0
        similarities.append(avg_similarity)
    return similarities

# Decode both indicator matrices back into per-document genre tuples once
actual_label_sets = multilabel_binarizer.inverse_transform(ytest)
predicted_label_sets = multilabel_binarizer.inverse_transform(y_pred_new)

# Padded 2-D versions of the decoded labels
ytest_list = listpadding(actual_label_sets)
ypred_list = listpadding(predicted_label_sets)

# Score only the first 51 test documents
cosine_similarities = calculate_cosine_similarities(actual_label_sets[0:51],
                                                  predicted_label_sets[0:51])

# Print the cosine similarities
for idx, similarity in enumerate(cosine_similarities):
    print(f"Document {idx + 1}: Cosine Similarity = {similarity:.4f}")

Document 1: Cosine Similarity = 0.7092
Document 2: Cosine Similarity = 0.8438
Document 3: Cosine Similarity = 0.8061
Document 4: Cosine Similarity = 0.8166
Document 5: Cosine Similarity = 0.7843
Document 6: Cosine Similarity = 0.9529
Document 7: Cosine Similarity = 0.9019
Document 8: Cosine Similarity = 0.7886
Document 9: Cosine Similarity = 0.7034
Document 10: Cosine Similarity = 0.8667
Document 11: Cosine Similarity = 0.8435
Document 12: Cosine Similarity = 0.8257
Document 13: Cosine Similarity = 0.8108
Document 14: Cosine Similarity = 0.8122
Document 15: Cosine Similarity = 0.7696
Document 16: Cosine Similarity = 0.6849
Document 17: Cosine Similarity = 0.8981
Document 18: Cosine Similarity = 0.8527
Document 19: Cosine Similarity = 0.7662
Document 20: Cosine Similarity = 0.9123
Document 21: Cosine Similarity = 0.9074
Document 22: Cosine Similarity = 0.8997
Document 23: Cosine Similarity = 0.7690
Document 24: Cosine Similarity = 0.6657
Document 25: Cosine Similarity = 0.7990
Document 

#### Average Cosine Similarities as an Output Metric

In [None]:
def calculate_cosine_similarities(y_actual, y_pred):
    """Return the mean genre-embedding similarity over all documents.

    This definition reuses the name of the per-document version defined
    earlier in the notebook and replaces it once this cell runs.

    Args:
        y_actual: Iterable with one collection of genre strings per document.
        y_pred: Iterable of predicted genre collections, paired positionally
            with `y_actual` (pairing stops at the shorter input).

    Returns:
        The mean over documents of each document's average pairwise cosine
        similarity between actual and predicted genre embeddings. A document
        with no genres on either side contributes 0.0.

    Note:
        Each element must itself be a collection of genres; a bare string is
        iterated character by character.
    """
    # Materialise the pairs so they can be walked twice (embed, then score).
    documents = [(list(actual), list(pred)) for actual, pred in zip(y_actual, y_pred)]

    # Run the encoder once per distinct genre instead of once per occurrence;
    # popular genres recur across many documents.
    embedding_by_genre = {}
    for actual_genres, pred_genres in documents:
        for genre in actual_genres + pred_genres:
            if genre not in embedding_by_genre:
                embedding_by_genre[genre] = get_genre_embedding(genre)

    all_similarities = []
    for actual_genres, pred_genres in documents:
        actual_embeddings = [embedding_by_genre[genre] for genre in actual_genres]
        pred_embeddings = [embedding_by_genre[genre] for genre in pred_genres]

        # Calculate pairwise cosine similarity
        if actual_embeddings and pred_embeddings:
            similarity_matrix = cosine_similarity(actual_embeddings, pred_embeddings)
            avg_similarity = np.mean(similarity_matrix)
        else:
            # Nothing to compare on at least one side.
            avg_similarity = 0.0
        all_similarities.append(avg_similarity)

    # Calculate and return the overall average similarity
    overall_avg_similarity = np.mean(all_similarities)
    return overall_avg_similarity

# Decode actual and predicted labels once, then average over the first 1000 test documents
actual_label_sets = multilabel_binarizer.inverse_transform(ytest)
predicted_label_sets = multilabel_binarizer.inverse_transform(y_pred_new)
overall_cosine_similarity = calculate_cosine_similarities(actual_label_sets[0:1000],
                                                  predicted_label_sets[0:1000])

# Print the overall cosine similarity
print(f"Overall Cosine Similarity = {overall_cosine_similarity:.4f}")

Overall Cosine Similarity = 0.8183


In [None]:
# Number of held-out test records; the [0:1000] slices above cover all of them
len(ytest)

1000

Although it's more computationally intensive, we'll use the pairwise cosine similarities. This measure captured perfect matches ("fiction" and "fiction"), while the null-padded cosine similarities didn't score these the highest.

In [None]:
# Spot-check test document 5: predicted genres, actual genres, then their similarity.
pred_genres_doc = multilabel_binarizer.inverse_transform(y_pred_new)[5]
actual_genres_doc = multilabel_binarizer.inverse_transform(ytest)[5]
print(pred_genres_doc)
print(actual_genres_doc)
# calculate_cosine_similarities expects one genre collection per document. Passing the bare
# tuples makes it zip individual genre strings together and embed each string character by
# character, so each tuple is wrapped in a one-document list.
calculate_cosine_similarities([actual_genres_doc], [pred_genres_doc])

('Fiction', 'Novel')
('Novel',)


0.84839195

In [None]:
# Spot-check test document 0: predicted genres, actual genres, then their similarity.
pred_genres_doc = multilabel_binarizer.inverse_transform(y_pred_new)[0]
actual_genres_doc = multilabel_binarizer.inverse_transform(ytest)[0]
print(pred_genres_doc)
print(actual_genres_doc)
# calculate_cosine_similarities expects one genre collection per document. Passing the bare
# tuples makes it zip individual genre strings together and embed each string character by
# character, so each tuple is wrapped in a one-document list.
calculate_cosine_similarities([actual_genres_doc], [pred_genres_doc])

('Fiction', 'Speculative fiction')
("Children's literature",)


0.8306795

In [None]:
# Predicted genres first, then the actual genres, for test document 30
for label_matrix in (y_pred_new, ytest):
    print(multilabel_binarizer.inverse_transform(label_matrix)[30])

('Fiction', 'Novel', 'Speculative fiction')
('Postmodernism',)


In [None]:
# Predicted genres first, then the actual genres, for test document 50
for label_matrix in (y_pred_new, ytest):
    print(multilabel_binarizer.inverse_transform(label_matrix)[50])

('Fiction', 'Novel', 'Speculative fiction')
('Biography', 'Drama', 'Historical romance')
