In [None]:
import numpy as np
import pandas as pd
import json
import tensorflow as tf
from tensorflow import keras
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive (Colab only) so the dataset path used in the next cell resolves.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the tab-separated book-summaries dump on the mounted Drive.
file_path = '/content/drive/My Drive/266/booksummaries.txt'

# The file is read with header=None, i.e. the first line is treated as data.
df = pd.read_csv(
    file_path,
    sep='\t',
    encoding='utf-8',
    header=None,
)

# Label the seven positional columns.
df.columns = ['ID', 'MID', 'Title', 'Author', 'Date', 'Genres', 'Summary']

# Function to parse JSON safely
def parse_json_safe(json_str):
    """Decode a JSON string, falling back to an empty dict.

    Args:
        json_str: the value to decode; anything that is not a ``str``
            (for example a missing value) is not decoded at all.

    Returns:
        Whatever ``json.loads`` produces for a valid JSON string, or ``{}``
        when the input is not a string or is not valid JSON.
    """
    # Guard first: non-string cells never reach the decoder.
    if not isinstance(json_str, str):
        return {}
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        # Malformed JSON is treated the same as a missing value.
        return {}

# Parse every raw 'Genres' cell with parse_json_safe (non-strings and invalid JSON become {})
df['Genres'] = df['Genres'].apply(parse_json_safe)

# Function to extract genres from the parsed JSON
def extract_genres(genre_dict):
    """Return the values of a parsed genre mapping as a list.

    Args:
        genre_dict: the parsed 'Genres' value.

    Returns:
        ``list(genre_dict.values())`` when the argument is a dict,
        otherwise an empty list.
    """
    return list(genre_dict.values()) if isinstance(genre_dict, dict) else []

# Turn each parsed 'Genres' value into a list of genre names (empty list when it is not a dict)
df['genre_list'] = df['Genres'].apply(extract_genres)

# 'Genres' is superseded by 'genre_list'; 'MID' is not referenced by any later visible cell
df = df.drop(columns = ['Genres', 'MID'])

# Preview the first four rows
df.head(4)


Unnamed: 0,ID,Title,Author,Date,Summary,genre_list
0,620,Animal Farm,George Orwell,1945-08-17,"Old Major, the old boar on the Manor Farm, ca...","[Roman à clef, Satire, Children's literature, ..."
1,843,A Clockwork Orange,Anthony Burgess,1962,"Alex, a teenager living in near-future Englan...","[Science Fiction, Novella, Speculative fiction..."
2,986,The Plague,Albert Camus,1947,The text of The Plague is divided into five p...,"[Existentialism, Fiction, Absurdist fiction, N..."
3,1756,An Enquiry Concerning Human Understanding,David Hume,,The argument of the Enquiry proceeds by a ser...,[]


In [None]:
# Exploratory Analysis: number of rows loaded
len(df)

16559

In [None]:
# Unique genres
from collections import Counter

# One flat list holding every genre tag of every book (duplicates included).
flattened_genres = [genre for sublist in df['genre_list'] for genre in sublist]

# Count in a single pass. The previous list.count() call per distinct genre
# rescanned the whole flattened list once for every genre.
genre_counts = dict(Counter(flattened_genres))

# Convert to a DataFrame
genre_counts_df = pd.DataFrame(list(genre_counts.items()), columns=['Genre', 'Count'])

# Sort the DataFrame by count, most frequent genre first
sorted_genre_counts_df = genre_counts_df.sort_values(by='Count', ascending=False).reset_index(drop=True)

# Print the sorted DataFrame
print(sorted_genre_counts_df)

                   Genre  Count
0                Fiction   4747
1    Speculative fiction   4314
2        Science Fiction   2870
3                  Novel   2463
4                Fantasy   2413
..                   ...    ...
222           Conspiracy      1
223          Pornography      1
224        Space western      1
225      Literary theory      1
226     Bangsian fantasy      1

[227 rows x 2 columns]


In [None]:
# Rich display of the same sorted genre-count table that the previous cell printed as text
sorted_genre_counts_df

Unnamed: 0,Genre,Count
0,Fiction,4747
1,Speculative fiction,4314
2,Science Fiction,2870
3,Novel,2463
4,Fantasy,2413
...,...,...
222,Conspiracy,1
223,Pornography,1
224,Space western,1
225,Literary theory,1


In [None]:
# Keep only rows whose genre list is non-empty...
df = df[df['genre_list'].str.len() != 0]
# ...and whose summary is not the empty string.
df = df[df['Summary'].str.len() != 0]

In [None]:
# Rows remaining after dropping untagged / empty-summary records
len(df)

12841

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
pd.set_option('display.max_colwidth', 300)

#### 227 distinct genres, with a long tail.

### Sample 5,000 records so that the 20% test split predicts on 1,000.

In [None]:
# Fixed seed so the same subset is drawn on every run
random_seed = 42

# Down-sample to 5000 records
df = df.sample(n=5000, random_state=random_seed)

## Bag of Words Baseline

In [None]:
import nltk
import re
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


In [None]:
# function for text cleaning
def clean_text(text):
    """Normalise a summary to lowercase ASCII letters separated by single spaces.

    Steps, in order:
      1. delete apostrophes outright (so "couldn't" becomes "couldnt");
      2. replace every character outside a-z / A-Z with a space;
      3. collapse runs of whitespace and trim both ends;
      4. lowercase the result.

    Args:
        text: the raw summary string.

    Returns:
        The cleaned string.
    """
    apostrophes_removed = re.sub("'", "", text)
    letters_and_spaces = re.sub(r"[^a-zA-Z]", " ", apostrophes_removed)
    single_spaced = " ".join(letters_and_spaces.split())
    return single_spaced.lower()

In [None]:
# Cleaned copy of every summary; the raw 'Summary' column is left untouched
df['clean_summary'] = df['Summary'].apply(clean_text)

In [None]:
# Eyeball three random raw/cleaned pairs; no random_state is passed, so the rows differ on each run
df[['Summary', 'clean_summary']].sample(3)

Unnamed: 0,Summary,clean_summary
9657,"Prewytt Brumblydge, inventor of the Brumblitron, must be found in order to disable the device before it destroys the Earth. This is a job for Mr. Bass, but he has disappeared... so the boys pore over his notebook for clues and go spacefaring to find Brumblydge. This time, instead of journeying ...",prewytt brumblydge inventor of the brumblitron must be found in order to disable the device before it destroys the earth this is a job for mr bass but he has disappeared so the boys pore over his notebook for clues and go spacefaring to find brumblydge this time instead of journeying to basidium...
6028,"Tilo, the titular ""Mistress of Spices,"" is a shopkeeper and an immigrant from India who helps customers satisfy their needs and desires with spices. Her life changes when she falls in love with an American man. When she was a little girl she could see what other people couldn't see.",tilo the titular mistress of spices is a shopkeeper and an immigrant from india who helps customers satisfy their needs and desires with spices her life changes when she falls in love with an american man when she was a little girl she could see what other people couldnt see
4678,"The story begins with Bertie finding himself with pink spots about the thorax, so he flies off to E. Jimpson Murgatroyd (the sombre bird of Harley Street who despotted Tipton Plimsoll in Full Moon). After getting mixed with Orlo Porter fleeing from a policeman and a crowd, Bertie is sentenced b...",the story begins with bertie finding himself with pink spots about the thorax so he flies off to e jimpson murgatroyd the sombre bird of harley street who despotted tipton plimsoll in full moon after getting mixed with orlo porter fleeing from a policeman and a crowd bertie is sentenced by the d...


In [None]:
# remove samples with 0 genre tags
# (same condition as the filter applied before sampling, repeated here)
df = df[df['genre_list'].str.len() != 0]

## 1.1 Simple Bag-of-Words Baseline

## 1.2 Drop stopwords, which don't contribute to signal

In [None]:
# Fetch NLTK's stopword corpus; it is read via stopwords.words('english') in the next cell
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
# Held as a set so the per-word membership test in remove_stopwords is a hash lookup
stop_words = set(stopwords.words('english'))

# function to remove stopwords
def remove_stopwords(text):
    """Drop every whitespace-separated word that appears in ``stop_words``.

    Args:
        text: a string; it is split on whitespace.

    Returns:
        The remaining words joined by single spaces, in their original order.
    """
    kept_words = (word for word in text.split() if word not in stop_words)
    return ' '.join(kept_words)

# Overwrite the cleaned summaries with their stopword-free version
df['clean_summary'] = df['clean_summary'].apply(remove_stopwords)

### Converting Text to Features

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the genre label set from the sampled data (fit returns the fitted binarizer itself)
multilabel_binarizer = MultiLabelBinarizer().fit(df['genre_list'])

# transform target variable: one indicator row per book
y = multilabel_binarizer.transform(df['genre_list'])

In [None]:
# Hold out 20% of the records for evaluation; the fixed random_state keeps the split reproducible
xtrain, xtest, ytrain, ytest = train_test_split(
    df['clean_summary'],
    y,
    test_size=0.2,
    random_state=9,
)

In [None]:
# create TF-IDF features
# The vectorizer is fitted on the training summaries only and then reused, unchanged,
# to transform the held-out summaries.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(xtrain)
X_test_tfidf = vectorizer.transform(xtest)

In [None]:
from sklearn.linear_model import LogisticRegression

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier

# Performance metric
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Binary relevance: OneVsRestClassifier wraps the base logistic regression `lr`
# so it can be fitted against the multi-label genre indicator matrix.
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

In [None]:
# fit model on train data: TF-IDF features of the plot summaries (X) and genre indicator vectors (Y)
clf.fit(X_train_tfidf, ytrain)



In [None]:
# make predictions for the held-out split (the TF-IDF features of xtest)
y_pred = clf.predict(X_test_tfidf)

In [None]:
# evaluate performance

# Calculate accuracy
# NOTE: for multi-label indicator input, accuracy_score is subset accuracy (a record only
# counts when every one of its labels matches), so it sits far below the micro-averaged scores.
accuracy = accuracy_score(ytest, y_pred)
print(f'Accuracy: {accuracy}')

# Calculate precision (micro-averaged over all record/label decisions)
precision = precision_score(ytest, y_pred, average='micro')
print(f'Precision: {precision}')

# Calculate recall (micro-averaged)
recall = recall_score(ytest, y_pred, average='micro')
print(f'Recall: {recall}')

# f1 score
# Stored as `f1`, not `f1_score`: rebinding the imported function's name to a float
# made this cell fail with "object is not callable" when it was run a second time.
f1 = f1_score(ytest, y_pred, average="micro")
print(f'F1 Score: {f1}')

Accuracy: 0.039
Precision: 0.7707182320441989
Recall: 0.11984536082474227
F1 Score: 0.20743494423791822


In [None]:
# Raw predicted indicator row for held-out record 100
y_pred[100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
# The same row decoded back to genre names (an empty tuple means no genre was predicted)
multilabel_binarizer.inverse_transform(y_pred)[100]

()

In [None]:
# Predicted genre names for every held-out record, one tuple per record
decoded_labels = multilabel_binarizer.inverse_transform(y_pred)

# Number of records for which the classifier predicted no genre at all
count_empty = sum(1 for labels in decoded_labels if len(labels) == 0)
print(count_empty)

749


### Cosine Similarities

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer ('bert-base-uncased' checkpoint);
# both are module-level objects used by get_genre_embedding below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Function to get the embedding of a single genre name
def get_genre_embedding(genre):
    """Embed one genre string with the module-level BERT ``tokenizer`` and ``model``.

    Args:
        genre: the genre name to embed.

    Returns:
        A NumPy array: the model's last hidden state averaged over dim=1 and
        squeezed. Presumably dim=1 is the token axis, giving one vector per
        genre; confirm against the model's output shape.
    """
    encoded = tokenizer(genre, return_tensors='pt')
    # Inference only: no gradients are tracked.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Distinct genre names, taken from flattened_genres (which was built before the
# empty-genre filter and the 5,000-row sample were applied to df).
all_genres = list(set(flattened_genres))

# NOTE(review): all_genres is not referenced again in the visible cells; embeddings
# are computed per document via get_genres_embeddings rather than once for this list.

def get_genres_embeddings(genres_list):
    """Embed every genre in ``genres_list`` with ``get_genre_embedding``.

    Args:
        genres_list: an iterable of genre names.

    Returns:
        A list with one embedding per input genre, in input order
        (an empty list for empty input).
    """
    return list(map(get_genre_embedding, genres_list))


# inverse_transform returns a list of tuples of differing lengths; pad them so they
# form a rectangular 2-D array.
def listpadding(list_o_tuples):
    """Pad variable-length label tuples with '' and return them as a 2-D array.

    Args:
        list_o_tuples: iterable of label sequences (tuples or lists). It is
            materialised once, so a generator is accepted too.

    Returns:
        ``np.ndarray`` with one row per input sequence and as many columns as
        the longest sequence; shorter rows are right-padded with ``''``.
        Empty input yields an empty array of shape ``(0, 0)`` instead of the
        ``ValueError`` that ``max()`` raises on an empty sequence.
    """
    # Materialise once: the rows are traversed twice (for the width, then to pad).
    rows = [tuple(labels) for labels in list_o_tuples]
    if not rows:
        return np.empty((0, 0), dtype=str)

    # Determine the maximum tuple length
    max_len = max(len(row) for row in rows)

    # Pad the tuples to have the same length
    padded_genres = [row + ('',) * (max_len - len(row)) for row in rows]

    # Convert to a 2-D numpy array
    return np.array(padded_genres)


#### Cosine Similarities w/ Actual Examples

In [None]:
# Compute cosine similarities for each pair of actual and predicted genres
def calculate_cosine_similarities(y_actual, y_pred):
    """Score each document by the mean pairwise similarity of its genre embeddings.

    This per-document version is shadowed by a later definition with the same
    name that returns a single overall average.

    Args:
        y_actual: sequence of per-document actual genre-name sequences.
        y_pred: sequence of per-document predicted genre-name sequences,
            paired positionally with ``y_actual``.

    Returns:
        A list with one score per document pair: the mean of the
        actual-by-predicted cosine-similarity matrix, or 0.0 when either
        side has no genres.
    """
    def _document_score(actual_genres, pred_genres):
        actual_vectors = get_genres_embeddings(actual_genres)
        pred_vectors = get_genres_embeddings(pred_genres)
        # Nothing to compare when either side is empty.
        if not (actual_vectors and pred_vectors):
            return 0.0
        return np.mean(cosine_similarity(actual_vectors, pred_vectors))

    return [
        _document_score(actual_genres, pred_genres)
        for actual_genres, pred_genres in zip(y_actual, y_pred)
    ]

# Padded 2-D views of the actual and predicted genre names
# (not referenced again in the visible cells)
ytest_list = listpadding(multilabel_binarizer.inverse_transform(ytest))
ypred_list = listpadding(multilabel_binarizer.inverse_transform(y_pred))

# Per-document similarity for the first 51 held-out records only
cosine_similarities = calculate_cosine_similarities(
    multilabel_binarizer.inverse_transform(ytest)[:51],
    multilabel_binarizer.inverse_transform(y_pred)[:51],
)

# Print the cosine similarities
for idx, similarity in enumerate(cosine_similarities):
    print(f"Document {idx + 1}: Cosine Similarity = {similarity:.4f}")

Document 1: Cosine Similarity = 0.0000
Document 2: Cosine Similarity = 0.9301
Document 3: Cosine Similarity = 0.0000
Document 4: Cosine Similarity = 0.0000
Document 5: Cosine Similarity = 0.0000
Document 6: Cosine Similarity = 0.0000
Document 7: Cosine Similarity = 0.0000
Document 8: Cosine Similarity = 0.0000
Document 9: Cosine Similarity = 0.0000
Document 10: Cosine Similarity = 0.0000
Document 11: Cosine Similarity = 0.0000
Document 12: Cosine Similarity = 0.0000
Document 13: Cosine Similarity = 0.8313
Document 14: Cosine Similarity = 0.8529
Document 15: Cosine Similarity = 0.0000
Document 16: Cosine Similarity = 0.0000
Document 17: Cosine Similarity = 0.0000
Document 18: Cosine Similarity = 0.0000
Document 19: Cosine Similarity = 0.0000
Document 20: Cosine Similarity = 0.0000
Document 21: Cosine Similarity = 0.9301
Document 22: Cosine Similarity = 0.0000
Document 23: Cosine Similarity = 0.0000
Document 24: Cosine Similarity = 0.0000
Document 25: Cosine Similarity = 0.7847
Document 

#### Average Cosine Similarities as an Output Metric

In [None]:
def calculate_cosine_similarities(y_actual, y_pred):
    """Return the mean, over documents, of the average pairwise genre similarity.

    This redefinition replaces the earlier function of the same name, which
    returned the per-document list rather than a single number.

    Args:
        y_actual: sequence of per-document actual genre-name sequences (for
            example the tuples from ``multilabel_binarizer.inverse_transform``).
        y_pred: sequence of per-document predicted genre-name sequences,
            paired positionally with ``y_actual``.

    Returns:
        The mean of the per-document scores. A document scores 0.0 when either
        its actual or its predicted genre sequence is empty. When there are no
        document pairs at all the result is 0.0 (previously ``nan`` together
        with a NumPy RuntimeWarning).
    """
    # Embed each distinct genre string once per call; the same genre names recur
    # across many documents. Assumes get_genre_embedding is deterministic for a
    # given string (TODO confirm the model is in eval mode).
    embedding_cache = {}

    def _embed_all(genres):
        vectors = []
        for genre in genres:
            if genre not in embedding_cache:
                embedding_cache[genre] = get_genre_embedding(genre)
            vectors.append(embedding_cache[genre])
        return vectors

    all_similarities = []
    for actual_genres, pred_genres in zip(y_actual, y_pred):
        actual_embeddings = _embed_all(actual_genres)
        pred_embeddings = _embed_all(pred_genres)

        # Calculate pairwise cosine similarity
        if actual_embeddings and pred_embeddings:
            similarity_matrix = cosine_similarity(actual_embeddings, pred_embeddings)
            avg_similarity = np.mean(similarity_matrix)
        else:
            avg_similarity = 0.0
        all_similarities.append(avg_similarity)

    # np.mean([]) is nan and emits a warning, so handle "no documents" explicitly.
    if not all_similarities:
        return 0.0

    # Calculate and return the overall average similarity
    overall_avg_similarity = np.mean(all_similarities)
    return overall_avg_similarity

# Average similarity across every held-out record
overall_cosine_similarity = calculate_cosine_similarities(
    multilabel_binarizer.inverse_transform(ytest),
    multilabel_binarizer.inverse_transform(y_pred),
)

# Print the overall cosine similarity
print(f"Overall Cosine Similarity = {overall_cosine_similarity:.4f}")

Overall Cosine Similarity = 0.2161


Although it's more computationally intensive, we'll use the pairwise cosine similarities. This measure captures perfect matches ("fiction" and "fiction"), whereas the null-padded cosine similarities didn't score these the highest.

In [None]:
# Spot-check record 430: predicted genres, actual genres, then its similarity score.
print(multilabel_binarizer.inverse_transform(y_pred)[430])
print(multilabel_binarizer.inverse_transform(ytest)[430])
# Slice ([430:431]) rather than index ([430]): calculate_cosine_similarities expects a
# sequence of per-document genre tuples. Passing one document's tuple made it zip over
# individual genre strings, which produced nan when the prediction was empty.
calculate_cosine_similarities(multilabel_binarizer.inverse_transform(ytest)[430:431],
                              multilabel_binarizer.inverse_transform(y_pred)[430:431])

()
("Children's literature", 'Historical fiction', 'Historical novel')


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


nan

In [None]:
# Spot-check record 48: predicted genres first, then the actual genres
print(multilabel_binarizer.inverse_transform(y_pred)[48])
print(multilabel_binarizer.inverse_transform(ytest)[48])

()
('Novel',)


In [None]:
# Spot-check record 50: predicted genres first, then the actual genres
print(multilabel_binarizer.inverse_transform(y_pred)[50])
print(multilabel_binarizer.inverse_transform(ytest)[50])

()
('Biography', 'Drama', 'Historical romance')
