# Book Review Model

## Setup

In [None]:
import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination
import numpy as np
from sklearn.model_selection import train_test_split

# Compatibility shim (suggested by a GenAI assistant while troubleshooting a
# NumPy issue): alias np.product to np.prod when this NumPy build lacks it.
# Presumably a dependency still calls np.product -- TODO confirm which one.
if not hasattr(np, 'product'):
    setattr(np, 'product', np.prod)

# Load the per-review ratings and the per-book metadata.
ratings_df = pd.read_csv('Books_rating.csv')
data_df = pd.read_csv('books_data.csv')

# Join the two tables on the book title, then keep a seeded 1000-row sample
# so the rest of the notebook runs on a fixed subset.
df = ratings_df.merge(data_df, on='Title').sample(n=1000, random_state=42)

print(list(df.columns))

['Id', 'Title', 'Price', 'User_id', 'profileName', 'review/helpfulness', 'review/score', 'review/time', 'review/summary', 'review/text', 'description', 'authors', 'image', 'previewLink', 'publisher', 'publishedDate', 'infoLink', 'categories', 'ratingsCount']


In [94]:
# Sanity check: (rows, columns) of the sampled, merged frame.
print(df.shape)

(1000, 19)


## Column processing

In [95]:
# Names of the six discretised columns derived in this cell and the next ones;
# rows missing any of them are dropped before modelling.
cols = ['Book_Popularity', 'Rating_Prediction', 'Review_Length', 'Sentiment_Score', 'Time_Factor', 'Review_Helpfulness']

# Book popularity: split ratingsCount (missing counts treated as 0) at the
# median into two quantile bins.
df['Book_Popularity'] = pd.qcut(df['ratingsCount'].fillna(0), q=2, labels=['Low', 'High'], duplicates='drop')
df['Book_Popularity'] = pd.Categorical(df['Book_Popularity'], categories=['Low', 'High'])

# Target: the star rating as a string category '1'..'5'.
df['Rating_Prediction'] = df['review/score'].astype(int).astype(str)
df['Rating_Prediction'] = pd.Categorical(df['Rating_Prediction'], categories=['1','2','3','4','5'])

# Review length in whitespace-separated words: (0, 50] Short, (50, 150] Medium,
# above 150 Long. Missing or zero-word reviews fall outside the bins -> NaN.
df['Review_Length'] = pd.cut(df['review/text'].str.split().str.len(), bins=[0, 50, 150, np.inf], labels=['Short', 'Medium', 'Long'])


def _sentiment_label(text):
    """Return a coarse keyword-based sentiment label for lower-cased review text.

    'Positive' if the text contains 'good' or 'great', otherwise 'Negative' if
    it contains 'bad', otherwise 'Neutral'. Matching is by substring.
    """
    if 'good' in text or 'great' in text:
        return 'Positive'
    if 'bad' in text:
        return 'Negative'
    return 'Neutral'


# na_action='ignore' leaves missing review text as NaN instead of calling the
# labeller on a float NaN (which raises TypeError on the `in` test). Such rows
# are removed later together with every other row missing one of `cols`.
df['Sentiment_Score'] = df['review/text'].str.lower().map(_sentiment_label, na_action='ignore')

# Review age: convert the epoch-seconds timestamp, then bin the age in days
# into (0, 365] Recent and above 365 Old.
# NOTE(review): the reference date is "today", so the binning depends on when
# the notebook is run -- consider pinning a fixed reference date.
df['review/time'] = pd.to_datetime(df['review/time'], unit='s', errors='coerce')
df['Time_Factor'] = pd.cut((pd.Timestamp.today() - df['review/time']).dt.days, bins=[0, 365, np.inf], labels=['Recent', 'Old'])

def helpful_parser(x):
    """Classify a 'helpful/total' vote string as 'Helpful' or 'Not_Helpful'.

    Parameters
    ----------
    x : object
        Expected to look like '3/5' (two integers separated by '/'). Any other
        value is converted with str() and treated as unparseable if it does
        not match that shape.

    Returns
    -------
    str
        'Helpful' when the first number is greater than zero; 'Not_Helpful'
        when it is zero or negative, or when the value cannot be parsed.
    """
    try:
        # Exactly two integer parts are required; the second one is unused.
        num, _den = map(int, str(x).split('/'))
    except (ValueError, TypeError):
        # Only parsing failures are treated as "no helpful votes"; anything
        # else (e.g. KeyboardInterrupt) propagates instead of being swallowed.
        return 'Not_Helpful'
    return 'Helpful' if num > 0 else 'Not_Helpful'

# Label every review as 'Helpful' / 'Not_Helpful' from its vote string.
df['Review_Helpfulness'] = df['review/helpfulness'].map(helpful_parser)

# Keep only rows where all derived feature columns are present.
df = df.dropna(subset=cols)

In [96]:
# Hold out 10% of the rows as test_df (seeded split). `df` is rebound to the
# remaining 90% and is the training set from here on; re-running this cell on
# its own would split the already-reduced `df` again.
df, test_df = train_test_split(df, test_size=0.1, random_state=42)

In [98]:
# Shapes after the split: held-out test rows first, then the training rows
# (which now live in `df`).
print(test_df.shape)
print(df.shape)

(100, 25)
(900, 25)


In [99]:
# Hand-specified structure of the network as (parent, child) pairs.
observed_edges = [
    ('Book_Popularity', 'Review_Helpfulness'),
    ('Review_Helpfulness', 'Rating_Prediction'),
    ('Review_Length', 'Rating_Prediction'),
    ('Sentiment_Score', 'Review_Helpfulness'),
    ('Time_Factor', 'Sentiment_Score'),
]
observed_model = BayesianNetwork(observed_edges)

# Every variable that appears at either end of an edge, without duplicates.
observed_cols = list({node for edge in observed_edges for node in edge})

# Estimate the CPDs from the complete rows of exactly those columns.
df_model = df.loc[:, observed_cols].dropna()
observed_model.fit(df_model, estimator=MaximumLikelihoodEstimator)

  data.groupby([variable] + parents).size().unstack(parents)
  data.groupby([variable] + parents).size().unstack(parents)


In [101]:
# Summarise the fitted network: its nodes, its edges, then every CPD table.
print("Model nodes: " + str(observed_model.nodes()))
print("Model edges: " + str(observed_model.edges()))

print("\nCPTs:")
for table in observed_model.get_cpds():
    print(table)

Model nodes: ['Book_Popularity', 'Review_Helpfulness', 'Rating_Prediction', 'Review_Length', 'Sentiment_Score', 'Time_Factor']
Model edges: [('Book_Popularity', 'Review_Helpfulness'), ('Review_Helpfulness', 'Rating_Prediction'), ('Review_Length', 'Rating_Prediction'), ('Sentiment_Score', 'Review_Helpfulness'), ('Time_Factor', 'Sentiment_Score')]

CPTs:
+-----------------------+----------+
| Book_Popularity(High) | 0.484444 |
+-----------------------+----------+
| Book_Popularity(Low)  | 0.515556 |
+-----------------------+----------+
+---------------------------------+-----+---------------------------+
| Book_Popularity                 | ... | Book_Popularity(Low)      |
+---------------------------------+-----+---------------------------+
| Sentiment_Score                 | ... | Sentiment_Score(Positive) |
+---------------------------------+-----+---------------------------+
| Review_Helpfulness(Helpful)     | ... | 0.7035175879396985        |
+---------------------------------+-----

In [None]:
# Wrap the fitted model in a variable-elimination inference engine.
inference = VariableElimination(observed_model)

# Query the rating distribution with no evidence at all.
marginal_rating = inference.query(variables=['Rating_Prediction'])
print("No Evidence:", marginal_rating, sep="\n")


# Spot check: fix all five observed features to one hand-picked combination.
print("Test:")
sample_evidence = dict(
    Book_Popularity='Low',
    Review_Length='Short',
    Time_Factor='Old',
    Sentiment_Score='Negative',
    Review_Helpfulness='Not_Helpful',
)
test = inference.query(variables=['Rating_Prediction'], evidence=sample_evidence)
print(test)
# In the recorded run, most of the probability mass lands on 4 and 5 stars.

Finding Elimination Order: : 100%|██████████| 5/5 [00:00<00:00, 6551.55it/s]
Eliminating: Time_Factor: 100%|██████████| 5/5 [00:00<00:00, 1332.96it/s]


No Evidence:
+----------------------+--------------------------+
| Rating_Prediction    |   phi(Rating_Prediction) |
| Rating_Prediction(1) |                   0.0733 |
+----------------------+--------------------------+
| Rating_Prediction(2) |                   0.0572 |
+----------------------+--------------------------+
| Rating_Prediction(3) |                   0.0807 |
+----------------------+--------------------------+
| Rating_Prediction(4) |                   0.2149 |
+----------------------+--------------------------+
| Rating_Prediction(5) |                   0.5739 |
+----------------------+--------------------------+
Test:


Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]

+----------------------+--------------------------+
| Rating_Prediction    |   phi(Rating_Prediction) |
| Rating_Prediction(1) |                   0.0483 |
+----------------------+--------------------------+
| Rating_Prediction(2) |                   0.0276 |
+----------------------+--------------------------+
| Rating_Prediction(3) |                   0.0552 |
+----------------------+--------------------------+
| Rating_Prediction(4) |                   0.2690 |
+----------------------+--------------------------+
| Rating_Prediction(5) |                   0.6000 |
+----------------------+--------------------------+





In [None]:
# Score the model on the held-out rows: condition on the five observed
# features of each review and predict its most probable star rating.
evidence_cols = [
    'Book_Popularity',
    'Review_Length',
    'Sentiment_Score',
    'Time_Factor',
    'Review_Helpfulness',
]

correct = 0
total = 0
skipped = 0          # rows that could not be scored
first_error = None   # kept so failures are reported instead of vanishing

for _, row in test_df.iterrows():
    evidence = {col: row[col] for col in evidence_cols}
    try:
        # show_progress=False silences the per-query progress bars that
        # otherwise flood the cell output.
        query_result = inference.query(
            variables=['Rating_Prediction'],
            evidence=evidence,
            show_progress=False,
        )
        # Translate the argmax position back to a rating through the factor's
        # own state names; `argmax() + 1` is only right when all five ratings
        # are present, in order, in the fitted model.
        states = query_result.state_names['Rating_Prediction']
        predicted = int(states[query_result.values.argmax()])
        actual = int(row['Rating_Prediction'])
    except Exception as err:
        # Best effort: a row the model cannot score (e.g. an evidence value
        # it has never seen) is skipped, but counted and reported below.
        skipped += 1
        if first_error is None:
            first_error = err
        continue

    if predicted == actual:
        correct += 1
    total += 1

# Guard against an empty denominator when every row was skipped.
accuracy = correct / total if total else float('nan')
print(f"Accuracy on test set: {accuracy:.2%}")
if skipped:
    print(f"Skipped {skipped} of {skipped + total} test rows (first error: {first_error!r})")

Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Finding Elimination Order: : : 0it

Accuracy on test set: 64.00%





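**Result:** the model predicts the correct star rating for 64% of the held-out test reviews. For context, the no-evidence marginal above already assigns about 57% probability to a 5-star rating, so much of this accuracy may simply reflect the skew toward 5-star reviews rather than information gained from the evidence variables.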