In [14]:
# #!/bin/bash
# !curl -L -o ~/datasets/amazon-books-reviews.zip\
#   https://www.kaggle.com/api/v1/datasets/download/mohamedbakhet/amazon-books-reviews

# !pip install pandas 
# !which python

# Discover book data

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix, save_npz
from sklearn.preprocessing import LabelEncoder
import pickle
import pandas as pd
import os

In [14]:


# Use raw string (r) or forward slashes for Windows paths
dir_path = r"C:\Users\Admin\Documents\GitHub\Two-stages-recommendation-system\datasets\extracted"


# Construct full file path safely
file_path_rating = os.path.join(dir_path, 'Books_rating.csv')

# Load the ratings CSV and print a quick overview of it.
# Failures are reported with print() rather than raised, so `df` is left
# unset (or stale from an earlier run) when loading does not succeed.
try:
    df = pd.read_csv(file_path_rating)
    print("Data loaded successfully!")

    # Basic info. DataFrame.info() writes its report to stdout itself and
    # returns None, so wrapping it in print() emitted a stray "None" line.
    print("\nData Info:")
    df.info()

    # Show first 3 rows
    print("\nFirst 3 rows:")
    print(df.head(3))

    # Basic statistics (include='all' also summarises the object columns)
    print("\nBasic Statistics:")
    print(df.describe(include='all'))

except FileNotFoundError:
    print(f"Error: File not found at {file_path_rating}")
except Exception as e:
    print(f"An error occurred: {str(e)}")

Data loaded successfully!

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 10 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Id                  object 
 1   Title               object 
 2   Price               float64
 3   User_id             object 
 4   profileName         object 
 5   review/helpfulness  object 
 6   review/score        float64
 7   review/time         int64  
 8   review/summary      object 
 9   review/text         object 
dtypes: float64(2), int64(1), object(7)
memory usage: 228.9+ MB
None

First 3 rows:
           Id                           Title  Price         User_id  \
0  1882931173  Its Only Art If Its Well Hung!    NaN   AVCGYZL8FQQTD   
1  0826414346        Dr. Seuss: American Icon    NaN  A30TK6U7DNS82R   
2  0826414346        Dr. Seuss: American Icon    NaN  A3UH4UZ4RSVO82   

             profileName review/helpfulness  review/score  review/time  \
0  Jim

# Preprocess for candidate_generation stage

#### 1. Keep Only Required Columns

In [10]:
# Keep only the (user, item, rating) triple needed for candidate generation.
# The explicit .copy() makes the result an independent frame, so columns
# assigned to it later do not trigger pandas' SettingWithCopyWarning.
# NOTE(review): this rebinds `df`, dropping every other column of the frame
# it was selected from for the rest of the session.
df = df[['User_id', 'Id', 'review/score']].copy()
df.head()


Unnamed: 0,User_id,Id,review/score
0,AVCGYZL8FQQTD,1882931173,4.0
1,A30TK6U7DNS82R,826414346,5.0
2,A3UH4UZ4RSVO82,826414346,5.0
3,A2MVUWT453QH61,826414346,4.0
4,A22X4XUPKF66MR,826414346,4.0


#### Split training/testing dataset

In [17]:

# Split at the user level: every interaction of a given user lands in exactly
# one of the two partitions.
unique_users = df['User_id'].unique()
train_users, test_users = train_test_split(
    unique_users,
    test_size=0.2,
    random_state=42,
)

# Row masks marking which interactions belong to which partition.
in_train = df['User_id'].isin(train_users)
in_test = df['User_id'].isin(test_users)
train_df = df.loc[in_train].reset_index(drop=True)
test_df = df.loc[in_test].reset_index(drop=True)

# Summary: user counts, interaction counts, then the number of distinct
# item ids in train, test and the full frame (in that order).
print(f"✅ Number of unique users: {len(unique_users)}")
print(f"✅ Train users: {len(train_users)}, interactions: {len(train_df)}")
print(f"✅ Test users: {len(test_users)}, interactions: {len(test_df)}")
for frame in (train_df, test_df, df):
    print(len(frame["Id"].unique()))




✅ Number of unique users: 1008973
✅ Train users: 807178, interactions: 2515374
✅ Test users: 201795, interactions: 484626
208140
106678
221998


#### Build sparse matrix

In [18]:
def BuildSparseMatrix(df):
    """Build a user x item rating matrix in CSR format from an interaction frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'User_id', 'Id' and 'review/score'.
        As a side effect (kept for backward compatibility) the integer
        columns 'user_idx' and 'item_idx' are written onto ``df``.

    Returns
    -------
    tuple
        ``(sparse_matrix, user_encoder, item_encoder)`` where the encoders are
        the fitted LabelEncoder objects mapping raw ids to row / column indices.

    Notes
    -----
    Building a CSR matrix from ``(data, (row, col))`` sums entries that share
    the same coordinates, so a user who reviewed the same item several times
    used to end up with the *sum* of those scores. Duplicate pairs are now
    collapsed to their mean score before the matrix is built.
    """
    #  Encode user and item IDs
    user_encoder = LabelEncoder()
    item_encoder = LabelEncoder()
    df['user_idx'] = user_encoder.fit_transform(df['User_id'])
    df['item_idx'] = item_encoder.fit_transform(df['Id'])

    # Collapse repeated (user, item) pairs to one averaged score. Grouping is
    # done on the encoded integer indices, which never contain missing values.
    pairs = pd.DataFrame({
        'user_idx': df['user_idx'].to_numpy(),
        'item_idx': df['item_idx'].to_numpy(),
        'score': df['review/score'].to_numpy(),
    })
    pairs = pairs.groupby(['user_idx', 'item_idx'], sort=False, as_index=False)['score'].mean()

    # Build sparse matrix; the explicit shape ties the dimensions to the
    # encoders instead of inferring them from the largest index seen.
    sparse_matrix = csr_matrix(
        (
            pairs['score'].to_numpy(),
            (pairs['user_idx'].to_numpy(), pairs['item_idx'].to_numpy()),
        ),
        shape=(len(user_encoder.classes_), len(item_encoder.classes_)),
    )

    print(f"Sparse matrix shape: {sparse_matrix.shape}")
    print(f"Non-zero entries: {sparse_matrix.nnz}")

    return sparse_matrix, user_encoder, item_encoder

In [19]:
def save_sparse_matrix(sparse_matrix, user_encoder, item_encoder, save_name, output_dir='../datasets/processed'):
    """Pickle a matrix and its two encoders into a single file.

    The three objects are stored in one dict under the keys 'matrix',
    'user_encoder' and 'item_encoder', written to ``output_dir/save_name``.
    ``output_dir`` is created first if it does not exist yet.
    """
    os.makedirs(output_dir, exist_ok=True)

    target_path = os.path.join(output_dir, save_name)
    bundle = {
        'matrix': sparse_matrix,
        'user_encoder': user_encoder,
        'item_encoder': item_encoder,
    }
    with open(target_path, 'wb') as handle:
        pickle.dump(bundle, handle)

    print("Sparse matrix and encoders saved.")

# Build and persist one bundle per partition: train, test, then the full frame.
# Each BuildSparseMatrix call fits fresh encoders, so the row/column indices
# are specific to the bundle they were saved with.
train_artifacts = BuildSparseMatrix(train_df)
sparse_matrix_train, user_encoder_train, item_encoder_train = train_artifacts
save_sparse_matrix(*train_artifacts, 'sparse_matrix_train.pkl')

test_artifacts = BuildSparseMatrix(test_df)
sparse_matrix_test, user_encoder_test, item_encoder_test = test_artifacts
save_sparse_matrix(*test_artifacts, 'sparse_matrix_test.pkl')

full_artifacts = BuildSparseMatrix(df)
sparse_matrix, user_encoder, item_encoder = full_artifacts
save_sparse_matrix(*full_artifacts, 'sparse_matrix.pkl')

Sparse matrix shape: (807178, 208140)
Non-zero entries: 2001083
Sparse matrix and encoders saved.
Sparse matrix shape: (201795, 106678)
Non-zero entries: 477350
Sparse matrix and encoders saved.
Sparse matrix shape: (1008973, 221998)
Non-zero entries: 2478433
Sparse matrix and encoders saved.


#### Preprocess review

In [21]:
# Show which columns `df` currently holds (display only).
df.columns

Index(['Id', 'Title', 'Price', 'User_id', 'profileName', 'review/helpfulness',
       'review/score', 'review/time', 'review/summary', 'review/text',
       'user_idx', 'item_idx'],
      dtype='object')

In [23]:
# Display the raw review text column (display only).
df["review/text"]


0          This is only for Julie Strain fans. It's a col...
1          I don't care much for Dr. Seuss but after read...
2          If people become the books they read and if "t...
3          Theodore Seuss Geisel (1904-1991), aka &quot;D...
4          Philip Nel - Dr. Seuss: American IconThis is b...
                                 ...                        
2999995    This is an extremely difficult book to digest,...
2999996    This is pretty interesting. Collingwood seems ...
2999997    This is a good book but very esoteric. "What i...
2999998    My daughter, a freshman at Indiana University,...
2999999    The guy has a few good ideas but, reader, bewa...
Name: review/text, Length: 3000000, dtype: object

In [25]:
# !pip install spacy

^C


Collecting spacy
  Downloading spacy-3.8.7-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.13-cp311-cp311-win_amd64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.11-cp311-cp311-win_amd64.whl.metadata (8.8 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.10-cp311-cp311-win_amd64.whl.metadata (2.5 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.6-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.5.1-cp311-cp311-win_amd6

In [34]:
# Book metadata sits in the same extracted-dataset folder as the ratings file.
# Note: this rebinds `df`, replacing whatever frame it referred to before.
dir_path = r"C:\Users\Admin\Documents\GitHub\Two-stages-recommendation-system\datasets\extracted"
file_path_rating = os.path.join(dir_path, 'books_data.csv')

# Read the CSV; problems are printed instead of raised.
try:
    df = pd.read_csv(file_path_rating)
except FileNotFoundError:
    print(f"Error: File not found at {file_path_rating}")
except Exception as e:
    print(f"An error occurred: {e!s}")
else:
    print("Data loaded successfully!")

Data loaded successfully!


In [35]:
# Preview the first three rows of the frame loaded from books_data.csv.
df.head(3)

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],


'Philip Nel takes a fascinating look into the key aspects of Seuss\'s career - his poetry, politics, art, marketing, and place in the popular imagination." "Nel argues convincingly that Dr. Seuss is one of the most influential poets in America. His nonsense verse, like that of Lewis Carroll and Edward Lear, has changed language itself, giving us new words like "nerd." And Seuss\'s famously loopy artistic style - what Nel terms an "energetic cartoon surrealism" - has been equally important, inspiring artists like filmmaker Tim Burton and illustrator Lane Smith. --from back cover'

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp311-cp311-win_amd64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp311-cp311-win_amd64.whl.metadata (41 kB)
Collecting requests (from transformers)
  Using cached requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting tqdm>=4.27 (from transformers)
  Using cached tqdm-4.67.1-py3-non

In [None]:
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# AutoTokenizer was used here without being imported anywhere in the notebook,
# which raises NameError on a fresh kernel.
from transformers import AutoTokenizer

# Text-classification pipeline backed by a DistilBERT emotion checkpoint.
classifier = pipeline("text-classification", model = "bhadresh-savani/distilbert-base-uncased-emotion")
# NOTE(review): this tokenizer comes from a different checkpoint than the
# pipeline's model and is not referenced by the cells shown here; confirm it
# is still needed.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


Device set to use cuda:0


In [None]:
# Smoke test: classify one sentence and print each field of the first result.
prediction = classifier("today i'm sad")
top_result = prediction[0]
for field in top_result:
    print(f"{field}: {top_result[field]}")


label: sadness
score: 0.9983229041099548


In [None]:
from transformers import pipeline

# Emotion classifier used by detect_emotion(); truncation=True lets the
# pipeline cut inputs that exceed the model's maximum length.
emotion_classifier = pipeline(
    task="text-classification",
    model="bhadresh-savani/distilbert-base-uncased-emotion",
    truncation=True,
)

def detect_emotion(text):
    """Return the top emotion label for ``text``, or ``"unknown"`` on failure.

    Parameters
    ----------
    text : str
        Text to classify. Only the first 1000 characters are sent to the
        classifier. Any non-string value (e.g. None or NaN for a missing
        review) yields ``"unknown"`` without calling the model.

    Returns
    -------
    str
        The 'label' field of the classifier's first result, or ``"unknown"``
        when the input is not a string or the classifier raises an error.
    """
    if not isinstance(text, str):
        return "unknown"
    try:
        result = emotion_classifier(text[:1000])[0]  # Safely truncate
        return result['label']
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit can still stop a long-running apply.
        return "unknown"

# Trial run on the first 1000 reviews only. The result is assigned back by
# index, so rows outside that sample receive no label.
sample_reviews = df['review/text'].head(1000)
df['emotion'] = sample_reviews.apply(detect_emotion)

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [4]:
# Preview review text next to its predicted emotion. This needs `df` to carry
# both the 'review/text' and 'emotion' columns; the recorded KeyError shows
# neither was present when this cell last ran.
df[['review/text', 'emotion']].head(3)

KeyError: "None of [Index(['review/text', 'emotion'], dtype='object')] are in the [columns]"