In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/reviews_and_metadata.csv')

In [3]:
df.shape

(183215, 18)

In [4]:
df['description'] = df['description'].str.replace(',', '').str.replace('[','').str.replace(']','')
df['description'] = df['description'].str.replace('"', '')
df['description'] = df['description'].str.replace("'", '')

  df['description'] = df['description'].str.replace(',', '').str.replace('[','').str.replace(']','')


**Preprocess and vectorize the title**

In [5]:

import gensim
from gensim.models import Word2Vec


# Preprocess the text by tokenizing and removing stop words
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    return tokens

df['title_tokens'] = df['title'].apply(preprocess_text)

# Train the Word2Vec model on the title tokens
word2vec_model = Word2Vec(df['title_tokens'], min_count=1, vector_size=100)

# Define a function to encode a sequence of tokens as a single vector
def encode_sequence(tokens, model):
    vec = np.zeros(model.vector_size)
    count = 0
    for token in tokens:
        if token in model.wv.key_to_index:
            vec += model.wv.get_vector(token)
            count += 1
    if count > 0:
        vec /= count
    return vec

# Encode the title tokens as vectors using the Word2Vec model
df['title_vectors'] = df['title_tokens'].apply(lambda tokens: encode_sequence(tokens, word2vec_model))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
pd.DataFrame(df['title'].unique())

Unnamed: 0,0
0,Crabtree &amp; Evelyn - Gardener's Ultra-Moist...
1,AHAVA Bath Salts
2,Supersmile Powdered Mouthrinse
3,Supersmile Professional Teeth Whitening Toothp...
4,"Archipelago Morning Mint Body Lotion ,18 Fl Oz"
...,...
2231,Laura Geller New York Brow Gel Pencil
2232,butter LONDON Glazen Eye Gloss
2233,butter LONDON Nail Polish
2234,ORIBE Bright Blonde Shampoo for Beautiful Color


In [7]:
df['title_vectors']

0         [0.09021070785820484, 0.6161754429340363, -0.2...
1         [0.09021070785820484, 0.6161754429340363, -0.2...
2         [0.09021070785820484, 0.6161754429340363, -0.2...
3         [0.09021070785820484, 0.6161754429340363, -0.2...
4         [0.09021070785820484, 0.6161754429340363, -0.2...
                                ...                        
183210    [0.7698620896786451, 0.32980798184871674, 0.09...
183211    [0.7698620896786451, 0.32980798184871674, 0.09...
183212    [0.7698620896786451, 0.32980798184871674, 0.09...
183213    [0.7698620896786451, 0.32980798184871674, 0.09...
183214    [0.7698620896786451, 0.32980798184871674, 0.09...
Name: title_vectors, Length: 183215, dtype: object

In [None]:
# Saving the vectorizer for later use

'''
import pickle
with open('word2vec_model.pkl', 'wb') as f:
    pickle.dump(word2vec_model, f)

'''

"\nimport pickle\nwith open('word2vec_model.pkl', 'wb') as f:\n    pickle.dump(word2vec_model, f)\n\n"

**Label Encoding the reviewer ID column**

In [8]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder
label_encoder = LabelEncoder()

# Fit the label encoder on the user_id column
label_encoder.fit(df['reviewerID'])

# Transform the user_id column into a numerical format
df['reviewerID_encoded'] = label_encoder.transform(df['reviewerID'])



In [None]:
# Saving the label encoder

'''
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

'''

"\nwith open('label_encoder.pkl', 'wb') as f:\n    pickle.dump(label_encoder, f)\n\n"

**Creating a column**

What I am trying to do here is, I am creating a column titled "RecommendFlag" which has binary values 0/1. If the rating of the product is above 4, it means that it is a good product and the RecommendFlag value for that product will be 1. Otherwise it will be 0. This is going to be the target variable for the classification neural network.

In [9]:
# Create a new column called RecommendFlag
df['RecommendFlag'] = (df['overall'] > 4).astype(int)


In [10]:
df['RecommendFlag'].value_counts()

1    125979
0     57236
Name: RecommendFlag, dtype: int64

In [None]:
# Now, I will save this updated dataset for later use

#df.to_csv('reviews_and_metadata_updated.csv')

**Neural Network model**

In [12]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input, concatenate
from keras.models import Model
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError


# Load the dataset
X_title = np.array(df['title_vectors'].tolist())
X_user = np.array(df['reviewerID_encoded'])

# Normalize the user input data
scaler = StandardScaler()
X_user = scaler.fit_transform(X_user.reshape(-1, 1))

y = np.array(df['RecommendFlag'])

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_title_train, X_title_test, X_user_train, X_user_test, y_train, y_test = train_test_split(X_title, X_user, y, test_size=0.2)


# Define the neural network architecture
input_title = Input(shape=(X_title.shape[1],))
input_user = Input(shape=(1,))
x1 = Dense(64, activation='relu')(input_title)
x2 = Dense(64, activation='relu')(input_user)
merged = concatenate([x1, x2])
merged = Dropout(0.5)(merged)
output = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[input_title, input_user], outputs=output)

# Compile the model
#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.compile(optimizer='adam', loss=MeanSquaredError(), metrics=[RootMeanSquaredError()])

# Train the model
model.fit([X_title_train, X_user_train],y_train, epochs=10, batch_size=32)

# Evaluate the model on the testing set
test_loss, test_rmse = model.evaluate([X_title_test, X_user_test], y_test)

print(f'Test rmse: {test_rmse}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test rmse: 0.45719361305236816


In [None]:
# Saving the model for later use

#model.save('neural_recommendation_model.h5')

In [None]:
# Saving the StandardScaler
'''
import pickle
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
'''

"\nimport pickle\nwith open('scaler.pkl', 'wb') as f:\n    pickle.dump(scaler, f)\n"

In [14]:
# Get the title vectors for all items
all_title_vectors = np.array(df['title_vectors'].tolist())


# Remove duplicates from the dataframe
df = df.drop_duplicates(subset='title')
df.reset_index(drop=True)

# Normalize the user ID for the particular user_id
user_id = 'A2UXFNW9RTL4VM'  
user_id_encoded = label_encoder.transform([user_id])
normalized_user_id = scaler.transform([user_id_encoded])
normalized_user_id = np.repeat(normalized_user_id, all_title_vectors.shape[0], axis=0)


# Get the predicted recommendation score for each item for the given user_id
predicted_scores = model.predict([all_title_vectors, normalized_user_id])



# Sort the items based on their recommendation scores
sorted_indices = np.argsort(predicted_scores, axis=0)[::-1].flatten()


#print(sorted_indices)


# Display the top N recommended items for the user

N = 20
recommended_indices = sorted_indices[:N]
#print(recommended_indices)
print("Product Recommendations for User: ", user_id)
print()
for i in recommended_indices:
    print(df.iloc[i]['title'])

 
      


Product Recommendations for User:  A2UXFNW9RTL4VM

SEXYHAIR Style Hard Up Hard Holding Gel
CHI Argan Oil
Crabtree &amp; Evelyn Ultra-Moisturising Hand Cream Therapy, Evelyn Rose - 3.5 oz
Noodle &amp; Boo Soothing Baby Body Wash for Gentle Baby Care
Crabtree &amp; Evelyn Conditioning Hand Wash
L'Occitane Moisturizing Hand Lotion
Crabtree &amp; Evelyn Shave Cream
Anthony Shave Cream
Crabtree &amp; Evelyn Ultra-Moisturising Hand Therapy,Lavender,8.8 oz.
AG Hair Smooth The Oil Smoothing Oil
Noodle &amp; Boo Super Soft Moisturizing Lotion for Daily Baby Care, Sensitive Skin and Hypoallergenic
CHI Argan Oil Plus Moringa Oil Shampoo
Eau Thermale Av&egrave;ne Micellar Lotion
Epicuren Discovery After Bath Body Moisturizer
Crabtree &amp; Evelyn Moisturising Body Lotion
Mustela Bath Oil, Gentle Baby Bath Oil with Natural Avocado Oil, for Dry Skin
SABON Butter Cream
Epicuren Discovery Micro-Derm Ultra-Refining Scrub
Mustela Baby Oil, Moisturizing Oil for Baby Massage, Natural Avocado Oil, Pomegran