In [2]:
# Import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive at /content/drive so the following cells can read from it.
# NOTE: the google.colab package is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the reviews CSV from the mounted Drive folder into a DataFrame,
# then show its (rows, columns) shape as the cell output
review_reco = pd.read_csv('/content/drive/MyDrive/Recommendation System Project/sample30.csv')
review_reco.shape


In [None]:
# Show the column labels of the loaded reviews frame
review_reco.columns

In [None]:
# Count the missing values in every column

review_reco.isnull().sum()

In [None]:
# Keep only the rows that carry a reviews_username; rows without one are discarded
review_reco = review_reco[review_reco['reviews_username'].notna()]
review_reco.shape


In [None]:
# Hold out 30% of the reviews as the test split; the fixed seed keeps the split repeatable

train, test = train_test_split(
    review_reco,
    test_size=0.30,
    random_state=31,
)
print(train.shape, test.shape, sep='\n')


### User-User Recommendation System

In [9]:
# User x product rating matrix built from the training split:
# one row per reviews_username, one column per product id, cell = reviews_rating
# (repeated user/product pairs are combined by pivot_table's default aggregation).
# User/product pairs without a rating are filled with 0.
df_pivot = pd.pivot_table(
    train,
    values='reviews_rating',
    index='reviews_username',
    columns='id',
)
df_pivot = df_pivot.fillna(0)

In [None]:
# Preview 10 randomly chosen user rows of the rating matrix
df_pivot.sample(10)

#### Creating dummy train & dummy test dataset
These datasets will be used for prediction and evaluation.
- Dummy train will be used later to predict ratings for the products that have not been rated by the user. To ignore the products already rated by the user, we mark them as 0 during prediction. The products not rated by the user are marked as 1 in the dummy train dataset.

- Dummy test will be used for evaluation. To evaluate, we only make predictions on the products already rated by the user, so these are marked as 1. This is just the opposite of dummy_train.

In [11]:
# Work on an independent (deep) copy so the train split itself is left untouched
dummy_train = train.copy(deep=True)

In [12]:
# Turn the ratings into a prediction mask: an existing rating (>= 1) becomes 0,
# anything else (including a missing rating) becomes 1.
dummy_train['reviews_rating'] = (~(dummy_train['reviews_rating'] >= 1)).astype('int64')


In [None]:
# Reshape the dummy train data into a user x product mask.
# User/product pairs that never occur in train come out of the pivot as NaN and are
# filled with 1, i.e. they stay eligible for recommendation.

dummy_train = pd.pivot_table(
    dummy_train,
    values='reviews_rating',
    index='reviews_username',
    columns='id',
).fillna(1)
dummy_train.sample(3)

**Cosine Similarity**

Cosine Similarity is a measurement that quantifies the similarity between two vectors [Which is Rating Vector in this case]

**Adjusted Cosine**

Adjusted cosine similarity is a modified version of vector-based similarity that accounts for the fact that different users have different rating schemes. In other words, some users tend to rate items highly in general, while others tend to give lower ratings. To remove this bias from the ratings given by each user, we subtract each user's average rating from that user's ratings of the different products.



In [14]:
# Use adjusted cosimilarity to calculate the similarity between the users
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation



In [15]:
# Rebuild the user x product rating matrix, this time WITHOUT filling the gaps:
# unrated user/product pairs stay NaN so they can be ignored when averaging

df_pivot = pd.pivot_table(
    train,
    values='reviews_rating',
    index='reviews_username',
    columns='id',
)

In [None]:
# Ratings of the first user row in the matrix, highest rating first
df_pivot.iloc[0].sort_values(ascending=False)

In [None]:
# Centre every user's ratings on zero by subtracting that user's own mean rating.
# nanmean skips the NaN (unrated) entries, and they remain NaN after the subtraction.
mean = np.nanmean(df_pivot, axis=1)
df_subtracted = df_pivot.sub(mean, axis=0)

df_subtracted.sample(3)

In [None]:
# User-user similarity on the mean-centred ratings (unrated entries count as 0).
# pairwise_distances yields the cosine distance, so similarity = 1 - distance.
user_correlation = np.subtract(
    1,
    pairwise_distances(df_subtracted.fillna(0), metric='cosine'),
)
# Any undefined (NaN) similarity is treated as "no similarity"
user_correlation = np.where(np.isnan(user_correlation), 0, user_correlation)
user_correlation

#### Prediction User User

Perform prediction based on the correlation matrix

In [None]:
# Prediction only considers positive correlation:
# every negative similarity is overwritten with 0, in place

np.putmask(user_correlation, user_correlation < 0, 0)
user_correlation.shape

In [None]:
# Two matrices are available at this point:
#   - user_correlation: the non-negative user-user similarities
#   - df_pivot:         the user x product ratings
# Their dot product gives, for every user and product, the similarity-weighted
# sum of the ratings (missing ratings contribute 0) -- the predicted rating score.

user_predicted_ratings = user_correlation.dot(df_pivot.fillna(0).to_numpy())
user_predicted_ratings

In [None]:
# Dimensions of the predicted-rating array
user_predicted_ratings.shape

In [None]:
# user_predicted_ratings holds a score for every user/product pair, but only products
# the user has NOT rated yet should be recommended.
# Multiplying element-wise by the dummy_train mask (0 = already rated, 1 = not rated)
# zeroes out the already-rated products.

user_final_rating = dummy_train.mul(user_predicted_ratings)
user_final_rating.head()

#### Finding the Top 20 products that a user is most likely to purchase based on the ratings (user-user based recommendation)


In [None]:
# Top 20 products for one user, selected by row position in user_final_rating

top20_recommendation = (
    user_final_rating
    .iloc[23]
    .sort_values(ascending=False)
    .head(20)
)

# Attach the product names from the original dataset, keeping one row per product id
top20_recommendation = pd.merge(
    top20_recommendation,
    review_reco[['id', 'name']],
    how='left',
    on='id',
).drop_duplicates(subset='id', keep='first')
top20_recommendation


#### Evaluation - user-user recommendation

Here we will use the products already rated by the user

In [None]:
# Test reviews written by users that also appear in the train split
common = test.loc[test['reviews_username'].isin(train['reviews_username'])]
common.shape

In [25]:
# Reshape the common users' test reviews into a user x product rating matrix
common_user_based_matrix = pd.pivot_table(
    common,
    values='reviews_rating',
    index='reviews_username',
    columns='id',
)

In [None]:
# Preview the first rows of the common users' rating matrix
common_user_based_matrix.head()

In [None]:
# Wrap the user_correlation array in a DataFrame (rows and columns get default
# integer labels at this point) and preview it

user_correlation_df = pd.DataFrame(user_correlation)
user_correlation_df.head()

In [None]:
# Replace the integer row labels with the user names from df_pivot,
# so the similarity rows can later be selected by reviews_username
user_correlation_df = (
    user_correlation_df
    .assign(reviews_username=df_pivot.index)
    .set_index('reviews_username')
)
user_correlation_df.head()


In [None]:
# Replace the integer column labels of the similarity frame with the user names
# taken from df_pivot's index, so the columns can be selected by reviews_username too

user_correlation_df.columns = df_pivot.index.tolist()
# head must be *called*; a bare `.head` only displays the bound-method repr
user_correlation_df.head()

In [30]:
# User names of the common (train and test) reviews, one entry per review row
common_user_names = common['reviews_username'].to_list()

In [None]:
# Keep only the similarity rows that belong to the common users
common_user_correlation_df = user_correlation_df.loc[
    user_correlation_df.index.isin(common_user_names)
]
common_user_correlation_df.shape


In [None]:
# Apply the same user filter along the other axis: transpose, keep the rows whose label
# is a common user, then transpose back so both axes cover the common users only
common_user_correlation_df1 = common_user_correlation_df.T.loc[
    lambda transposed: transposed.index.isin(common_user_names)
]
common_user_correlation_df2 = common_user_correlation_df1.T
common_user_correlation_df2.head()

In [None]:
# Dimensions of the similarity matrix restricted to the common users
common_user_correlation_df2.shape

In [34]:
# common_user_based_matrix: treat every missing rating (NaN) as 0
common_user_based_matrix = common_user_based_matrix.fillna(0)



In [None]:
# Predicted ratings for the common users: their user-user similarity matrix
# dot their user x product rating matrix.
# Negative similarities are floored at 0 with clip(), which returns a new frame.
# The previous boolean-mask assignment wrote into a frame derived from
# filtering/transposing, which mutates it in place and can raise SettingWithCopyWarning.

common_user_correlation_df2 = common_user_correlation_df2.clip(lower=0)
# fillna(0) is kept so the product is NaN-free even if the fill cell was skipped
common_user_rating = np.dot(common_user_correlation_df2, common_user_based_matrix.fillna(0))
common_user_rating

In [None]:
# Evaluation mask: 1 where a common user actually rated a product in test, otherwise 0.
# The mask is pivoted on 'id' -- the same key as common_user_based_matrix, from which
# common_user_rating is computed. common_user_rating is a plain ndarray and is combined
# with this mask by position, so pivoting on 'name' would put the columns in a
# different order (or count) and mask the wrong products.
dummy_test = common.copy()
dummy_test['reviews_rating'] = dummy_test['reviews_rating'].apply(lambda x: 1 if x>=1 else 0)
dummy_test = pd.pivot_table(index='reviews_username',
                            columns='id',
                            values='reviews_rating', data=dummy_test).fillna(0)
dummy_test

In [None]:
# Keep the predictions only where the dummy_test mask is 1; all other entries become 0
common_user_pred_ratings = dummy_test.mul(common_user_rating)
common_user_pred_ratings

Calculating the RMSE for only the products rated by user.

For RMSE, normalising the rating to (1,5) range.

In [None]:
from sklearn.preprocessing import MinMaxScaler


# Keep only the strictly positive predictions; every other entry becomes NaN
X = common_user_pred_ratings.where(common_user_pred_ratings > 0)

# Rescale the remaining predictions into the 1-5 rating range.
# NOTE: MinMaxScaler rescales each column on its own minimum/maximum.
scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = scaler.transform(X)

print(y)

In [39]:
# Actual test ratings of the common users as a user x product matrix (unrated pairs are NaN)
common_ = pd.pivot_table(
    common,
    values='reviews_rating',
    index='reviews_username',
    columns='id',
)


In [40]:
# Number of entries in y that are not NaN: all entries minus the NaN ones
total_non_nan = y.size - np.count_nonzero(np.isnan(y))

In [None]:
# RMSE between the actual test ratings (common_) and the rescaled predictions (y).
# Entries where either side is NaN give a NaN squared error, which nansum skips.
# numpy is reached through the `np` alias imported at the top of the notebook;
# `from numpy import *` is avoided because it replaces builtins such as
# sum/any/all for every later cell.
squared_error = (common_.to_numpy() - y) ** 2
rmse = (np.nansum(squared_error) / total_non_nan) ** 0.5
print(rmse)

In [42]:
# Store the user-user recommendation matrix using pickle.
# The file is opened in a `with` block so the handle is flushed and closed as soon as
# the dump finishes, instead of leaving an open file object behind.
import pickle
filename = '/content/drive/MyDrive/Recommendation System Project/Pickle/user_user_recommendation_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(user_final_rating, model_file)



### Fine-Tuning the Recommendation System and Recommendation of Top 5 Products
---
Predicted the sentiment (positive or negative) of all the reviews in the train data set of the top 20 recommended products for a user. For each of the 20 products recommended, found the percentage of positive sentiments for all the reviews of each product. Filtered out the top 5 products with the highest percentage of positive reviews