<a href="https://colab.research.google.com/github/Shimwoshili/chatbot/blob/main/tololi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats

# Visualization
import seaborn as sns

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
# Load the user-to-shoe ratings table from the working directory
product = pd.read_csv(filepath_or_buffer="shoes.csv")
# Preview the first five rows
product.head(n=5)

Unnamed: 0,UserId,ShoeId,Rating
0,1,1,4
1,2,3,4
2,3,3,3
3,4,4,5
4,5,5,5


In [16]:
# Get the dataset information: row count, per-column dtype and non-null count, memory usage
product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   UserId  19 non-null     int64
 1   ShoeId  19 non-null     int64
 2   Rating  19 non-null     int64
dtypes: int64(3)
memory usage: 584.0 bytes


In [17]:
# Summarise how many distinct values each column of the ratings table holds.
# Number of distinct users
print('The product dataset has', product['UserId'].nunique(), 'unique users')
# Number of distinct shoes
print('The product dataset has', product['ShoeId'].nunique(), 'unique product')
# Number of distinct rating values
print('The product dataset has', product['Rating'].nunique(), 'unique ratings')
# The distinct rating values themselves, in ascending order
print('The unique ratings are', sorted(product['Rating'].unique()))

The product dataset has 19 unique users
The product dataset has 18 unique product
The product dataset has 4 unique ratings
The unique product are [2, 3, 4, 5]


In [20]:
# Read the shoe catalogue (ShoeId -> name).
# A relative path is used, the same way shoes.csv is loaded, so the notebook does not
# depend on the absolute Colab directory /content being present.
shoe = pd.read_csv('shoe.csv')
# Take a look at the data
shoe.head()

Unnamed: 0,ShoeId,name
0,1,All star
1,2,vanze
2,3,air force
3,4,crocks
4,5,air max


In [22]:
# Attach the shoe name to every rating by joining on ShoeId; the inner join keeps
# only ratings whose ShoeId also appears in the shoe catalogue
df = product.merge(shoe, how='inner', on='ShoeId')
# Preview the combined table
df.head(n=5)

Unnamed: 0,UserId,ShoeId,Rating,name
0,1,1,4,All star
1,2,3,4,air force
2,3,3,3,air force
3,4,4,5,crocks
4,5,5,5,air max


In [31]:
# Per-shoe summary: the average rating and how many ratings each shoe received
agg_ratings = (
    df.groupby('name')['Rating']
      .agg(mean_rating='mean', number_of_ratings='count')
      .reset_index()
)
# Retain only the shoes that were rated more than once
agg_ratings_GT1 = agg_ratings.loc[agg_ratings['number_of_ratings'].gt(1)]
# Inspect the filtered frame
agg_ratings_GT1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 2 to 2
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               1 non-null      object 
 1   mean_rating        1 non-null      float64
 2   number_of_ratings  1 non-null      int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 32.0+ bytes


In [32]:
# Rank the retained shoes from most- to least-rated and show the first five
(
    agg_ratings_GT1
    .sort_values(by='number_of_ratings', ascending=False)
    .head(n=5)
)

Unnamed: 0,name,mean_rating,number_of_ratings
2,air force,3.5,2


In [35]:
# Restrict the merged ratings to the shoes that passed the ratings-count filter
df_GT1 = df.merge(agg_ratings_GT1[['name']], how='inner', on='name')
# Inspect the filtered ratings
df_GT1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2 entries, 0 to 1
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   UserId  2 non-null      int64 
 1   ShoeId  2 non-null      int64 
 2   Rating  2 non-null      int64 
 3   name    2 non-null      object
dtypes: int64(3), object(1)
memory usage: 80.0+ bytes


In [37]:
# Report the number of distinct users, shoes and rating values left after filtering
for _col, _label in (('UserId', 'unique users'),
                     ('ShoeId', 'unique shoe'),
                     ('Rating', 'unique ratings')):
    print('The ratings dataset has', df_GT1[_col].nunique(), _label)
# The distinct rating values themselves, in ascending order
print('The unique ratings are', sorted(df_GT1['Rating'].unique()))

The ratings dataset has 2 unique users
The ratings dataset has 1 unique shoe
The ratings dataset has 2 unique ratings
The unique ratings are [3, 4]


In [38]:
# Build the ratings matrix: one row per shoe name, one column per UserId.
# pivot_table applies its default aggregation (mean) when a user rated a shoe more than once.
matrix = pd.pivot_table(df_GT1, values='Rating', index='name', columns='UserId')
matrix.head(n=5)

UserId,2,3
name,Unnamed: 1_level_1,Unnamed: 2_level_1
air force,4,3


In [39]:
# Centre every shoe's ratings on zero by removing that shoe's own mean rating (row-wise)
matrix_norm = matrix.sub(matrix.mean(axis='columns'), axis='index')
matrix_norm.head(n=5)

UserId,2,3
name,Unnamed: 1_level_1,Unnamed: 2_level_1
air force,0.5,-0.5


In [40]:
# Shoe-to-shoe similarity: transpose so shoes become columns, then take the pairwise
# Pearson correlation between those columns
item_similarity = matrix_norm.transpose().corr(method='pearson')
item_similarity.head(n=5)

name,air force
name,Unnamed: 1_level_1
air force,1.0


In [41]:
# Item similarity matrix using cosine similarity.
# Missing ratings are replaced with 0 before the computation. The result is displayed
# as a plain NumPy array, so the shoe-name labels of matrix_norm are not carried over.
item_similarity_cosine = cosine_similarity(matrix_norm.fillna(0))
item_similarity_cosine

array([[1.]])

In [None]:
# Pick a user ID
picked_userId = 1
# Pick a shoe
picked_shoe = 'All star'
# Shoes that the target user has bought, as a (name, Rating) table sorted from the
# highest to the lowest normalised rating.
if picked_userId in matrix_norm.columns:
    picked_userId_bought = (
        matrix_norm[picked_userId]
        .dropna()
        .sort_values(ascending=False)
        .rename('Rating')  # label the values by meaning so this works for any user ID, not just 1
        .reset_index()
    )
else:
    # A user whose ratings were all for shoes removed by the ratings-count filter has no
    # column in the matrix; treat that as an empty purchase history rather than a KeyError.
    picked_userId_bought = pd.DataFrame({'name': pd.Series(dtype=object),
                                         'Rating': pd.Series(dtype=float)})
picked_userId_bought.head()

In [None]:
# Predict the picked user's rating for the picked shoe as the similarity-weighted
# average of the ratings that user gave to the most similar shoes.

# Similarity of the picked shoe to every other shoe, joined onto the shoes the user has rated
if picked_shoe in item_similarity.columns:
    picked_shoe_similarity_score = (
        item_similarity[picked_shoe].rename('similarity_score').reset_index()
    )
    picked_userid_bought_similarity = (
        pd.merge(left=picked_userId_bought,
                 right=picked_shoe_similarity_score,
                 on='name',
                 how='inner')
        .dropna(subset=['similarity_score'])  # undefined correlations cannot act as weights
        .sort_values('similarity_score', ascending=False)
        .head(5)  # keep the 5 most similar shoes, the same default item_based_rec uses
    )
else:
    # The picked shoe was removed by the ratings-count filter, so it has no similarity scores
    picked_userid_bought_similarity = pd.DataFrame({'Rating': pd.Series(dtype=float),
                                                    'similarity_score': pd.Series(dtype=float)})

# np.average raises when the weights are empty or sum to zero, so guard that case explicitly
similarity_total = picked_userid_bought_similarity['similarity_score'].sum()
if picked_userid_bought_similarity.empty or np.isclose(similarity_total, 0.0):
    predicted_rating = np.nan
    print(f'No rating can be predicted for {picked_shoe} by user {picked_userId}: no usable similar shoes')
else:
    predicted_rating = round(np.average(picked_userid_bought_similarity['Rating'],
                                        weights=picked_userid_bought_similarity['similarity_score']), 6)
    print(f'The predicted rating for {picked_shoe} by user {picked_userId} is {predicted_rating}')

In [None]:
# Item-based recommendation function
def item_based_rec(picked_userid=1, number_of_similar_items=5, number_of_recommendations =3,
                   ratings=None, similarity=None):
  """Recommend shoes a user has not rated yet, ranked by predicted rating.

  For every shoe the user has no rating for, the predicted rating is the
  similarity-weighted average of the user's ratings on the most similar shoes
  the user has rated.

  Parameters
  ----------
  picked_userid : hashable
      Column label of the target user in the ratings matrix.
  number_of_similar_items : int
      Maximum number of most-similar rated shoes used for each prediction.
  number_of_recommendations : int
      Maximum number of (shoe, predicted rating) pairs to return.
  ratings : pandas.DataFrame, optional
      Matrix with one row per shoe and one column per user; NaN marks "not rated".
      Defaults to the notebook-level ``matrix_norm``.
  similarity : pandas.DataFrame, optional
      Square shoe-by-shoe similarity matrix labelled like the rows of ``ratings``.
      Defaults to the notebook-level ``item_similarity``.

  Returns
  -------
  list of (shoe, float)
      Pairs sorted by predicted rating, highest first. The list is empty when the
      user is not a column of the matrix or when no prediction can be computed.
  """
  # Fall back to the notebook-level matrices when none are passed in
  ratings = matrix_norm if ratings is None else ratings
  similarity = item_similarity if similarity is None else similarity

  # A user without a column in the matrix has no rating history to predict from
  if picked_userid not in ratings.columns:
    return []

  user_ratings = ratings[picked_userid]
  # Shoes that the target user has not bought (no rating recorded)
  picked_userid_unbought = user_ratings.index[user_ratings.isna()].tolist()
  # Shoes that the target user has bought, with the rating given to each
  picked_userid_bought = user_ratings.dropna().rename('Rating')

  # Dictionary to save the unbought shoe and predicted rating pair
  rating_prediction = {}
  # Loop through the unbought shoes, predicting a rating for each one in turn
  for candidate_shoe in picked_userid_unbought:
    # A shoe missing from the similarity matrix cannot be compared with anything
    if candidate_shoe not in similarity.columns:
      continue
    # Pair each bought shoe's rating with its similarity to the candidate and keep
    # the most similar ones; NaN similarities are dropped because they cannot be weights
    neighbours = (
        picked_userid_bought.to_frame()
        .join(similarity[candidate_shoe].rename('similarity_score'), how='inner')
        .dropna(subset=['similarity_score'])
        .sort_values('similarity_score', ascending=False)
        .head(number_of_similar_items)
    )
    # np.average raises when the weights are empty or sum to zero, so skip those shoes
    weight_total = neighbours['similarity_score'].sum()
    if neighbours.empty or np.isclose(weight_total, 0.0):
      continue
    # Weighted average of the user's ratings, weighted by similarity to the candidate
    predicted_rating = np.average(neighbours['Rating'], weights=neighbours['similarity_score'])
    # Save the prediction under the candidate shoe itself
    rating_prediction[candidate_shoe] = round(float(predicted_rating), 6)

  # Return the top recommended shoes, highest predicted rating first
  ranked = sorted(rating_prediction.items(), key=lambda pair: pair[1], reverse=True)
  return ranked[:number_of_recommendations]
# Get recommendations: up to 3 shoes for user 1, each prediction based on up to 5 similar shoes
recommended_shoe = item_based_rec(picked_userid=1, number_of_similar_items=5, number_of_recommendations =3)
recommended_shoe