# Importing Library

In [1]:
import pandas as pd
from google.colab import drive
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from google.colab import drive
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Importing data

In [2]:
def optimise_data(data):    
    #Optimizing the memory use by down sizing the data types to the requirements
    data['order_id']=data['order_id'].astype('int32')
    data['user_id']=data['user_id'].astype('int32')
    data['order_number']=data['order_number'].astype('int16')
    data['order_dow']=data['order_dow'].astype('int16')
    data['order_hour_of_day']=data['order_hour_of_day'].astype('int16')
    data['days_since_prior_order']=data['days_since_prior_order'].astype('int16')
    data['product_id']=data['product_id'].astype('int32')
    data['add_to_cart_order']=data['add_to_cart_order'].astype('int16')
    data['reordered']=data['reordered'].astype('int16')
    data['aisle_id']=data['aisle_id'].astype('int16')
    data['department_id']=data['department_id'].astype('int16')
    data.drop(['Unnamed: 0','index'],axis=1,inplace=True)    
    return data

In [3]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [119]:
data=pd.read_csv('/content/drive/MyDrive/Tittu_data/final_data.csv')
data=optimise_data(data)

In [120]:
#Transformation
data['user_purchase_count']=data.groupby('user_id')['user_id'].transform('count')
data['product_ordered_count']=data.groupby('product_id')['product_id'].transform('count')

# Model Building

**Collaborative filtering Using Cosine similarity**

In [6]:
#Now we will be filtering the data out with relavant products
#Only those customer who has made min of 250 orders and products which were order a min of x time will be taken for further analysis
df1=data[(data['user_purchase_count']>250)]
df1=df1[(df1['product_ordered_count']>=250)]
#df1=df1[df1['reordered']==1]
df1=df1[['user_id','product_name','reordered']]
df1=df1.set_index('user_id')
#df1['tri']=df1.groupby(['user_id','product_name'])['product_name'].transform('count')
product_based=pd.pivot_table(df1,columns='user_id',index='product_name',values='reordered',aggfunc='mean')
product_based=product_based.fillna(0)
user_based=pd.pivot_table(df1,index='user_id',columns='product_name',values='reordered',aggfunc='mean')
user_based=user_based.fillna(0)
#del(data)
del(df1)

In [7]:
#finding cosin similarity
product_similarity_score=cosine_similarity(product_based)         #Compare each product with other product and find similar product
product_similarity_score.shape
user_similarity_score=cosine_similarity(user_based)         #Compare each user with other users and find similar users
user_similarity_score.shape

(36576, 36576)

In [8]:
#Defining a function which would take a product name and returns suggesion based on the product
def product_recommender(product_name):
  index=np.where(product_based.index==product_name)[0][0]
  similar_items=sorted(list(enumerate(product_similarity_score[index])),key=lambda x:x[1],reverse=True)[1:6]
  name=[]
  for i in similar_items:
    name.append(product_based.index[i[0]])
  return name

In [13]:
product_recommender('0% Fat Free Organic Milk')

['Vitamin D Organic Whole Milk',
 'Organic Skim Milk',
 'Organic 1% Milk',
 'Original Sparkling Seltzer Water Cans',
 '2% Reduced Fat Organic Milk']

In [111]:
#takes user id and returns most frequently purchased products
def users_top_purchase(similar_user1,user_name):
  result=(pd.merge((data[data['user_id']==similar_user1]),(data[data['user_id']==user_name]),how='outer')).sort_values('product_ordered_count',ascending=False)[['product_name','product_ordered_count']]
  result.drop_duplicates(inplace=True)
  return result['product_name'].head(10)

#Defining a function which would take a user name and returns similar users
def user_recommender(user_name):
  index=np.where(user_based.index==user_name)[0][0]
  similar_items=sorted(list(enumerate(user_similarity_score[index])),key=lambda x:x[1],reverse=True)[1:6]
  name=[]
  for i in similar_items:
    name.append(user_based.index[i[0]])
  return users_top_purchase(name[0],user_name)

In [121]:
user_recommender(206154)

441                                  Banana
150                  Bag of Organic Bananas
182                    Organic Strawberries
203                    Organic Baby Spinach
465                    Organic Hass Avocado
192                     Organic Raspberries
191                           Organic Lemon
318              Sparkling Water Grapefruit
181    Organic Large Extra Fancy Fuji Apple
238                       Organic Red Onion
Name: product_name, dtype: object

In [48]:
user_recommender(71)

266                     Banana
253     Bag of Organic Bananas
286       Organic Strawberries
343       Organic Baby Spinach
981       Organic Hass Avocado
288                Large Lemon
287                      Limes
71          Organic Whole Milk
506             Organic Garlic
1013          Organic Zucchini
Name: product_name, dtype: object

In [49]:
user_recommender(17)

655                                   Strawberries
184                            Organic Raspberries
754                         100% Whole Wheat Bread
661                                    Raspberries
660                            Granny Smith Apples
130    Organic Large Brown Grade AA Cage Free Eggs
229                             Honey Nut Cheerios
104                            Reduced Fat 2% Milk
23                                    Orange Juice
668                           Strawberry Preserves
Name: product_name, dtype: object

**Collaborative filtering Using Near Neighbours**

In [50]:
#Product based
table_sparse1=csr_matrix(product_based)  #To consider only value and to avoid 0 to reduce computation time
#Building the model
model=NearestNeighbors(algorithm='brute')
model.fit(table_sparse1)
#Input 
def nn_recommend_product(product_name):
  id=np.where(product_based.index==product_name)[0][0]
  distance,suggestion=model.kneighbors(product_based.iloc[id,:].values.reshape(1,-1),n_neighbors=5)
  name=[]
  for i in range(len(suggestion)):
    name.extend(product_based.index[suggestion[i]])
    return name

In [51]:
nn_recommend_product('0% Fat Blueberry Greek Yogurt')

['0% Fat Blueberry Greek Yogurt',
 'Bamboo Skewers',
 'Organic Whole Bean Coffee',
 'Omeprazole Acid Reducer Tablets',
 'Ground Mustard']

In [56]:
#user based
table_sparse2=csr_matrix(user_based)  #To consider only value and to avoid 0 to reduce computation time
#Building the model
model=NearestNeighbors(algorithm='brute')
model.fit(table_sparse2)
#Input 
def nn_recommend_user(user_name):
  id=np.where(user_based.index==user_name)[0][0]
  distance,suggestion=model.kneighbors(user_based.iloc[id,:].values.reshape(1,-1),n_neighbors=5)
  name=[]
  for i in range(len(suggestion)):
    name.extend(user_based.index[suggestion[i]])
    return users_top_purchase(name[0],user_name)

In [57]:
nn_recommend_user(17)

129                   Strawberries
236         100% Whole Wheat Bread
143                    Raspberries
139            Granny Smith Apples
149           Strawberry Preserves
9                             Cola
179    Smoked Turkey Breast Slices
159         Squeeze Tomato Ketchup
296                     Chardonnay
265                          Vodka
Name: product_name, dtype: object

**Popularity Based Model**

In [61]:
#Recommends top 10 popular products
data['Product_reorder_count']=data.groupby('product_name')['reordered'].transform('sum')
data['reorder_ratio']=(data['Product_reorder_count']/(len(data['reordered']==1)))
popular_products=data[['product_id','product_name','product_ordered_count','Product_reorder_count','reorder_ratio','department']]
popular_products=popular_products.drop_duplicates().reset_index(drop=bool)
popular_products['weight']=popular_products['product_ordered_count']*popular_products['reorder_ratio']
top_10_popular=popular_products.sort_values('weight',ascending=False)['product_name'].head(10).values
top_10_popular

array(['Banana', 'Bag of Organic Bananas', 'Organic Strawberries',
       'Organic Baby Spinach', 'Organic Hass Avocado', 'Organic Avocado',
       'Large Lemon', 'Organic Whole Milk', 'Organic Raspberries',
       'Strawberries'], dtype=object)

**Model Based Methods**

In [62]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3095473 sha256=3e6c57b5ad5b2626e159999b57871fe9e128402b334ff801f41ecaf2963ddd99
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.

In [68]:
import surprise
from sklearn.preprocessing import MinMaxScaler 

In [64]:
df1=data[['product_name','user_id','reordered']]

In [67]:
df1['product_ordered_per_user']=df1.groupby(['user_id'])['product_name'].transform('count')
df1['product_per_user']=df1.groupby(['user_id','product_name'])['product_name'].transform('count')
df1['product_ratio']=df1['product_per_user']/df1['product_ordered_per_user']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['product_ordered_per_user']=df1.groupby(['user_id'])['product_name'].transform('count')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['product_per_user']=df1.groupby(['user_id','product_name'])['product_name'].transform('count')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['product_r

In [69]:
scaler=MinMaxScaler(feature_range=(0,5))
df1['scaled']=scaler.fit_transform(df1['product_ratio'].values.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['scaled']=scaler.fit_transform(df1['product_ratio'].values.reshape(-1,1))


In [70]:
df1['rating']=np.round(df1['scaled'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['rating']=np.round(df1['scaled'])


In [71]:
df1['rating']=df1['rating'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['rating']=df1['rating'].astype('int')


In [73]:
from surprise import Dataset
from surprise import Reader
reader = Reader(rating_scale=(0, 5))
data_set= Dataset.load_from_df(df1[['user_id', 'product_name', 'rating']], reader)

In [74]:
from surprise import SVD
from surprise.model_selection import cross_validate

svd = SVD(verbose=True, n_epochs=10)
cross_validate(svd, data_set, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.2463  0.2450  0.2498  0.2471  0.0020  
MAE (testset)     0.1329  0.1303  0.1329  0.1320  0.0012  
Fit time          194.89  214.32  214.18  207.80  9.13    
Test time         186.10  157.26  159.20  167.52  13.16   


{'test_rmse': array([0.24630081, 0.24504905, 0.24981076]),
 'test_mae': array([0.13287172, 0.1303328 , 0.13289897]),
 'fit_time': (194.88934683799744, 214.32352662086487, 214.17640376091003),
 'test_time': (186.10062384605408, 157.25913381576538, 159.20393824577332)}

In [75]:
import string # special operations on strings
import spacy # language models

In [77]:
#Sentiment analysis
afinn = pd.read_csv('/content/drive/MyDrive/Tittu_data/Afinn.csv', sep=',', encoding='latin-1')
afinn.shape
affinity_scores = afinn.set_index('word')['value'].to_dict()

In [78]:
nlp = spacy.load('en_core_web_sm')
sentiment_lexicon = affinity_scores

def calculate_sentiment(text: str = None):
    sent_score = 0
    if text:
        sentence = nlp(text)
        for word in sentence:
            sent_score += sentiment_lexicon.get(word.lemma_, 0)
    return sent_score

In [129]:
def update_recommendation():
  print('\n')
  feed_score=calculate_sentiment(str(input('Please Enter Feedback of the recommendations: ')))
  while(feed_score<0):
    print(top_10_popular)
    print('\n')
    feed_score=calculate_sentiment(str(input('Please Enter Feedback of the recommendations: ')))
    print('\n')
  print('Thank you for Your feed back')

In [140]:
#Final Recommendation System
def recommendation_system():
  print('Select Type of recommendation')
  print('Select 1 for product based,2 for user based,3 for popularity based recommendation system \n')
  user_selection=int(input('Enter Selection here: '))
  if user_selection==1:
    product_name=str(input('Please Enter the Product Name: '))
    try:
      output=product_recommender(product_name)
    except:
      output=top_10_popular
    print(output)
    update_recommendation()
  elif user_selection==2:
    user_id=int(input('Please Enter the User Id: '))
    print('\n')
    try:
      output=nn_recommend_user(user_id)
    except:
      output=top_10_popular
    print('\n')
    print(output)
    update_recommendation()
  elif user_selection==3:
    print(top_10_popular)

  else:
    print('Wrong Selections')

In [142]:
recommendation_system()

Select Type of recommendation
Select 1 for product based,2 for user based,3 for popularity based recommendation system 

Enter Selection here: 3
['Banana' 'Bag of Organic Bananas' 'Organic Strawberries'
 'Organic Baby Spinach' 'Organic Hass Avocado' 'Organic Avocado'
 'Large Lemon' 'Organic Whole Milk' 'Organic Raspberries' 'Strawberries']


In [104]:
user_based.index

Int64Index([    17,     27,     31,     50,     54,     63,     71,     75,
                86,     90,
            ...
            206154, 206165, 206174, 206187, 206193, 206199, 206200, 206201,
            206206, 206208],
           dtype='int64', name='user_id', length=36576)

In [9]:
product_based.index

Index(['#2 Coffee Filters', '0% Fat Blueberry Greek Yogurt',
       '0% Fat Free Organic Milk', '0% Fat Organic Greek Vanilla Yogurt',
       '0% Fat Strawberry Greek Yogurt', '0% Fat Superfruits Greek Yogurt',
       '0% Greek Strained Yogurt',
       '0% Greek Yogurt Black Cherry on the Bottom',
       '0% Greek, Blueberry on the Bottom Yogurt', '1 % Lowfat Milk',
       ...
       'Zucchini Squash', '\"Mokaccino\" Milk + Blue Bottle Coffee Chocolate',
       'for Tots Apple Juice', 'gel hand wash sea minerals',
       'of Hanover 100 Calorie Pretzels Mini',
       'smartwater® Electrolyte Enhanced Water',
       'vitaminwater® XXX Acai Blueberry Pomegranate',
       'with Crispy Almonds Cereal', 'with Olive Oil Mayonnaise',
       'with Olive Oil Mayonnaise Dressing'],
      dtype='object', name='product_name', length=12522)