# Recommendation system with TF-IDF Model
# Table of contents
- [1 - Packages](#1)
- [2 - Load and Preprocess data](#2)
- [3 - Define model](#3)
- [4 - Results](#4)

# 1 - Packages <a name="1"></a>

In [3]:
import numpy as np 
import pandas as pd 
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import os
import pickle

# 2 - Load and Preprocess data <a name="2"></a>

In [148]:
# Read the pre-processed review dump into `data`, then show its size and the first rows.
# Alternative location that was used previously: "/content/Processed_all_datas.csv"
csv_path = "/kaggle/input/processed-all-datas/Processed_all_datas.csv"
data = pd.read_csv(csv_path)
print(data.shape)
data.head()



(8600, 13)


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_date
0,US,51632887,R3B581VNKYCP16,B00HFPOXM4,114966677,Garmin Vivofit Fitness Band,Wireless,5.0,5854.0,6063.0,N,Y,2014-03-12
1,US,49452274,RT0KPKVSQD0HI,B00A17IAO0,979081054,UP by Jawbone Wristband,Wireless,5.0,4856.0,5029.0,N,Y,2012-11-27
2,US,44086587,RYBUAAD9JZ1VW,B001S2RCWI,835787812,Garmin Portable Friction Dashboard Mount,Wireless,5.0,4017.0,4048.0,N,Y,2009-06-14
3,US,49452274,R2Z0F95XGL71C6,B00GOGV314,190508754,"UP24 by Jawbone Wristband, Retail Packaging",Wireless,5.0,3914.0,4022.0,N,N,2013-12-08
4,US,18464808,RZ0J3PVMPU4CJ,B00DGEGJ02,212863722,"Wemo Wi-Fi enabled, Works with Amazon Alexa",Wireless,1.0,3667.0,3894.0,N,Y,2013-11-01


Remove rows with missing values and duplicate products

In [150]:
# Keep only rows where every text field is present, then keep the first row per product.
required_columns = ['marketplace','product_parent','product_title','product_category']
data = (
    data
    .dropna(subset=required_columns, axis=0)
    .drop_duplicates(subset=['product_id'])
)
print(data.shape)

(5946, 13)


# 3 - Define model <a name="3"></a>

In [211]:
# Public import path: `numpy.lib.type_check` is a private submodule that is not
# importable on NumPy 2.x. The name is not used in the code below; it is only
# kept in scope in case another cell relies on it.
from numpy import real_if_close
class model_data():
  """Cleaned product metadata plus the TF-IDF similarity matrix built from it.

  Attributes set by `train` (all `None` until then):
    olddata: untouched copy of the frame passed to `train`.
    data: cleaned frame (rows with missing text fields removed, one row per
      `product_id`, fresh 0..n-1 index) with a `combined` text column; the
      `marketplace`, `product_parent` and `product_category` columns are dropped.
    cosine_similarities: square matrix returned by `linear_kernel` on the
      TF-IDF matrix of `data["combined"]`; row/column i refers to row i of `data`.
  """

  # Columns that must be non-null and that are merged into the `combined` text.
  TEXT_COLUMNS = ('marketplace', 'product_parent', 'product_title', 'product_category')

  def __init__(self):
    self.olddata = None
    self.data = None
    self.cosine_similarities = None

  def cos_sim(self):
    """Fit TF-IDF on `self.data["combined"]` and store the pairwise similarities.

    TfidfVectorizer L2-normalises rows by default, so the linear kernel of the
    matrix with itself is the cosine similarity.
    """
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(self.data["combined"])
    self.cosine_similarities = linear_kernel(matrix, matrix)

  def train(self, data):
    """Clean `data`, build the `combined` text column and compute similarities.

    The caller's frame is NOT modified: every cleaning step works on a new frame.

    Args:
      data: DataFrame containing at least `product_id` and the TEXT_COLUMNS.

    Raises:
      ValueError: if no row survives the missing-value / duplicate filtering.
    """
    self.olddata = data.copy()

    # dropna/drop_duplicates/reset_index each return a new frame, so the
    # argument passed by the caller stays exactly as it was.
    cleaned = (
        data
        .dropna(subset=list(self.TEXT_COLUMNS), axis=0)
        .drop_duplicates(subset=['product_id'])
        .reset_index(drop=True)
    )
    if cleaned.empty:
      raise ValueError("no rows left to train on after dropping missing values and duplicates")

    # Stringify every text field and strip everything that is neither a word
    # character nor whitespace (punctuation, symbols).
    for column in self.TEXT_COLUMNS:
      cleaned[column] = [re.sub(r'[^\w\s]', '', str(value)) for value in cleaned[column]]

    # product_parent and product_category are repeated on purpose in the
    # concatenation, which raises their term frequency in the TF-IDF document.
    cleaned["combined"] = (
        cleaned['marketplace'] + '  ' + cleaned['product_parent'] + ' '
        + cleaned['product_title'] + ' ' + cleaned['product_category'] + '  '
        + cleaned['product_parent'] + ' ' + cleaned['product_category'] + '  '
        + cleaned['product_parent']
    )

    self.data = cleaned.drop(['marketplace', 'product_parent', 'product_category'], axis=1)
    self.cos_sim()





class Content_Based_Model():
  """Content-based recommender backed by a `model_data` instance.

  `self.modeldata` is `None` until `train` or `load` has been called; every
  other method requires it to be set.
  """

  def __init__(self):
    self.modeldata = None

  def train(self,data):
    """Build a fresh `model_data` from `data` and compute its similarity matrix."""
    self.modeldata = model_data()
    self.modeldata.train(data)

  def export_weight(self,path):
    """Pickle `[olddata, data]` to `path`.

    The similarity matrix is not stored; `load` recomputes it.
    """
    # `with` guarantees the handle is flushed and closed even if dumping fails.
    with open(path, 'wb') as file:
      pickle.dump([self.modeldata.olddata,self.modeldata.data],file,protocol=0)

  def load(self,path):
    """Restore the frames written by `export_weight` and recompute similarities.

    SECURITY: `pickle.load` can execute arbitrary code embedded in the file.
    Only load files that you created yourself or otherwise trust.
    """
    with open(path, 'rb') as file:
      real_list = pickle.load(file)
    self.modeldata = model_data()
    self.modeldata.olddata=real_list[0]
    self.modeldata.data=real_list[1]
    self.modeldata.cos_sim()

  def get_combined_data(self):
    """Return the `combined` text column that TF-IDF is fitted on."""
    return self.modeldata.data["combined"]

  def recommender_to_csv(self,title,limit,filename):
    """Write the result of `content_recommender(title, limit)` to `filename` as CSV."""
    predicted=self.content_recommender(title,limit)
    pd.DataFrame(predicted).to_csv(filename)

  def content_recommender(self,title,limit):
    """Return the rows of `olddata` for the products most similar to `title`.

    Args:
      title: the `product_id` to look up (the parameter name is historical).
      limit: maximum number of products to recommend; values <= 0 yield an
        empty result.

    Returns:
      DataFrame: inner merge of the recommended product IDs with `olddata`,
      most similar product first. `olddata` is the raw training frame, so a
      product that occurs several times there contributes several rows.

    Raises:
      KeyError: if `title` is not a known `product_id`.
    """
    product_ids = self.modeldata.data['product_id']

    # Positional lookup: similarity rows are positional, and taking the first
    # match keeps `idx` a scalar even if the frame contains a repeated ID.
    matches = np.flatnonzero(product_ids.eq(title).to_numpy())
    if matches.size == 0:
      raise KeyError(title)
    idx = int(matches[0])

    scores = np.asarray(self.modeldata.cosine_similarities[idx])
    # Stable sort on the negated scores = descending order with ties kept in
    # row order (same ordering as a stable `sorted(..., reverse=True)`).
    ranking = np.argsort(-scores, kind="stable")
    # Drop the query product explicitly. Skipping "the first entry" is wrong
    # whenever an earlier row ties with the query's self-similarity.
    ranking = ranking[ranking != idx]

    product_indices = ranking[:max(int(limit), 0)]
    p_recommend = pd.merge(product_ids.iloc[product_indices], self.modeldata.olddata)
    return p_recommend


# 4 - Results <a name="4"></a>

In [212]:
# Fit the recommender on the de-duplicated frame and persist its data frames to disk.
models = Content_Based_Model()
models.train(data=data)
models.export_weight(path="model_tfidf.h4")


In [213]:
# Start from a fresh instance and restore it from the exported file to check the
# round trip, then preview the text that TF-IDF is fitted on.
models = Content_Based_Model()
models.load(path="model_tfidf.h4")
combined_text = models.get_combined_data()
combined_text.head(10)

0    US  114966677 Garmin Vivofit Fitness Band Wire...
1    US  979081054 UP by Jawbone Wristband Wireless...
2    US  835787812 Garmin Portable Friction Dashboa...
3    US  190508754 UP24 by Jawbone Wristband Retail...
4    US  212863722 Wemo WiFi enabled Works with Ama...
5    US  97226392 LG Tone Wireless Bluetooth Stereo...
6    US  26681904 Garmin Nuvi 2595LMT Wireless  266...
7    US  526049974 Misfit Shine Activity Monitor Wi...
8    US  778294373 TomTom XXL 540TM 5Inch Widescree...
9    US  164315283 Garmin nuvi 1490LMT 5Inch Blueto...
Name: combined, dtype: object

In [214]:
# Ask for up to 16 products similar to product ID B00166N6SA.
models.content_recommender(title="B00166N6SA", limit=16)


Unnamed: 0,product_id,marketplace,customer_id,review_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_date
0,B00009WNZA,US,41768566,R35Y5RH3IHI1RZ,171514791,The Sims 2,Video Games,5.0,1411.0,1503.0,N,N,2003-09-11
1,B002I0KOLA,US,51406859,R1FAZQ008FL8PG,876522540,The Sims Medieval,Video Games,4.0,610.0,631.0,N,Y,2011-03-23
2,B0002HTAYS,US,30412978,R3MSZ5HXX1LF08,67580798,The Sims 2 (Special DVD Edition),Video Games,5.0,759.0,782.0,N,N,2004-09-17
3,B00EFRN2IQ,US,32039478,R326VV67XAHBH0,793022980,Sims 4,Video Games,2.0,1720.0,1851.0,N,N,2014-09-02
4,B007CM0K86,US,19902055,R320J8FLB7RU5H,18715758,The Last of Us,Video Games,5.0,786.0,868.0,N,N,2013-06-16
5,B00JK00S0S,US,11109353,R36S909XPQ8R1E,715643191,The Last of Us,Video Games,5.0,575.0,642.0,N,N,2014-07-29
6,B002I0HJZO,US,14124631,R3U592MEECO32W,28899677,Battlefield 3,Video Games,1.0,852.0,1088.0,N,Y,2011-10-31
7,B000084318,US,50562473,R2ZE4SNBL9CIFD,731692291,The Legend of Zelda: The Wind Waker,Video Games,5.0,620.0,647.0,N,N,2003-01-14
8,B0009VXAM0,US,52759271,R2BOJDNW0G80JJ,320220176,PlayStation 3,Video Games,5.0,2403.0,2763.0,N,N,2006-11-18
9,B00004SQPD,US,49837039,R3RTNVH0DY66U6,46715088,PlayStation 2,Video Games,5.0,999.0,1056.0,N,N,2004-10-25


In [215]:
# Same query as above, written to a CSV file instead of being displayed.
models.recommender_to_csv(title="B00166N6SA", limit=16, filename="My_Page.csv")