In [1]:
# import pandas
import pandas as pd

# import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# import data
df = pd.read_csv("https://raw.githubusercontent.com/nikitaa30/Content-based-Recommender-System/master/sample-data.csv")

In [4]:
df.head()

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."


* explore DataFrame

In [5]:
df.shape

(500, 2)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           500 non-null    int64 
 1   description  500 non-null    object
dtypes: int64(1), object(1)
memory usage: 7.9+ KB


We will be using Tf-Idf to find similar items based on description
* instantiate TF-IDF

In [8]:
tfidf = TfidfVectorizer(stop_words='english')

* fit and transform 'description' column with TFIDF

In [9]:
tfidf_matrix = tfidf.fit_transform(df.description)

In [10]:
tfidf_matrix.shape

(500, 4600)

* calculate the cosine similarity of each item with every other item in the dataset, 

In [11]:
cosineSim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [12]:
cosineSim.shape

(500, 500)

In [14]:
cosineSim

array([[1.        , 0.31005145, 0.18891957, ..., 0.14812382, 0.18521397,
        0.20070706],
       [0.31005145, 1.        , 0.57514356, ..., 0.11131481, 0.2053139 ,
        0.18008906],
       [0.18891957, 0.57514356, 1.        , ..., 0.10043647, 0.12778935,
        0.14410777],
       ...,
       [0.14812382, 0.11131481, 0.10043647, ..., 1.        , 0.11674521,
        0.14302157],
       [0.18521397, 0.2053139 , 0.12778935, ..., 0.11674521, 1.        ,
        0.57835324],
       [0.20070706, 0.18008906, 0.14410777, ..., 0.14302157, 0.57835324,
        1.        ]])

In [44]:
results = dict()
X = range(1,500)
idx = 1
for i in cosineSim:
    results[idx] = [x for _,x in sorted(zip(i,X))]
    idx+=1

In [52]:
len(results[3])

499

In [57]:
results[500][:10]

[371, 332, 490, 319, 48, 69, 396, 238, 455, 434]

* sort all items using their similarity for each item i, and store the values in dictionary `results`

```
results = {
    "1": [5,7,9...],
    "2": [45,2,3...]
}
```

* create function `recommender` that will recommend similar products
    * function must have two input params: **item_id** and **count** of similar products 

In [58]:
def recommender(item_id, count):
    return results[item_id][:count]

* show top 5 the most similar items for item with idem_id = 11

In [60]:
recommender(11,5)

[371, 332, 490, 49, 319]