In [1]:
# import pandas
import pandas as pd

# import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# import data
df = pd.read_csv("https://raw.githubusercontent.com/nikitaa30/Content-based-Recommender-System/master/sample-data.csv")

In [3]:
df

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."
...,...,...
495,496,Cap 2 bottoms - Cut loose from the maddening c...
496,497,Cap 2 crew - This crew takes the edge off fick...
497,498,All-time shell - No need to use that morning T...
498,499,All-wear cargo shorts - All-Wear Cargo Shorts ...


* explore DataFrame

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           500 non-null    int64 
 1   description  500 non-null    object
dtypes: int64(1), object(1)
memory usage: 7.9+ KB


In [7]:
print(df['description'].sample(1))

86    Merino 1 crew - In a day packed with decisions...
Name: description, dtype: object


We will be using Tf-Idf to find similar items based on description
* instantiate TF-IDF

In [8]:
tfidf = TfidfVectorizer(stop_words='english')

* fit and transform 'description' column with TFIDF

In [10]:
my_matrix = tfidf.fit_transform(df['description'])

In [11]:
my_matrix.shape

(500, 4816)

* calculate the cosine similarity of each item with every other item in the dataset, 

In [34]:
my_cosine = cosine_similarity(my_matrix, my_matrix)

In [35]:
my_cosine

array([[1.        , 0.32792053, 0.20819843, ..., 0.17696975, 0.20143942,
        0.22598052],
       [0.32792053, 1.        , 0.5673509 , ..., 0.12925175, 0.21139731,
        0.19396413],
       [0.20819843, 0.5673509 , 1.        , ..., 0.13509939, 0.14185763,
        0.15717399],
       ...,
       [0.17696975, 0.12925175, 0.13509939, ..., 1.        , 0.14187074,
        0.17045334],
       [0.20143942, 0.21139731, 0.14185763, ..., 0.14187074, 1.        ,
        0.55846363],
       [0.22598052, 0.19396413, 0.15717399, ..., 0.17045334, 0.55846363,
        1.        ]])

* sort all items using their similarity for each item i, and store the values in dictionary `results`

```
results = {
    "1": [5,7,9...],
    "2": [45,2,3...]
}
```

In [37]:
my_cosine.shape

(500, 500)

In [38]:
results = {} #empty dict to append results

for i in range(len(my_cosine)): #loop over indicies in range of length of my_cosine
    results[str(i+1)] = list(my_cosine[i]) # append number as string to key and assign correspong values as list


print(len(results)) #check to see if len is till same

500


* create function `recommender` that will recommend similar products
    * function must have two input params: **item_id** and **count** of similar products 

In [39]:
# recommender function to recommend similar products

def recommender(item_id, count):
    # need to get index
    idx = [item_id]

    #get pairwise similarity with id
    similarity_scores = list(enumerate(my_cosine[idx]))
    
    #sort based on similarit
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    #get scores of count
    similarity_scores = similarity_scores[:count+1]

    #get product similarity
    similar_ids = [i[0] for i in similarity_scores]

    #return num products that are equal to count
    top_results = df['id'].iloc[similar_ids]
    return top_results

* show top 5 the most similar items for item with idem_id = 11

In [41]:
#something went wrong need more, start again
recommender(item_id=11, count=5)

0    1
Name: id, dtype: int64