## Content based filtering

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [14]:
data = {
    "title": [
        "The Matrix",
        "Inception",
        "Interstellar",
        "The Dark Knight",
        "Forrest Gump",
        "The Godfather",
        "Pulp Fiction",
        "The Avengers",
        "Titanic",
        "Shutter Island"
    ],
    "genre": [
        "Sci-Fi",
        "Sci-Fi",
        "Sci-Fi",
        "Action",
        "Drama",
        "Crime",
        "Crime",
        "Action",
        "Romance",
        "Thriller"
    ]
}
expanded_genres = {
    "Sci-Fi": "Science Fiction",
    "Action": "Action & Adventure",
    "Drama": "Drama & Biography",
    "Crime": "Crime & Mystery",
    "Romance": "Romantic Drama",
    "Thriller": "Psychological Thriller"
}


movies = pd.DataFrame(data)
movies["genre"] = movies["genre"].map(expanded_genres)

In [15]:
movies

Unnamed: 0,title,genre
0,The Matrix,Science Fiction
1,Inception,Science Fiction
2,Interstellar,Science Fiction
3,The Dark Knight,Action & Adventure
4,Forrest Gump,Drama & Biography
5,The Godfather,Crime & Mystery
6,Pulp Fiction,Crime & Mystery
7,The Avengers,Action & Adventure
8,Titanic,Romantic Drama
9,Shutter Island,Psychological Thriller


In [18]:
tf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tf.fit_transform(movies['genre'])

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

print(cosine_sim_matrix)

[[1.         1.         1.         0.         0.         0.
  0.         0.         0.         0.        ]
 [1.         1.         1.         0.         0.         0.
  0.         0.         0.         0.        ]
 [1.         1.         1.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         1.         0.         0.
  0.         1.         0.         0.        ]
 [0.         0.         0.         0.         1.         0.
  0.         0.         0.41950082 0.        ]
 [0.         0.         0.         0.         0.         1.
  1.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         1.
  1.         0.         0.         0.        ]
 [0.         0.         0.         1.         0.         0.
  0.         1.         0.         0.        ]
 [0.         0.         0.         0.         0.41950082 0.
  0.         0.         1.         0.        ]
 [0.         0.         0.         0.

In [23]:
tfidf_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.70710678, 0.        , 0.        , 0.        , 0.70710678,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.70710678, 0.        , 0.        , 0.        , 0.70710678,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.70710678, 0.        , 0.        , 0.        , 0.70710678,
        0.        ],
       [0.70710678, 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.76190497, 0.        , 0.64768883,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.70710678, 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.70710678, 0.

In [29]:
#recommend based on the index
def recommend(index):
  sim_scores = list(enumerate(cosine_sim_matrix[index]))
  sim_scores = sorted(sim_scores,key= lambda x:x[1],reverse = True)
  recs = [movies.iloc[i]['title'] for i, _ in sim_scores[1:4]]
  return recs

print("recommendations for Interstellar",recommend(2))

recommendations for Interstellar ['Inception', 'Interstellar', 'The Dark Knight']


## User Collaberative based filtering


In [30]:
import numpy as np

In [31]:
#user item rating matix
ratings = np.array([
    [5,4,0,0],
    [4,0,0,2],
    [0,0,4,4],
    [2,2,0,5]
])

In [32]:
#cosine similarity between users
from sklearn.metrics.pairwise import cosine_similarity
user_sim = cosine_similarity(ratings)

user_sim

array([[1.        , 0.6984303 , 0.        , 0.48935452],
       [0.6984303 , 1.        , 0.31622777, 0.70064905],
       [0.        , 0.31622777, 1.        , 0.61545745],
       [0.48935452, 0.70064905, 0.61545745, 1.        ]])