# Representing Words as vectors

In [None]:
!pip install scikit-learn sentence-transformers

## 01. Load embedding model
The embedding model encodes text into dense numeric vectors (embeddings).

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Available device: {device}")

# Load the sentence-embedding model a single time, pinned to the device
# printed above so the log line matches where encoding actually runs.
embedding_model = SentenceTransformer(
    "all-MiniLM-L6-v2",
    device=device
)

## 02. Understand Data

In [None]:
import pandas as pd

# Load the movie catalogue into a DataFrame. Later cells read the columns
# `id`, `title`, `genres`, `actors`, `director`, `description` and `rating`.
df = pd.read_csv("/content/movies.csv")
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded dataset.
df.shape

In [None]:
# List the available column names before picking the fields to embed.
df.columns

## 03. Generate vectors for each movie

In [None]:
# To build the vectors, we'll start by combining all our fields into one text column

def combine_fields(row):
    """Build the single text passage that represents one movie.

    Args:
        row: A mapping-like object (e.g. a DataFrame row) providing the keys
            ``genres``, ``actors``, ``director``, ``title`` and
            ``description``.

    Returns:
        A multi-line string with one labelled line per field, in the order
        genres, cast, director, title, description.
    """
    labelled_lines = [
        f"Genres: {row['genres']}.\n",
        f"Starring: {row['actors']}. \n",
        f"Director: {row['director']}. \n",
        f"Title: {row['title']}. \n",
        f"Description: {row['description']}",
    ]
    return "".join(labelled_lines)


# Apply combine_fields row by row (axis=1) and keep the resulting passage in a
# new `full_text` column; this is the text that gets embedded below.
df['full_text'] = df.apply(combine_fields, axis=1)

In [None]:
# Preview the data again to confirm the `full_text` column was added.
df.head()

In [None]:
# Print the combined text of the row labelled 0 to check the formatting.
print(df["full_text"][0])

In [None]:
# Sanity check: embed only the row labelled 0 to inspect a single vector.
embedding_model.encode(df['full_text'][0])

In [None]:
# Embed every movie's combined text in one batch call. The column is converted
# to a plain list of strings first: `encode` is documented for a string or a
# list of strings, and a list is indexed by position regardless of how the
# DataFrame's index is labelled.
desc_vector = embedding_model.encode(df['full_text'].tolist())

# One embedding row per movie: (number of movies, embedding dimension).
desc_vector.shape

In [None]:
# Keep each movie's embedding next to its row (as a plain Python list) so the
# query vector can be looked up directly from the DataFrame later.
df['vector'] = desc_vector.tolist()

In [None]:
# Preview the data to confirm the `vector` column was added.
df.head()

### Model training

In [None]:
from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour index over the movie embeddings. The cosine metric is used,
# so `1 - distance` is later read as a similarity score. n_neighbors=20 is only
# the default neighbour count; the recommend functions pass their own value.
knn = NearestNeighbors(n_neighbors=20, metric='cosine')
# Rows of desc_vector follow the row order of df, so neighbour indices returned
# by the index can be used as positional (iloc) indices into df.
knn.fit(desc_vector)

### Model use

In [None]:
def recommend_movies(id, no_of_movies):
    """Return the movies whose embeddings are closest to the given movie.

    Relies on the module-level ``df`` (movie table with ``id``, ``title`` and
    ``vector`` columns) and ``knn`` (nearest-neighbour index fitted on the
    embeddings in the same row order as ``df``).

    Args:
        id: Value of the ``id`` column identifying the query movie. (The name
            shadows the builtin but is kept for caller compatibility.)
        no_of_movies: Maximum number of recommendations to return.

    Returns:
        An empty list when no row has the requested id; otherwise a copy of
        the recommended rows of ``df``, nearest first, with an added
        ``similarity_score`` column (1 - distance). Fewer than
        ``no_of_movies`` rows are returned when the catalogue is too small.
    """
    is_query = (df["id"] == id).to_numpy()
    if not is_query.any():
        return []

    # Positional index of the first row carrying this id.
    query_pos = int(is_query.argmax())
    movie_row = df.iloc[query_pos]

    print(f"Selected Movie: {movie_row['title']}")
    query_vector = movie_row['vector']

    # Ask for one extra neighbour because the query movie is part of the
    # fitted data, but never for more neighbours than fitted samples
    # (kneighbors raises ValueError in that case).
    n_neighbors = min(no_of_movies + 1, knn.n_samples_fit_)
    distances, indices = knn.kneighbors([query_vector], n_neighbors=n_neighbors)

    # Drop the query movie by position rather than assuming it is the first
    # hit: with duplicate or tied vectors another row can be ranked ahead of it.
    keep = indices[0] != query_pos
    indices = indices[0][keep][:no_of_movies]
    distances = distances[0][keep][:no_of_movies]

    recommended_df = df.iloc[indices].copy()
    recommended_df["similarity_score"] = 1 - distances

    return recommended_df

# Example: request 5 recommendations for the movie whose id is 301.
recommend_movies(301, 5)

In [None]:

def recommend_movies(id, no_of_movies, rating_weight=0.2):
    """Return similar movies, re-ranked so that better-rated titles move up.

    Relies on the module-level ``df`` (movie table with ``id``, ``vector`` and
    ``rating`` columns) and ``knn`` (nearest-neighbour index fitted on the
    embeddings in the same row order as ``df``).

    Args:
        id: Value of the ``id`` column identifying the query movie. (The name
            shadows the builtin but is kept for caller compatibility.)
        no_of_movies: Maximum number of recommendations to return.
        rating_weight: Strength of the rating boost; 0 leaves the ordering
            purely similarity-based.

    Returns:
        An empty list when no row has the requested id; otherwise a copy of
        the recommended rows of ``df`` with an added ``similarity_score``
        column (1 - distance), ordered by the rating-boosted score. Fewer
        than ``no_of_movies`` rows are returned when the catalogue is too
        small.
    """
    is_query = (df["id"] == id).to_numpy()
    if not is_query.any():
        return []

    # Positional index of the first row carrying this id.
    query_pos = int(is_query.argmax())
    query_vector = df.iloc[query_pos]["vector"]

    # Ask for one extra neighbour because the query movie is part of the
    # fitted data, but never for more neighbours than fitted samples
    # (kneighbors raises ValueError in that case).
    n_neighbors = min(no_of_movies + 1, knn.n_samples_fit_)
    distances, indices = knn.kneighbors(
        [query_vector], n_neighbors=n_neighbors
    )

    # Drop the query movie by position rather than assuming it is the first
    # hit: with duplicate or tied vectors another row can be ranked ahead of it.
    keep = indices[0] != query_pos
    indices = indices[0][keep][:no_of_movies]
    distances = distances[0][keep][:no_of_movies]

    recommended_df = df.iloc[indices].copy()
    recommended_df["similarity_score"] = 1 - distances

    # Re-ranking: scale similarity by (1 + weight * normalised rating).
    # NOTE(review): dividing by 10 assumes ratings are on a 0-10 scale - confirm.
    rating_norm = recommended_df["rating"] / 10.0
    recommended_df["boosted_score"] = (
        recommended_df["similarity_score"]
        * (1 + rating_weight * rating_norm)
    )

    recommended_df = recommended_df.sort_values(
        by="boosted_score", ascending=False
    )

    # The boosted score is only a sort key; it is not exposed to callers.
    return recommended_df.drop(columns=["boosted_score"])


### Save recommendation model

In [None]:
import joblib

# Persist both artefacts needed at serving time: the movie table (including the
# `full_text` and `vector` columns) and the fitted nearest-neighbour index.
# They must be saved and loaded together, since neighbour indices returned by
# the index are positions into this exact DataFrame.
# NOTE(review): joblib files are pickle-based - only load them from trusted sources.
joblib.dump(df, "/content/data_frame.pkl")
joblib.dump(knn, "/content/knn_model.pkl")