# Representing Words as vectors

In [None]:
!pip install gensim sentence-transformers

# 01. Word2Vec

### Words become numbers

In [None]:
import gensim.downloader as api

# Load the pretrained vectors registered under the name
# "glove-wiki-gigaword-100" through gensim's downloader module.
# NOTE(review): despite the variable name, the identifier refers to GloVe
# vectors rather than a Word2Vec model; presumably 100-dimensional — the shape
# printed in the next cell confirms it.
word2vec = api.load("glove-wiki-gigaword-100")

In [None]:
# Look up the embedding stored for a single word, then show its shape followed
# by the raw numbers.
vector = word2vec["king"]

print(f"Vector shape: {vector.shape}")
print(vector)

### Let's check semantics

In [None]:
# Similarity score the model assigns to the word pair "king" / "queen".
word2vec.similarity("king", "queen")

In [None]:
# Same query for the pair "king" / "car", for comparison with the previous cell.
word2vec.similarity("king", "car")

In [None]:
# The five entries (topn=5) the model ranks as most similar to "king".
word2vec.most_similar("king", topn=5)

In [None]:
# Vector arithmetic: take the "king" vector, subtract "boy" and add "girl",
# then list the five entries ranked closest to the resulting vector.
# NOTE(review): the input words are not filtered out of the results here, so
# "king" itself may appear in the list — confirm that is intended.
vec = word2vec["king"] - word2vec["boy"] + word2vec["girl"]
word2vec.similar_by_vector(vec, topn=5)

## 02. Sentence embeddings

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Prefer the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Available device: {device}")

# Load the pretrained sentence-embedding model exactly once, on the selected
# device. (A second, device-less instantiation used to follow and silently
# replaced this instance, loading the model twice.)
embedding_model = SentenceTransformer(
    "all-MiniLM-L6-v2",
    device=device
)

In [None]:
# Encode a single sentence; the cell output is the embedding returned for it.
embedding_model.encode("I Like computer science")

# 03. Recommendation System

### Understand Data

In [None]:
import pandas as pd

# Read the movie dataset from a relative path (one directory above the
# notebook's working directory) and preview the first rows.
df = pd.read_csv("../datasets/movies.csv")
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Column labels available in the dataset.
df.columns

In [None]:
def combine_fields(row):
    """Merge one movie's metadata into a single descriptive string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'title', 'director', 'genres', 'actors' and 'description'.

    Returns:
        str: "<title> directed by <director>. Genre: <genres>. "
        "Starring: <actors>. <description>".
    """
    title = row['title']
    director = row['director']
    genres = row['genres']
    actors = row['actors']
    description = row['description']

    # Each piece carries its own trailing separator, so a plain concatenation
    # yields the final sentence layout.
    pieces = [
        f"{title} directed by {director}. ",
        f"Genre: {genres}. ",
        f"Starring: {actors}. ",
        f"{description}",
    ]
    return "".join(pieces)

# Add a 'full_text' column by calling combine_fields on every row (axis=1).
df['full_text'] = df.apply(combine_fields, axis=1)

In [None]:
# Preview the frame again, now including the 'full_text' column.
df.head()

In [None]:
# Show the combined text stored under index label 0.
df["full_text"][0]

In [None]:
# Embed every movie's combined text. The column is converted to a plain list of
# strings first: encode() is documented to take a string or a list of strings,
# and a list avoids any dependence on the DataFrame's index labels.
desc_vector = embedding_model.encode(df['full_text'].tolist())

# (number of embedded texts, embedding size)
desc_vector.shape

### Model training

In [None]:
from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour search over the movie embeddings using the cosine metric.
# n_neighbors=10 is only the default here; the recommend_* helpers defined
# later pass their own n_neighbors on every query.
knn = NearestNeighbors(n_neighbors=10, metric='cosine')
# fit() is given only the embedding matrix — no labels are involved.
knn.fit(desc_vector)

### Model use

In [None]:
def recommend_movies(id, no_of_movies=5):
    """Recommend movies similar to the movie whose ``id`` column equals ``id``.

    Args:
        id: Value to look up in the ``id`` column of ``df``. (The name shadows
            the built-in ``id``; it is kept so existing callers keep working.)
        no_of_movies: Number of recommendations to return.

    Returns:
        DataFrame holding up to ``no_of_movies`` rows of ``df``, in the order
        returned by the neighbour search, never including the selected movie.

    Raises:
        ValueError: If no row of ``df`` has the requested ``id``.
    """
    id_matches = (df["id"] == id).to_numpy()
    if not id_matches.any():
        raise ValueError(f"No movie found with id {id!r}")

    # argmax on a boolean array gives the position of the first True, i.e. the
    # positional index of the selected movie; a single row (a Series) is taken
    # so that the title and text below are scalars.
    query_pos = int(id_matches.argmax())
    movie_row = df.iloc[query_pos]

    print(f"Selected Movie: {movie_row['title']}")
    query_vector = embedding_model.encode([movie_row['full_text']])

    # Request one extra neighbour because the selected movie is normally
    # returned as its own nearest neighbour.
    _, indices = knn.kneighbors(query_vector, n_neighbors=no_of_movies + 1)

    # Remove the selected movie by position instead of assuming it is ranked
    # first (a tie with an identical text could put another row ahead of it).
    neighbor_positions = [pos for pos in indices[0] if pos != query_pos]
    recommended_df = df.iloc[neighbor_positions[:no_of_movies]]

    return recommended_df

# Request 10 recommendations, passing 320 as the movie id.
recommend_movies(320, 10)

In [None]:
def recommend_movies_by_text(text, no_of_movies=5):
    """Recommend movies whose combined text is closest to a free-text query.

    Args:
        text: Query string to embed with ``embedding_model``.
        no_of_movies: Number of neighbours to request from ``knn``.

    Returns:
        DataFrame with the rows of ``df`` at the positions returned by the
        neighbour search for the embedded query.
    """
    embedded_query = embedding_model.encode([text])
    # kneighbors returns (distances, indices); only the indices are needed.
    neighbor_positions = knn.kneighbors(embedded_query, n_neighbors=no_of_movies)[1]
    # One query was submitted, so the first (only) row holds its neighbours.
    return df.iloc[neighbor_positions[0]]

# Request the 10 movies closest to a free-text description.
recommend_movies_by_text("The movie about spider", 10)

### Save recommendation model

In [None]:
import joblib

# Persist the fitted neighbour index and the embedding model under
# ../movie_project/ so they can be reused outside this notebook.
# NOTE(review): joblib.dump writes pickle-based files — presumably the target
# directory must already exist, and such files should only be loaded from
# trusted sources with compatible library versions. TODO confirm that the
# consumer expects these .pkl files before changing the format.
joblib.dump(knn, "../movie_project/knn.pkl")
joblib.dump(embedding_model, "../movie_project/embedding_model.pkl")