<a href="https://colab.research.google.com/github/Timoh-top/movie_recommender_streamlit/blob/main/movie_recommender_streamlit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# importing the needed libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast

In [None]:
# Load the TMDB 5000 movies dataset from the Colab working directory.
# The location is kept in one named constant so it is easy to find and change.
DATA_PATH = "/content/tmdb_5000_movies.csv"
movies = pd.read_csv(DATA_PATH)

In [None]:
# Inspect the loaded dataset: info() prints each column's dtype and non-null
# count; head() is the cell's last expression, so the first rows are displayed.
movies.info()
movies.head()

In [None]:
# Narrow the view to the text columns the recommender is built from.
key_columns = ['title', 'genres', 'overview', 'keywords']
movies[key_columns].info()
movies[key_columns].head()

In [None]:
# Replace missing overviews with empty strings so that the later string
# concatenation never produces NaN for a movie without a description.
overview_filled = movies['overview'].fillna('')
movies['overview'] = overview_filled
movies["overview"].info()

**DATA PREPROCESSING**

In [None]:
import ast

def parse_features(x):
    """Flatten a stringified list of ``{'name': ...}`` dicts into one string.

    Parameters
    ----------
    x : str
        Python-literal text such as ``'[{"id": 28, "name": "Action"}]'``.

    Returns
    -------
    str
        The ``name`` values joined by single spaces, or ``''`` when ``x``
        cannot be parsed into that structure (e.g. NaN, malformed text, or
        entries without a ``name`` key).
    """
    try:
        items = ast.literal_eval(x)
        return ' '.join(item['name'] for item in items)
    # Catch only parsing / structure failures. A bare ``except`` would also
    # swallow KeyboardInterrupt and SystemExit and hide unrelated bugs.
    except (ValueError, SyntaxError, TypeError, KeyError, RecursionError):
        return ''

In [None]:
# Turn the stringified genre and keyword lists into plain space-separated text.
for source_col in ('genres', 'keywords'):
    movies[source_col + '_parsed'] = movies[source_col].apply(parse_features)

# Build one text field per movie: overview, then genres, then keywords.
movies['combined_features'] = (
    movies['overview']
    + ' ' + movies['genres_parsed']
    + ' ' + movies['keywords_parsed']
)
movies['combined_features'].head()

**TF-IDF VECTORIZATION**

In [None]:
# Vectorizer settings: drop English stop words, use unigrams and bigrams,
# and cap the vocabulary at the 10,000 most frequent terms.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
    stop_words='english',
)

# Learn the vocabulary and encode every movie's combined text in one pass.
tfidf_matrix = tfidf.fit_transform(movies['combined_features'])

# Shape is (number of movies, number of terms kept).
print(tfidf_matrix.shape)

In [None]:
# Pairwise cosine similarity between all movies. Called with a single
# argument, cosine_similarity compares the matrix against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
# Recommendation function
# Map each title to its row position in `movies` (and so in `cosine_sim`).
# Duplicate titles are collapsed to their first occurrence. Note that
# Series.drop_duplicates() would not do this: it de-duplicates the *values*
# (row positions, already unique), leaving repeated titles in the index.
indices = pd.Series(np.arange(len(movies)), index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]


def recommend_movies(title, cosine_sim=cosine_sim, n=10):
    """Return the titles of the ``n`` movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact movie title as it appears in ``movies['title']``. When several
        movies share a title, the first one in the dataset is used.
    cosine_sim : array-like
        Pairwise similarity matrix whose rows and columns follow the row
        order of ``movies``. Defaults to the matrix computed in this notebook.
    n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar, never including the
        queried movie itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in the dataset.
    """
    if title not in indices.index:
        raise KeyError(f"Movie title not found in dataset: {title!r}")
    idx = int(indices[title])

    scores = np.asarray(cosine_sim[idx])
    # Stable descending sort: tied scores keep ascending row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the queried movie by position instead of assuming it sorts
    # first; a tie at the top score (or an all-zero feature vector) would
    # otherwise drop a real recommendation and keep the query itself.
    ranked = ranked[ranked != idx]
    top = ranked[:max(n, 0)]
    return movies['title'].iloc[top]

# Example test
recommend_movies('The Dark Knight')

In [None]:
# Second spot check: recommendations for a different title.
recommend_movies('Titanic')