In [3]:
import pandas as pd
import numpy as np
import ast  # For converting string representation of lists to actual lists
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Load the two raw CSV files (read from the current working directory)
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

# Merge the datasets on 'title'.
# NOTE(review): this assumes titles are unique in both files; a repeated title
# would pair every movies row with every credits row of that name. Confirm
# against the data, or merge on the id columns instead.
movies = movies.merge(credits, on='title')

# Keep only the columns that are used further down
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Drop rows with missing values, then renumber the index 0..n-1.
# The similarity matrix computed later is addressed by row position, so index
# labels must equal positions for `df[df['title'] == t].index[0]` style
# lookups to select the intended row.
movies = movies.dropna().reset_index(drop=True)

# Function to convert JSON-like strings into list of relevant names
def convert(obj):
    """Parse a stringified list of dicts and return each entry's 'name'.

    Args:
        obj: Text holding a Python-literal list of dicts, for example
            '[{"id": 28, "name": "Action"}]'.

    Returns:
        The 'name' values in their original order, or an empty list when
        `obj` cannot be parsed or does not evaluate to an iterable of dicts
        that all carry a 'name' key.
    """
    try:
        entries = ast.literal_eval(obj)
        return [entry['name'] for entry in entries]
    except (
        ValueError,
        SyntaxError,
        TypeError,
        KeyError,
        MemoryError,
        RecursionError,
    ):
        # Best-effort: an unusable cell becomes an empty list. Only the errors
        # that parsing or the 'name' lookup can raise are caught, so
        # KeyboardInterrupt and SystemExit still propagate.
        return []

# Turn the raw genre and keyword cells into plain lists of names
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)

# Function to extract top 3 cast members
def get_top_3_cast(obj):
    """Return the 'name' of the first three entries of a stringified cast list.

    Args:
        obj: Text holding a Python-literal list of dicts, for example
            '[{"name": "Sam Worthington"}, {"name": "Zoe Saldana"}]'.

    Returns:
        Up to three names in their original order, or an empty list when
        `obj` cannot be parsed, cannot be sliced, or one of the first three
        entries has no 'name' key.
    """
    try:
        cast = ast.literal_eval(obj)
        return [member['name'] for member in cast[:3]]
    except (
        ValueError,
        SyntaxError,
        TypeError,
        KeyError,
        MemoryError,
        RecursionError,
    ):
        # Best-effort: an unusable cell becomes an empty list. Only the errors
        # that parsing, slicing or the 'name' lookup can raise are caught, so
        # KeyboardInterrupt and SystemExit still propagate.
        return []

# Replace every raw cast cell with the names get_top_3_cast extracts from it
movies['cast'] = movies['cast'].map(get_top_3_cast)

# Function to extract director name
def get_director(obj):
    """Return the name of the first crew entry whose 'job' is 'Director'.

    Args:
        obj: Text holding a Python-literal list of dicts, each expected to
            have 'job' and 'name' keys.

    Returns:
        A one-element list with the director's name, or an empty list when
        no entry has the job 'Director', when `obj` cannot be parsed, or when
        an entry examined before a match lacks the 'job' or 'name' key.
    """
    try:
        crew = ast.literal_eval(obj)
        for member in crew:
            if member['job'] == 'Director':
                # Only the first director is kept; later ones are ignored.
                return [member['name']]
    except (
        ValueError,
        SyntaxError,
        TypeError,
        KeyError,
        MemoryError,
        RecursionError,
    ):
        # Best-effort: an unusable cell becomes an empty list. Only the errors
        # that parsing or the key lookups can raise are caught, so
        # KeyboardInterrupt and SystemExit still propagate.
        return []
    return []

# Replace every raw crew cell with the list get_director returns for it
movies['crew'] = movies['crew'].apply(get_director)

# Break each overview into a list of whitespace-separated words
movies['overview'] = movies['overview'].apply(lambda text: text.split())

# Strip the spaces inside every name so that a multi-word name becomes a
# single token
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )

# Work on a copy so that `movies` itself keeps its list-valued columns
new_df = movies.copy()

# Concatenate the five token lists of each row into a single list ...
new_df['tags'] = (
    new_df['overview']
    + new_df['genres']
    + new_df['keywords']
    + new_df['cast']
    + new_df['crew']
)

# ... then flatten it into one space-separated, lowercase string
new_df['tags'] = new_df['tags'].apply(lambda tokens: " ".join(tokens).lower())

# Single Porter stemmer instance shared by every call to stem()
ps = PorterStemmer()

def stem(text):
    """Return `text` with each whitespace-separated word replaced by its stem."""
    stemmed_words = map(ps.stem, text.split())
    return " ".join(stemmed_words)

new_df['tags'] = new_df['tags'].map(stem)

# Turn the stemmed tags into word-count vectors: English stop words are
# dropped and the vocabulary is capped at 5000 features
cv = CountVectorizer(stop_words='english', max_features=5000)
vectors = cv.fit_transform(new_df['tags']).toarray()

# Pairwise cosine similarity between the rows of `vectors`
similarity = cosine_similarity(vectors)

# Function to recommend movies
def recommend(movie):
    """Print the titles of the five movies most similar to `movie`.

    Reads the module-level `new_df` (for titles) and `similarity` (for the
    pairwise scores). When several rows share the title, the first one is
    used.

    Args:
        movie: Exact title to look up in ``new_df['title']``.

    Raises:
        IndexError: If no row of `new_df` has that title.
    """
    # Look the movie up by row *position*, not by index label: `similarity`
    # is addressed positionally, and the two differ whenever the index of
    # `new_df` is not a gap-free 0..n-1 range (e.g. after rows were dropped).
    matches = np.flatnonzero((new_df['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie {movie!r} not found in new_df['title']")
    movie_pos = int(matches[0])

    distances = similarity[movie_pos]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)

    # Exclude the queried row explicitly instead of assuming it sorts first;
    # a tie on the top score would otherwise drop the wrong entry.
    closest = [pos for pos, _ in ranked if pos != movie_pos][:5]

    for pos in closest:
        print(new_df['title'].iloc[pos])

# Save the model and data.
# Each file is opened in a `with` block so its handle is flushed and closed
# as soon as the dump finishes, rather than being left open.
with open('movies.pkl', 'wb') as f:
    pickle.dump(new_df, f)
with open('similarity.pkl', 'wb') as f:
    pickle.dump(similarity, f)

# The same table again, this time as the dict produced by DataFrame.to_dict()
with open('movie_dict.pkl', 'wb') as f:
    pickle.dump(new_df.to_dict(), f)

# NOTE(review): bare expression; the title array is evaluated and discarded.
# Presumably left over from interactive inspection.
new_df['title'].values

# Example usage
recommend("Avatar")




Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.
