In [None]:
import ast
import json

import numpy as np
import pandas as pd

# Read the movie table and the credits table from CSV files in the working directory
movie_details = pd.read_csv('tmdb_5000_movies.csv')
credit_details = pd.read_csv('tmdb_5000_credits.csv')

# Preview the first rows of both tables, movies first
print(movie_details.head(), credit_details.head(), sep="\n")

In [None]:
# Report the (rows, columns) shape of each raw table, one per line
print(
    f"Movie details: {movie_details.shape}",
    f"Credit details: {credit_details.shape}",
    sep="\n",
)

In [None]:
# Merge datasets on the movie id rather than the title.
# Titles are not guaranteed to be unique: joining on 'title' pairs every movie
# with the credits of every other movie that shares its title, which duplicates
# rows and attaches the wrong cast/crew to some of them.
# NOTE(review): assumes the movies table is keyed by 'id' and the credits table
# by 'movie_id' (the Kaggle TMDB 5000 layout) — confirm against the CSV headers.
combined_movies = movie_details.merge(
    # Drop the credits copy of 'title' so the merged frame keeps one 'title' column
    credit_details.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
    # Fail loudly if either key turns out not to be unique
    validate='one_to_one',
)
print(combined_movies.head())

In [None]:
# Data overview: print the column names, non-null counts and dtypes of the
# merged frame so missing-value problems are visible before cleaning.
combined_movies.info()

In [None]:
# Count how many merged rows fall under each original-language code
language_counts = combined_movies['original_language'].value_counts()
print("Original language distribution:\n", language_counts)

In [None]:
# Drop rows that are missing any field the recommender actually uses.
# A bare dropna() removes a row when ANY column is null, so it would also throw
# away movies whose only gap is in a column the model never reads.
required_columns = ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
combined_movies = (
    combined_movies
    .dropna(subset=required_columns)
    # Re-number the rows 0..n-1: recommend() later uses the frame's index to
    # address rows of the similarity matrix, so labels must equal positions.
    .reset_index(drop=True)
)

In [None]:
# Count rows that are exact copies of an earlier row
duplicate_count = combined_movies.duplicated().sum()
print(f"Duplicates: {duplicate_count}")

In [None]:
# Convert JSON formatted data into lists of names
def extract_names(json_str):
    """Return the 'name' value of every object in a JSON-encoded list.

    Parameters
    ----------
    json_str : str, list, tuple, None or NaN
        Normally the raw cell text, e.g. '[{"id": 28, "name": "Action"}]'.
        An already-parsed list/tuple is accepted so that re-running the
        conversion cell does not fail. None, NaN and blank strings yield [].

    Returns
    -------
    list
        The names in their original order.

    Raises
    ------
    KeyError
        If a parsed object has no 'name' key.
    ValueError, SyntaxError
        If the text is neither valid JSON nor a Python literal.
    """
    # Missing cell (None or float NaN; NaN is the only value unequal to itself)
    if json_str is None or (isinstance(json_str, float) and json_str != json_str):
        return []

    if isinstance(json_str, (list, tuple)):
        # Already parsed on a previous run of the cell
        data = json_str
    elif isinstance(json_str, str):
        if not json_str.strip():
            return []
        try:
            # Real JSON parser: understands null / true / false, which
            # ast.literal_eval rejects.
            data = json.loads(json_str)
        except ValueError:
            # Fall back for Python-repr text such as "[{'name': 'Drama'}]"
            data = ast.literal_eval(json_str)
    else:
        # Unsupported type: let literal_eval raise as the original code did
        data = ast.literal_eval(json_str)

    # Items that are already plain strings are passed through unchanged
    return [item if isinstance(item, str) else item['name'] for item in data]

# Apply conversion function to every JSON-encoded column.
# 'cast' and 'crew' must be parsed as well: later cells iterate over them as
# lists of names, and iterating over the raw string instead would yield single
# characters and fill the tags with noise.
# NOTE(review): assumes cast/crew entries carry a 'name' key like genres and
# keywords do — confirm against the credits CSV. Every cast and crew name is
# kept; consider restricting to top-billed cast / the director if tags get noisy.
json_columns = ['genres', 'keywords', 'cast', 'crew']
for json_column in json_columns:
    combined_movies[json_column] = combined_movies[json_column].apply(extract_names)
print(combined_movies.head())

In [None]:
# Tokenize each overview into a list of whitespace-separated words
# NOTE(review): the tokenized overview is not part of the 'tag' column built
# further down — confirm whether leaving it out is intentional.
overview_tokens = combined_movies['overview'].map(lambda text: text.split())
combined_movies['overview'] = overview_tokens

In [None]:
# Strip the spaces inside every entry so a multi-word name becomes one token
# for the vectorizer
columns = ['genres', 'cast', 'crew', 'keywords']
for column in columns:
    despaced = combined_movies[column].map(
        lambda names: [name.replace(' ', '') for name in names]
    )
    combined_movies[column] = despaced

In [None]:
# Build one lowercase, space-separated tag string per movie from the
# genre, cast, crew and keyword lists (concatenated in that order)
tag_tokens = combined_movies['genres'] + combined_movies['cast']
tag_tokens = tag_tokens + combined_movies['crew'] + combined_movies['keywords']
combined_movies['tag'] = tag_tokens.map(lambda tokens: ' '.join(tokens).lower())
print(combined_movies.head())

In [None]:
# Keep only the identifier, the title and the tag text for the model
model_columns = ['movie_id', 'title', 'tag']
model_df = combined_movies[model_columns]
print(model_df.head())

In [None]:
# Turn each tag string into a bag-of-words count vector, capped at 5000
# features and with English stop words removed
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words='english', max_features=5000)
tag_counts = vectorizer.fit_transform(model_df['tag'])
# Dense array: one row per movie, one column per vocabulary term
vector = tag_counts.toarray()

In [None]:
# Pairwise cosine similarity between every pair of movie count vectors;
# entry [i, j] compares row i with row j of `vector`
from sklearn.metrics.pairwise import cosine_similarity

similarity_scores = cosine_similarity(X=vector)

In [None]:
# Recommend movies based on similarity scores
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Title to look up; compared case-insensitively with model_df['title'].
        If several rows share the title, the first one is used.
    top_n : int, default 5
        Number of recommendations to print.

    Notes
    -----
    Reads the module-level ``model_df`` and ``similarity_scores``. Row i of
    ``similarity_scores`` corresponds to the i-th row of ``model_df`` by
    position, so the lookup below is positional rather than label-based; index
    labels stop matching positions as soon as rows have been dropped.
    Prints a message and returns if the title is not present.
    """
    title_matches = (model_df['title'].str.lower() == movie.lower()).to_numpy()
    positions = np.flatnonzero(title_matches)
    if positions.size == 0:
        print(f"Movie not found: {movie!r}")
        return

    position = int(positions[0])
    distances = similarity_scores[position]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie explicitly instead of assuming it sorts first:
    # another row with an identical tag vector can tie with it at the top.
    neighbours = [idx for idx, _ in ranked if idx != position][:top_n]
    for idx in neighbours:
        print(model_df['title'].iloc[idx])

# Example: print the five titles most similar to 'Batman Begins'
recommend('Batman Begins')