# In [None]:
# Movie Recommendation System - Complete Implementation

# Cell 1: Import required libraries
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
import pickle
import nltk

# Cell 2: Download required NLTK data
# Best-effort: a failed download must not stop the notebook, but it is
# reported instead of being silently swallowed. Each resource is attempted
# independently so one failure does not skip the other.
for resource in ('punkt', 'stopwords'):
    try:
        nltk.download(resource, quiet=True)
    except Exception as exc:  # deliberately broad: keep going without the resource
        print(f"Warning: could not download NLTK resource '{resource}': {exc}")

# Cell 3: Load and merge datasets
print("Loading datasets...")
movies_raw = pd.read_csv('tmdb_5000_movies.csv')
credits_raw = pd.read_csv('tmdb_5000_credits.csv')

# Join the two files on their shared 'title' column, keep only the columns
# used further down, and discard every row with a missing value.
# NOTE(review): the join key is the title text; if a title occurs more than
# once in either file the merge will produce extra rows — confirm.
feature_columns = [
    'movie_id', 'title', 'overview', 'genres', 'keywords',
    'cast', 'crew', 'vote_average', 'vote_count',
]
movies = pd.merge(movies_raw, credits_raw, on='title')
movies = movies[feature_columns].dropna()

print(f"Dataset loaded with {len(movies)} movies")
movies.head()

# Cell 4: Calculate weighted ratings using IMDB formula
print("Calculating weighted ratings...")

# C: mean of 'vote_average' over every row currently in `movies`
C = movies['vote_average'].mean()
print(f"Mean vote average: {C:.2f}")

# m: the 0.9 quantile of 'vote_count', used as the minimum-votes cutoff
m = movies['vote_count'].quantile(0.9)
print(f"90th percentile vote count: {m}")

# Keep only rows whose vote count reaches the cutoff; the copy keeps later
# column assignments on q_movies from touching `movies`.
meets_cutoff = movies['vote_count'] >= m
q_movies = movies.loc[meets_cutoff].copy()
print(f"Qualified movies: {len(q_movies)}")

# Function to calculate weighted rating based on IMDB formula
def weighted_rating(x, m=m, C=C):
    """Return the weighted score for one movie row.

    Computes ``v / (v + m) * R + m / (v + m) * C`` where ``v`` is the row's
    'vote_count' and ``R`` its 'vote_average'.

    Args:
        x: Mapping-like row exposing 'vote_count' and 'vote_average'.
        m: Vote-count cutoff; defaults to the module-level ``m`` as it was
            when this function was defined.
        C: Mean rating; defaults to the module-level ``C`` as it was when
            this function was defined.

    Returns:
        The weighted score for the row.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m
    return (votes / total * rating) + (m / total * C)

# Attach a 'score' column (one weighted_rating call per row) and order the
# qualified movies from highest to lowest score.
q_movies = q_movies.assign(
    score=q_movies.apply(weighted_rating, axis=1)
).sort_values(by='score', ascending=False)

# Show the ten best-scoring rows
leaderboard_columns = ['title', 'vote_average', 'vote_count', 'score']
print("Top 10 movies by weighted score:")
print(q_movies[leaderboard_columns].head(10))

# From here on `movies` refers to the qualified, scored subset
movies = q_movies.copy()

# Cell 5: Define helper functions for data processing
def convert(text):
    """Extract the 'name' of every entry in a stringified list of dicts.

    Args:
        text: String holding a Python literal such as
            ``'[{"id": 28, "name": "Action"}]'``.

    Returns:
        List of the 'name' values in input order. Entries that are not dicts
        or have no 'name' key are skipped. An empty list is returned when
        ``text`` cannot be parsed (including non-string input such as NaN) or
        does not evaluate to a list/tuple.
    """
    try:
        entries = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Best-effort parsing: unusable input simply contributes no names.
        return []
    if not isinstance(entries, (list, tuple)):
        return []
    return [
        entry['name']
        for entry in entries
        if isinstance(entry, dict) and 'name' in entry
    ]

def convert_cast(text, limit=3):
    """Extract the names of the first few entries of a stringified cast list.

    Args:
        text: String holding a Python literal list of dicts, each expected
            to carry a 'name' key.
        limit: Maximum number of names to return (default 3).

    Returns:
        List of at most ``limit`` 'name' values in input order. Entries that
        are not dicts or have no 'name' key are skipped. An empty list is
        returned when ``text`` cannot be parsed or does not evaluate to a
        list/tuple.
    """
    try:
        entries = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Best-effort parsing: unusable input simply contributes no names.
        return []
    if not isinstance(entries, (list, tuple)):
        return []

    names = []
    for entry in entries:
        if len(names) >= limit:
            break
        if isinstance(entry, dict) and 'name' in entry:
            names.append(entry['name'])
    return names

def fetch_director(text):
    """Return the name of the first crew entry whose job is 'Director'.

    Args:
        text: String holding a Python literal list of dicts with 'job' and
            'name' keys.

    Returns:
        A one-element list with the first matching 'name', or an empty list
        when there is no such entry, when ``text`` cannot be parsed, or when
        it does not evaluate to a list/tuple. Entries missing 'job' or
        'name' are skipped rather than aborting the search.
    """
    try:
        entries = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Best-effort parsing: unusable input simply yields no director.
        return []
    if not isinstance(entries, (list, tuple)):
        return []

    for entry in entries:
        if (
            isinstance(entry, dict)
            and entry.get('job') == 'Director'
            and 'name' in entry
        ):
            return [entry['name']]
    return []

def collapse(L):
    """Return a new list with every space character stripped from each item.

    Turns multi-word names such as "Science Fiction" into single tokens
    ("ScienceFiction"); the input list itself is not modified.
    """
    return [name.replace(" ", "") for name in L]

# Cell 6: Define weighted processing functions
def process_genres(obj):
    """Return the space-free genre names from ``obj``, listed twice.

    The names are extracted with ``convert`` and de-spaced with ``collapse``;
    repeating the list doubles each genre's count in the combined tags.
    """
    genre_tokens = collapse(convert(obj))
    return genre_tokens + genre_tokens

def process_keywords(obj):
    """Return the space-free keyword names from ``obj``, listed twice.

    The names are extracted with ``convert`` and de-spaced with ``collapse``;
    repeating the list doubles each keyword's count in the combined tags.
    """
    keyword_tokens = collapse(convert(obj))
    return keyword_tokens + keyword_tokens

def process_director(obj):
    """Return the space-free director name from ``obj``, listed three times.

    The name is looked up with ``fetch_director`` and de-spaced with
    ``collapse``; tripling it gives the director the largest repetition
    factor of the weighted features.
    """
    director_tokens = collapse(fetch_director(obj))
    return director_tokens + director_tokens + director_tokens

# Cell 7: Apply transformations with weighting
print("Processing movie features...")

# Column -> element-wise transformer, applied in this order. After the loop
# every listed column holds a list of tokens:
#   genres / keywords / crew : weighted (repeated) names, crew = director only
#   cast                     : leading cast names with spaces removed
#   overview                 : whitespace-split words ([] when missing)
feature_transforms = {
    'genres': process_genres,
    'keywords': process_keywords,
    'cast': lambda raw: collapse(convert_cast(raw)),
    'crew': process_director,
    'overview': lambda x: x.split() if pd.notna(x) else [],
}
for column, transform in feature_transforms.items():
    movies[column] = movies[column].apply(transform)

# Show what the first row looks like after processing
print("Sample processed features:")
sample_movie = movies.iloc[0]
print(f"Title: {sample_movie['title']}")
print(f"Genres: {sample_movie['genres'][:5]}...")  # at most five tokens
print(f"Keywords: {sample_movie['keywords'][:5]}...")
print(f"Cast: {sample_movie['cast']}")
print(f"Director: {sample_movie['crew']}")

# Cell 8: Create tags and final dataframe
print("Creating combined tags...")

# Concatenate the per-row token lists, left to right, into one 'tags' list
tag_sources = ['overview', 'genres', 'keywords', 'cast', 'crew']
combined_tags = movies[tag_sources[0]]
for column in tag_sources[1:]:
    combined_tags = combined_tags + movies[column]
movies['tags'] = combined_tags

# Independent frame holding only what the recommender needs
new_df = movies[['movie_id', 'title', 'tags', 'score']].copy()

# Flatten each token list into one space-separated, lower-cased string
new_df['tags'] = new_df['tags'].map(lambda tokens: " ".join(tokens).lower())

print(f"Final dataset shape: {new_df.shape}")
print("Sample tags:")
print(new_df.iloc[0]['tags'][:200] + "...")

# Cell 9: Apply stemming
print("Applying stemming to reduce word variations...")

# One shared Porter stemmer instance, used by stem() below
ps = PorterStemmer()


def stem(text):
    """Return ``text`` with each whitespace-separated word stemmed by ``ps``.

    Words are re-joined with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())


# Replace every tag string with its stemmed form
new_df['tags'] = new_df['tags'].map(stem)

print("Sample stemmed tags:")
print(new_df.iloc[0]['tags'][:200] + "...")

# Cell 10: Create TF-IDF vectors
print("Creating TF-IDF vectors...")

# Vectorizer capped at 5000 features, using the built-in 'english' stop words
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit on the tag strings, then densify the result into a plain array
tag_matrix = tfidf.fit_transform(new_df['tags'])
vectors = tag_matrix.toarray()

print(f"TF-IDF vector shape: {vectors.shape}")
print(f"Feature names sample: {tfidf.get_feature_names_out()[:10]}")

# Pairwise cosine similarity between all rows of `vectors`
print("Computing cosine similarity matrix...")
similarity = cosine_similarity(vectors)

print(f"Similarity matrix shape: {similarity.shape}")
print(f"Sample similarities for first movie: {similarity[0][:5]}")

# Cell 11: Fix index and test the recommendation function

# Give new_df a fresh 0..n-1 index so that its row labels coincide with row
# positions (the old labels are discarded).
new_df.index = pd.RangeIndex(len(new_df))
print("DataFrame index has been reset to prevent errors.")

def recommend(movie, movies_df, similarity_matrix, top_n=5):
    """Return the movies most similar to ``movie``.

    Args:
        movie: Title to look up; compared for exact equality against the
            'title' column.
        movies_df: DataFrame with 'title' and 'score' columns whose row
            order matches the rows/columns of ``similarity_matrix``.
        similarity_matrix: Square, position-indexed similarity scores.
        top_n: Maximum number of recommendations to return (default 5).

    Returns:
        A list of dicts with keys 'title', 'score' (rounded to 2 places) and
        'similarity' (rounded to 3 places), ordered from most to least
        similar, or the string "Movie not found" when no row has that title.
    """
    # Locate the movie by row *position*, so the lookup is correct whatever
    # index labels movies_df carries (it need not be 0..n-1).
    matches = np.flatnonzero((movies_df['title'] == movie).to_numpy())
    if matches.size == 0:
        return "Movie not found"
    position = int(matches[0])

    distances = similarity_matrix[position]

    # Exclude the query row explicitly instead of assuming it sorts first:
    # another row with an equal similarity could otherwise take its place
    # and the query movie would be recommended to itself.
    ranked = sorted(
        ((pos, sim) for pos, sim in enumerate(distances) if pos != position),
        key=lambda pair: pair[1],
        reverse=True,
    )

    recommendations = []
    for pos, sim in ranked[:max(top_n, 0)]:
        movie_info = movies_df.iloc[pos]
        recommendations.append({
            'title': movie_info['title'],
            'score': round(movie_info['score'], 2),
            'similarity': round(sim, 3),
        })

    return recommendations

# Smoke-test the recommender using the title stored in the first row
test_movie = new_df['title'].iloc[0]
print(f"\nTesting recommendations for: {test_movie}")
test_recommendations = recommend(test_movie, new_df, similarity)
for i, rec in enumerate(test_recommendations, start=1):
    print(f"{i}. {rec['title']} (Score: {rec['score']}, Similarity: {rec['similarity']})")

# Cell 12: Save the model and data
print("\nSaving model files...")

# Persist the processed dataframe (as a plain dict) and the similarity
# matrix. The `with` blocks make sure each file is flushed and closed even
# if pickling fails part-way.
with open('movies_list_enhanced.pkl', 'wb') as movies_file:
    pickle.dump(new_df.to_dict(), movies_file)
with open('similarity_enhanced.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

print("Model and data saved successfully!")
print("Files created:")
print("- movies_list_enhanced.pkl")
print("- similarity_enhanced.pkl")

# Cell 13: Verify saved files
print("\nVerifying saved files...")

# Reload both pickles and run one recommendation against the reloaded data.
try:
    # NOTE: unpickling can execute arbitrary code; only load pickle files
    # that come from a trusted source.
    # The `with` blocks close each file handle as soon as it has been read.
    with open('movies_list_enhanced.pkl', 'rb') as movies_file:
        loaded_movies = pickle.load(movies_file)
    with open('similarity_enhanced.pkl', 'rb') as similarity_file:
        loaded_similarity = pickle.load(similarity_file)

    loaded_df = pd.DataFrame(loaded_movies)

    print(f"Loaded movies shape: {loaded_df.shape}")
    print(f"Loaded similarity shape: {loaded_similarity.shape}")
    print("✅ Files loaded successfully!")

    # Test one more recommendation with loaded data
    print(f"\nFinal test with loaded data for: {test_movie}")
    final_test = recommend(test_movie, loaded_df, loaded_similarity)
    if isinstance(final_test, str):
        # recommend() reports an unknown title as a plain message string;
        # print it rather than slicing it as if it were a result list.
        print(final_test)
    else:
        for i, rec in enumerate(final_test[:3], 1):
            print(f"{i}. {rec['title']} (Score: {rec['score']})")

except Exception as e:
    print(f"❌ Error loading files: {e}")

print("\n🎉 Movie Recommendation System setup complete!")
print("You can now run the Streamlit app with: streamlit run app.py")