In [89]:
import time
import os
import pandas as pd
import numpy as np
import ast
import re
import requests
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

OMDB_API_KEY = "4ab406f9"

In [76]:
# Load datasets
credits_df = pd.read_csv('credits.csv')
movies_df = pd.read_csv('movies.csv')

# Merge datasets
movies = movies_df.merge(credits_df, on='title')
# Keep required columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'popularity']]

In [77]:
# Drop null values
movies.dropna(inplace=True)

# Convert stringified lists into Python lists
def convert_list(obj):
    return [i['name'] for i in ast.literal_eval(obj)]

movies['genres'] = movies['genres'].apply(convert_list)
movies['keywords'] = movies['keywords'].apply(convert_list)

def fetch_lead_actors(obj):
    return [i['name'] for i in ast.literal_eval(obj)[:3]]

def fetch_director(obj):
    return [i['name'] for i in ast.literal_eval(obj) if i['job'] == 'Director']

movies['cast'] = movies['cast'].apply(fetch_lead_actors)
movies['crew'] = movies['crew'].apply(fetch_director)

In [78]:
# Normalize popularity
scaler = MinMaxScaler()
movies['popularity_log'] = np.log1p(movies['popularity'])
movies['popularity_log_norm'] = scaler.fit_transform(movies[['popularity_log']])

In [79]:
# Remove spaces in tags
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

In [80]:
def pre_processing(content):
    # Only keeping alphabets in the content 
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    # Removing spaces and converting to lowercase
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    return [word for word in stemmed_content if not word in stopwords.words('english')]

In [81]:
movies['tags'] = movies['overview'].apply(pre_processing).apply(lambda x: ' '.join(x)) + ' ' + \
                    movies['genres'].apply(lambda x: ' '.join(x)) + ' ' + \
                    movies['keywords'].apply(lambda x: ' '.join(x)) + ' ' + \
                    movies['cast'].apply(lambda x: ' '.join(x)) + ' ' + \
                    movies['crew'].apply(lambda x: ' '.join(x))

In [82]:
final_movies_df = movies[['movie_id', 'title', 'overview', 'cast', 'crew', 'popularity_log_norm', 'tags']]

In [83]:
final_movies_df['tags'] = final_movies_df['tags'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_movies_df['tags'] = final_movies_df['tags'].str.lower()


In [None]:
def fetch_omdb_ratings(movie_title):
    time.sleep(0.5)
    url = f"https://www.omdbapi.com/?t={movie_title}&apikey={OMDB_API_KEY}"
    
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        
        # Extract IMDb rating
        imdb_score = data.get("imdbRating")  # Example: "8.3"
        
        # Extract Rotten Tomatoes score (if available)
        rt_score = None
        if "Ratings" in data:
            for rating in data["Ratings"]:
                if rating["Source"] == "Rotten Tomatoes":
                    rt_score = rating["Value"].strip('%')  # Example: "87"

        return pd.Series([imdb_score, int(rt_score) if rt_score else None])  # Convert RT score to int

    except requests.exceptions.RequestException as e:
        print(f"Error fetching OMDb ratings for {movie_title}: {e}")
        return pd.Series([None, None])  # Return None if there's an error

In [90]:
SAVE_FILE = "movies_with_ratings.csv"
if os.path.exists(SAVE_FILE):
    movies_ratings = pd.read_csv(SAVE_FILE)
else:
    movies_ratings = final_movies_df[['movie_id', 'title', 'overview', 'cast', 'crew', 'popularity_log_norm', 'tags']].copy()
    movies_ratings['imdb_score'] = None
    movies_ratings['rt_score'] = None

# Process each movie one by one
for index, row in movies_ratings.iterrows():
    if pd.notna(row["imdb_score"]):  # Skip if already fetched
        continue  

    print(f"🔍 Fetching ratings for: {row['title']}...")
    imdb, rt = fetch_omdb_ratings(row["title"])
    
    # Store results in DataFrame
    movies_ratings.at[index, "imdb_score"] = imdb
    movies_ratings.at[index, "rt_score"] = rt

    # Save after each successful request
    movies_ratings.to_csv(SAVE_FILE, index=False)
    
    print(f"✅ Saved: {row['title']} - IMDb: {imdb}, RT: {rt}%")

    time.sleep(1)  # Avoid hitting rate limits

print("🎉 Done! All available ratings saved.")

🔍 Fetching ratings for: Avatar...
✅ Saved: Avatar - IMDb: 7.9, RT: 81%
🔍 Fetching ratings for: Pirates of the Caribbean: At World's End...
✅ Saved: Pirates of the Caribbean: At World's End - IMDb: 7.1, RT: 43%
🔍 Fetching ratings for: Spectre...
✅ Saved: Spectre - IMDb: 6.8, RT: 63%
🔍 Fetching ratings for: The Dark Knight Rises...
✅ Saved: The Dark Knight Rises - IMDb: 8.4, RT: 87%
🔍 Fetching ratings for: John Carter...
✅ Saved: John Carter - IMDb: 6.6, RT: 52%
🔍 Fetching ratings for: Spider-Man 3...
✅ Saved: Spider-Man 3 - IMDb: 6.3, RT: 63%
🔍 Fetching ratings for: Tangled...
✅ Saved: Tangled - IMDb: 7.7, RT: 89%
🔍 Fetching ratings for: Avengers: Age of Ultron...
✅ Saved: Avengers: Age of Ultron - IMDb: 7.3, RT: 76%
🔍 Fetching ratings for: Harry Potter and the Half-Blood Prince...
✅ Saved: Harry Potter and the Half-Blood Prince - IMDb: 7.6, RT: 83%
🔍 Fetching ratings for: Batman v Superman: Dawn of Justice...
✅ Saved: Batman v Superman: Dawn of Justice - IMDb: 6.5, RT: 29%
🔍 Fetching r

KeyboardInterrupt: 

In [92]:
ps = PorterStemmer()
def stem(text):
    return " ".join([ps.stem(word) for word in text.split()])
    
movies_ratings['tags'] = movies_ratings['tags'].apply(stem)

In [94]:
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(movies_ratings['tags']).toarray()
similarity = cosine_similarity(vectors)

In [101]:
def handle_missing_values(x):
    if isinstance(x, (int, float)):  # If the value is numeric
        return 0 if np.isnan(x) else x  # Replace NaN with 0
    elif pd.isna(x) or x == '' or x is None:  # If the value is empty (NaN, None, empty string)
        return 0  # Replace empty or None with 0
    else:
        return x  # If the value is valid, leave it unchanged

In [102]:
movies_ratings['imdb_score'] = movies_ratings['imdb_score'].apply(handle_missing_values)
movies_ratings['rt_score'] = movies_ratings['rt_score'].apply(handle_missing_values)
movies_ratings[['movie_id', 'title', 'overview', 'cast', 'crew', 'popularity_log_norm', 'imdb_score', 'rt_score']].to_csv('processed_movies.csv', index=False)
np.save('similarity_matrix.npy', similarity)