In [2]:
import time
import os
import pandas as pd
import numpy as np
import ast
import re
import requests
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

OMDB_API_KEY = "4ab406f9"
SAVE_FILE = "movies_with_ratings.csv"

In [3]:
# Load datasets
credits_df = pd.read_csv('credits.csv')
movies_df = pd.read_csv('movies.csv')

movies_df.rename(columns={"id": "movie_id"}, inplace=True)

# Merge datasets
movies = movies_df.merge(credits_df, on='movie_id', how="inner")
movies.drop(columns=['title_x'], inplace=True)  # Drop 'title_x' or 'title_y' based on your preference

# Rename the remaining title column ('title_y' to 'title')
movies.rename(columns={'title_y': 'title'}, inplace=True)

# Optionally, reset the index again
movies.reset_index(drop=True, inplace=True)
# Keep required columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'popularity', 'release_date']]

In [4]:
# Drop null values
movies.dropna(inplace=True)

# Convert stringified lists into Python lists
def convert_list(obj):
    return [i['name'] for i in ast.literal_eval(obj)]

def fetch_lead_actors(obj):
    return [i['name'] for i in ast.literal_eval(obj)[:3]]

def fetch_director(obj):
    return [i['name'] for i in ast.literal_eval(obj) if i['job'] == 'Director']

movies['genres'] = movies['genres'].apply(convert_list)
movies['keywords'] = movies['keywords'].apply(convert_list)
movies['cast'] = movies['cast'].apply(fetch_lead_actors)
movies['crew'] = movies['crew'].apply(fetch_director)
movies['year'] = pd.to_datetime(movies['release_date'], format='%d/%m/%Y', errors='coerce').dt.year
required_columns = ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'popularity', 'year']
def is_invalid_row(row):
    for col in required_columns:
        if isinstance(row[col], list) and not row[col]:  # Empty list check
            return True
        if row[col] in [None, '', np.nan]:  # Check for missing values
            return True
    return False
movies = movies[~movies.apply(is_invalid_row, axis=1)]
movies.reset_index(drop=True, inplace=True)

In [5]:
# Normalize popularity
scaler = MinMaxScaler()
movies['popularity_log'] = np.log1p(movies['popularity'])
movies['popularity_log_norm'] = scaler.fit_transform(movies[['popularity_log']])

In [6]:
# Remove spaces in tags
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

In [7]:
def pre_processing(content):
    # Only keeping alphabets in the content 
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    # Removing spaces and converting to lowercase
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    return [word for word in stemmed_content if not word in stopwords.words('english')]

In [8]:
movies['tags'] = movies['overview'].apply(pre_processing).apply(lambda x: ' '.join(x)) + ' ' + \
                    movies['genres'].apply(lambda x: ' '.join(x)) + ' ' + \
                    movies['keywords'].apply(lambda x: ' '.join(x)) + ' ' + \
                    movies['cast'].apply(lambda x: ' '.join(x)) + ' ' + \
                    movies['crew'].apply(lambda x: ' '.join(x))

In [9]:
final_movies_df = movies[['movie_id', 'title', 'overview', 'cast', 'crew', 'popularity_log_norm', 'year','tags']]

In [10]:
final_movies_df['tags'] = final_movies_df['tags'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_movies_df['tags'] = final_movies_df['tags'].str.lower()


In [13]:
def fetch_omdb_ratings(movie_title):
    time.sleep(0.5)
    url = f"https://www.omdbapi.com/?t={movie_title}&apikey={OMDB_API_KEY}"
    
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        
        # Extract IMDb rating
        imdb_score = data.get("imdbRating")  # Example: "8.3"
        
        # Extract Rotten Tomatoes score (if available)
        rt_score = None
        if "Ratings" in data:
            for rating in data["Ratings"]:
                if rating["Source"] == "Rotten Tomatoes":
                    rt_score = rating["Value"].strip('%')  # Example: "87"

        return pd.Series([imdb_score, int(rt_score) if rt_score else None])  # Convert RT score to int

    except requests.exceptions.RequestException as e:
        print(f"Error fetching OMDb ratings for {movie_title}: {e}")
        return pd.Series([None, None])  # Return None if there's an error

# Should do the below parsing through the year for each movie too!!!

Will have to do manually for the duplicates (there are only a few)

In [16]:
if os.path.exists(SAVE_FILE):
    movies_ratings = pd.read_csv(SAVE_FILE)
else:
    print("New file")
    movies_ratings = final_movies_df[['movie_id', 'title', 'overview', 'cast', 'crew', 'popularity_log_norm', 'year', 'tags']].copy()
    movies_ratings['imdb_score'] = None
    movies_ratings['rt_score'] = None

# Process each movie one by one
for index, row in movies_ratings.iterrows():
    if pd.notna(row["imdb_score"]):  # Skip if already fetched
        print(f"Skipping search for: {row['title']}")
        continue  

    print(f"🔍 Fetching ratings for: {row['title']}...")
   
    imdb, rt = fetch_omdb_ratings(row["title"])
    
    # Store results in DataFrame
    movies_ratings.at[index, "imdb_score"] = imdb
    movies_ratings.at[index, "rt_score"] = rt

    # Save after each successful request
    movies_ratings.to_csv(SAVE_FILE, index=False)
    
    print(f"✅ Saved: {row['title']} - IMDb: {imdb}, RT: {rt}%")

    time.sleep(1)  # Avoid hitting rate limits

print("🎉 Done! All available ratings saved.")

Skipping search for: Avatar
Skipping search for: Pirates of the Caribbean: At World's End
Skipping search for: Spectre
Skipping search for: The Dark Knight Rises
Skipping search for: John Carter
Skipping search for: Spider-Man 3
Skipping search for: Tangled
Skipping search for: Avengers: Age of Ultron
Skipping search for: Harry Potter and the Half-Blood Prince
Skipping search for: Batman v Superman: Dawn of Justice
Skipping search for: Superman Returns
Skipping search for: Quantum of Solace
Skipping search for: Pirates of the Caribbean: Dead Man's Chest
Skipping search for: The Lone Ranger
Skipping search for: Man of Steel
Skipping search for: The Chronicles of Narnia: Prince Caspian
Skipping search for: The Avengers
Skipping search for: Pirates of the Caribbean: On Stranger Tides
Skipping search for: Men in Black 3
Skipping search for: The Hobbit: The Battle of the Five Armies
Skipping search for: The Amazing Spider-Man
Skipping search for: Robin Hood
Skipping search for: The Hobbit: 

KeyboardInterrupt: 

In [25]:
movies_ratings = pd.read_csv(SAVE_FILE)

In [26]:
ps = PorterStemmer()
def stem(text):
    return " ".join([ps.stem(word) for word in text.split()])
    
movies_ratings['tags'] = movies_ratings['tags'].apply(stem)

In [27]:
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(movies_ratings['tags']).toarray()
similarity = cosine_similarity(vectors)

In [28]:
def get_top_recommendations(index):
    distances = similarity[index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:26]

    combined_scores = {}
    w_similarity = 0.9
    w_popularity = 0.1

    for i in movies_list:
        idx = i[0]
        sim_score = i[1]
        pop_score = movies_ratings.iloc[idx]['popularity_log_norm']

        combined_score = (w_similarity * sim_score) + (w_popularity * pop_score)
        combined_scores[idx] = combined_score

    sorted_combined_scores = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    top_10_indexes = [i[0] for i in sorted_combined_scores[:10]]

    return top_10_indexes

In [29]:
def handle_missing_values(x):
    if isinstance(x, (int, float)):  # If the value is numeric
        return 0 if np.isnan(x) else x  # Replace NaN with 0
    elif pd.isna(x) or x == '' or x is None:  # If the value is empty (NaN, None, empty string)
        return 0  # Replace empty or None with 0
    else:
        return x  # If the value is valid, leave it unchanged

In [30]:
movies_ratings['top_recommendations'] = [get_top_recommendations(i) for i in range(len(movies_ratings))]

In [None]:
movies_ratings['imdb_score'] = movies_ratings['imdb_score'].apply(handle_missing_values)
movies_ratings['rt_score'] = movies_ratings['rt_score'].apply(handle_missing_values)
movies_ratings[['movie_id', 'tags','title', 'overview', 'cast', 'crew', 'popularity_log_norm', 'year', 'imdb_score', 'rt_score', 'top_recommendations']].to_csv('processed_movies.csv', index=False)