In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Function to scrape a movie's information
def scrape_movies(page_num):
    url = f"https://www.themoviedb.org/movie?page={page_num}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Lists to store data
    titles = []
    ratings = []
    genres = []
    dates = []
    urls = []

    # Scraping the title, ratings, date, url and genre of movies
    movie_items = soup.find_all('div', class_ = 'card style_1')
    for item in movie_items:        
        title = item.find('h2').text.strip()
        date = item.find('p').text.strip()
        rating = item.find('div', class_ = 'user_score_chart')['data-percent']
        movie_url = 'https://www.themoviedb.org' + item.find('a')['href'] + '?language=en-US'
    
        response2 = requests.get(movie_url)
        soup2 = BeautifulSoup(response2.text, 'html.parser')
        genres_list = [g.text.strip() for g in soup2.find_all('span', class_='genres')]
        genre_list = ','.join(genres_list)

        titles.append(title)
        ratings.append(rating)
        genres.append(genre_list)
        urls.append(url)
        dates.append(date)

    return pd.DataFrame({'Date': dates, 'Title': titles, 'Rating': ratings, 'Genres': genres, 'URL': urls})

# Scrape movies from multiple pages
all_movies = pd.DataFrame()
for page in range(1, 6):  # Scrape first 5 pages
    movies_on_page = scrape_movies(page)
    all_movies = pd.concat([all_movies, movies_on_page])

# Save the movie data to CSV
all_movies.to_csv('movies_data.csv', index=False)
print("Scraping done! Data saved to movies_data.csv")

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


df = pd.read_csv('movies_data.csv')
df.tail(10)

Unnamed: 0,Date,Title,Rating,Genres,URL
90,04 Nov 2016,A Street Cat Named Bob,79,"Family, Drama",https://www.themoviedb.org/movie?page=5
91,01 Apr 2022,Sonic the Hedgehog 2,75,"Action, Adventure, Family, Comedy",https://www.themoviedb.org/movie?page=5
92,18 Jan 2024,The Jack in the Box: Rises,69,Horror,https://www.themoviedb.org/movie?page=5
93,21 Okt 2023,"Miraculous World: Paris, Tales of Shadybug and...",72,"Animation, Adventure, Action, Fantasy, Family",https://www.themoviedb.org/movie?page=5
94,02 Jun 2023,Spider-Man: Across the Spider-Verse,84,"Animation, Action, Adventure, Science Fiction",https://www.themoviedb.org/movie?page=5
95,01 Mrt 2024,Dune: Part Two,82,"Science Fiction, Adventure",https://www.themoviedb.org/movie?page=5
96,20 Jul 2022,Alienoid,68,"Science Fiction, Action, Fantasy, Adventure",https://www.themoviedb.org/movie?page=5
97,26 Des 2024,Your Fault,0,"Drama, Romance",https://www.themoviedb.org/movie?page=5
98,24 Jan 2024,Beautiful Wedding,55,"Romance, Comedy, Drama",https://www.themoviedb.org/movie?page=5
99,15 Des 2021,Spider-Man: No Way Home,80,"Action, Adventure, Science Fiction",https://www.themoviedb.org/movie?page=5


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the scraped data
movies_df = pd.read_csv('movies_data.csv')

# Clean the 'Rating' column, convert it to numeric
movies_df['Rating'] = pd.to_numeric(movies_df['Rating'], errors='coerce').fillna(0)

# Combine 'Genres' and 'Rating' to create a content-based filtering system
movies_df['Genres_Rating'] = movies_df['Genres'] + ' ' + movies_df['Rating'].astype(str)

# Preview the data
movies_df.tail()

Unnamed: 0,Date,Title,Rating,Genres,URL,Genres_Rating
95,01 Mrt 2024,Dune: Part Two,82,"Science Fiction, Adventure",https://www.themoviedb.org/movie?page=5,"Science Fiction, Adventure 82"
96,20 Jul 2022,Alienoid,68,"Science Fiction, Action, Fantasy, Adventure",https://www.themoviedb.org/movie?page=5,"Science Fiction, Action, Fantasy, Adventure 68"
97,26 Des 2024,Your Fault,0,"Drama, Romance",https://www.themoviedb.org/movie?page=5,"Drama, Romance 0"
98,24 Jan 2024,Beautiful Wedding,55,"Romance, Comedy, Drama",https://www.themoviedb.org/movie?page=5,"Romance, Comedy, Drama 55"
99,15 Des 2021,Spider-Man: No Way Home,80,"Action, Adventure, Science Fiction",https://www.themoviedb.org/movie?page=5,"Action, Adventure, Science Fiction 80"


In [5]:
# Use TF-IDF to convert the Genres_Rating column into a matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_df['Genres_Rating'])

# Calculate cosine similarity between movies
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to get movie recommendations based on similarity
def get_recommendations(movie_title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = movies_df[movies_df['Title'] == movie_title].index[0]
    
    # Get the similarity scores for all movies
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices of the top 10 most similar movies
    movie_indices = [i[0] for i in sim_scores[1:11]]
    
    # Return the top 10 most similar movies
    return movies_df['Title'].iloc[movie_indices]

# Test the recommendation system
recommended_movies = get_recommendations('Bad Boys: Ride or Die')  # Replace with a valid movie title from your dataset
print("Movies recommended for you:")
print("------------------------------")
print(recommended_movies)

Movies recommended for you:
------------------------------
91                                 Sonic the Hedgehog 2
51    Justice League: Crisis on Infinite Earths Part...
70                                               Fast X
85                            Beverly Hills Cop: Axel F
47                                      The Instigators
3                                           Rebel Ridge
20                                               Gunner
10                  Twilight of the Warriors: Walled In
39                                                  xXx
8                                            The Killer
Name: Title, dtype: object
