In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

# IMDb title-page URL template: the format spec left-pads the ID with zeros
# to a width of 7 characters.
base_url = "https://www.imdb.com/title/tt{:0>7}"

# Request headers carrying a browser-like User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36'
}

# Load the two source tables (movies.csv and links.csv) from the project's GitHub repository.
movieid = pd.read_csv("https://raw.githubusercontent.com/Vatsal328/Movie-Recommendation-System/main/Dataset/movies.csv")
tmdbid = pd.read_csv("https://raw.githubusercontent.com/Vatsal328/Movie-Recommendation-System/main/Dataset/links.csv")

# Inner join on "movieId": only movies present in both tables are kept,
# with the movies.csv columns first.
merged_df = movieid.merge(tmdbid, on='movieId', how='inner')

# Function to remove '(xxxx)' part from the end of the title
def remove_year_from_title(title):
    """Strip one trailing parenthesised suffix, such as ' (1995)', from a title.

    Only the last parenthesised group at the very end of the string is
    removed, together with the whitespace around it. Earlier parenthesised
    groups and titles without a trailing group are returned unchanged.

    Args:
        title: The title string to clean.

    Returns:
        The title without its trailing '(...)' part.
    """
    # The trailing \s* lets the match succeed when the title has stray
    # whitespace after the closing parenthesis (e.g. "Toy Story (1995) ").
    return re.sub(r'\s*\([^)]*\)\s*$', '', title)

# Turn every title into a URL slug in one pass: drop the trailing '(xxxx)'
# part, lowercase the result, then replace each space with a dash.
merged_df['title'] = (
    merged_df['title']
    .map(remove_year_from_title)
    .str.lower()
    .str.replace(' ', '-')
)

# Function to construct the link
def construct_link(row):
    """Build the TMDB movie-page URL for one record.

    Args:
        row: Mapping-like record (for example a DataFrame row) that provides
            the keys 'tmdbId' and 'title'.

    Returns:
        A string of the form
        "https://www.themoviedb.org/movie/<tmdbId>-<title>".
    """
    tmdb_id = row['tmdbId']
    title = row['title']
    # pandas stores an integer column that contains missing values as float,
    # so an ID like 862 can arrive as 862.0. Render whole-number floats
    # without the ".0" so the URL carries a plain integer ID.
    # NaN is not a whole number and is deliberately left as-is.
    if isinstance(tmdb_id, float) and tmdb_id.is_integer():
        tmdb_id = int(tmdb_id)
    return f"https://www.themoviedb.org/movie/{tmdb_id}-{title}"

# Compute the 'link' column by running construct_link on every row
merged_df['link'] = merged_df.apply(construct_link, axis=1)

# Two-column view of the result: the movie ID and its page URL
df = merged_df.loc[:, ['movieId', 'link']]

# Empty frame with the output schema; scraped rows are added to it later
result_columns = ['movieId', 'user_score', 'language', 'budget', 'revenue']
data = pd.DataFrame(columns=result_columns)

# Scrape every movie page listed in merged_df and collect, per movie, the
# user score, original language, budget and revenue. Pages that cannot be
# fetched (network error, timeout, or a non-200 status) are skipped.

# The request headers never change, so build them once instead of per iteration.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

result_columns = ['movieId', 'user_score', 'language', 'budget', 'revenue']
scraped_rows = []   # one [movieId, user_score, language, budget, revenue] list per fetched page
scraped_index = []  # merged_df index label of each scraped row

# A single Session reuses the connection across the many requests to the same host.
with requests.Session() as session:
    for index, movie_id, url in merged_df[['movieId', 'link']].itertuples(name=None):
        # Fetch the webpage content with headers. The timeout keeps one
        # stalled connection from hanging the whole run, and a failed request
        # is skipped like a non-200 response rather than aborting the scrape
        # and losing every row collected so far.
        try:
            response = session.get(url, headers=headers, timeout=30)
        except requests.RequestException:
            continue
        # Check if request was successful
        if response.status_code != 200:
            continue

        # Initialize variables to hold extracted information; each stays None
        # when the page does not contain the corresponding element.
        user_score = None
        language = None
        budget = None
        revenue = None

        # Parse HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract language, budget, and revenue from the <p> tags of the facts section
        facts_section = soup.find('section', class_='facts left_column')
        if facts_section:
            for p_tag in facts_section.find_all('p'):
                # NOTE: only the last whitespace-separated token of the
                # paragraph is kept, so a multi-word value would be truncated
                # to its final word.
                if 'Original Language' in p_tag.text:
                    language = p_tag.text.strip().split()[-1]
                elif 'Budget' in p_tag.text:
                    budget = p_tag.text.strip().split()[-1]
                elif 'Revenue' in p_tag.text:
                    revenue = p_tag.text.strip().split()[-1]

        # Extract user score; .get() yields None instead of raising KeyError
        # when the element exists but has no 'data-percent' attribute.
        div_element = soup.find('div', class_='user_score_chart')
        if div_element:
            user_score = div_element.get('data-percent')

        scraped_index.append(index)
        scraped_rows.append([movie_id, user_score, language, budget, revenue])

# Build the result frame once from the collected rows (instead of growing a
# DataFrame one row at a time), keeping the merged_df index labels.
data = pd.DataFrame(scraped_rows, columns=result_columns, index=scraped_index)

# Save the data to CSV
data.to_csv('data.csv', index=False)

# Print the DataFrame
print(data)


      movieId user_score language          budget          revenue
0           1         80  English  $30,000,000.00  $394,400,000.00
1           2         72  English  $65,000,000.00  $262,821,940.00
2           3         65  English  $25,000,000.00   $71,500,000.00
3           4         63  English  $16,000,000.00   $81,452,156.00
4           5         63  English               -   $76,594,107.00
...       ...        ...      ...             ...              ...
2898     3879         57  English  $40,000,000.00   $30,199,105.00
2899     3882         62  English  $28,000,000.00   $90,449,929.00
2900     3884         55  English               -                -
2901     3888         59  English               -                -
2902     3889         48  English  $25,000,000.00   $15,800,000.00

[2891 rows x 5 columns]
