In [None]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Sample URL template for IMDb movie pages
base_url = "https://www.imdb.com/title/tt{:0>7}"  # Pads the IMDb ID with leading zeros to 7 characters

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36'
}

# Positional row of links.csv at which scraping starts; earlier rows are skipped.
START_INDEX = 8275

# Seconds to wait on a single request before treating it as failed.
REQUEST_TIMEOUT = 30

# Read the DataFrame from the provided CSV file containing IMDb movie data
df = pd.read_csv("https://raw.githubusercontent.com/Vatsal328/Movie-Recommendation-System/main/Dataset/links.csv")

# Create and open the CSV file in append mode to store scraped data
with open('scraped_data_imdb_new.csv', 'a', newline='', encoding='utf-8') as f:
    # Write the header row if the file is empty
    if f.tell() == 0:
        f.write('imdbId,Ratings,Popularity,User Reviews,Critic Reviews,Metascore\n')

    # Iterate through each IMDb ID in the DataFrame for scraping data
    for imdbId in df['imdbId'].iloc[START_INDEX:]:
        imdbId_str = str(imdbId).zfill(7)  # Ensure IMDb ID is 7 digits
        url = base_url.format(imdbId_str)

        try:
            page = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # A timeout or dropped connection skips this title instead of aborting the run.
            print(f"Error fetching data for IMDb ID {imdbId}: {exc}")
            print(url)
            continue

        if page.status_code != 200:
            print(f"Error fetching data for IMDb ID {imdbId}. Status code: {page.status_code}")
            print(url)
            continue

        soup = BeautifulSoup(page.content, 'html.parser')

        # Every field defaults to None so a missing element still yields a full row.
        ratings = None
        popularity = None
        user_reviews = None
        critic_reviews = None
        metascore = None

        # Extract ratings from the IMDb page
        # NOTE(review): the 'sc-...' class names look build-generated and may change — confirm.
        data_rating = soup.find('span', attrs={'class': 'sc-bde20123-1 cMEQkK'})
        if data_rating:
            ratings = data_rating.text.strip()

        # Extract popularity score from the IMDb page
        data_popularity = soup.find('div', {'data-testid': 'hero-rating-bar__popularity__score', 'class': 'sc-5f7fb5b4-1 fTREEx'})
        if data_popularity:
            popularity = data_popularity.text.strip().replace(',', '')  # Remove commas from popularity count

        # Extract user reviews, critic reviews, and metascore from the IMDb page.
        # Compare on text rather than on Tag objects: a Tag never equals a plain string,
        # so the previous comparison could not match and these columns were always None.
        # NOTE(review): assumes each label sits in a `span.label` directly after its
        # `span.score` value — confirm against the live IMDb markup.
        review_texts = [
            span.get_text(strip=True)
            for span in soup.find_all('span', {'class': ['score', 'label']})
        ]
        # Start at 1 so `i - 1` can never wrap around to the last element.
        for i in range(1, len(review_texts)):
            if review_texts[i] == 'User reviews':
                user_reviews = review_texts[i - 1]
            elif review_texts[i] == 'Critic reviews':
                critic_reviews = review_texts[i - 1]
            elif review_texts[i] == 'Metascore':
                metascore = review_texts[i - 1]

        # Write scraped data to the CSV file
        f.write(f'{imdbId},{ratings},{popularity},{user_reviews},{critic_reviews},{metascore}\n')

# Read the updated CSV file into a DataFrame for further analysis or verification
scraped_data = pd.read_csv('scraped_data_imdb_new.csv')
print(scraped_data.head())

# Check the shape of the scraped_data DataFrame
print("Shape of scraped_data:", scraped_data.shape)

For handling some unusual values (Refining Dataset)

In [None]:
# Remote CSV locations: the links table and the IMDb data file
url = "https://raw.githubusercontent.com/Vatsal328/Movie-Recommendation-System/main/Dataset/links.csv"
url1 = "https://raw.githubusercontent.com/Vatsal328/Movie-Recommendation-System/main/Preprocessing%20code/IMDB%20data.csv"

# Load each file into its own DataFrame
df = pd.read_csv(url)
df1 = pd.read_csv(url1)

# Function to convert 'k' or 'K' format to numerical value
def convert_to_numeric(value):
    """Convert a scraped count such as ``'1.2K'`` or ``'1,234'`` to an integer.

    Parameters
    ----------
    value : str, int, float or other
        Raw cell value. Strings may carry surrounding whitespace, thousands
        separators, and a trailing ``K`` (thousands) or ``M`` (millions)
        suffix in either case.

    Returns
    -------
    int or float
        The integer count, or ``np.nan`` when the value is missing,
        non-finite, unparseable, or of an unsupported type.
    """
    if isinstance(value, str):
        text = value.strip().upper().replace(',', '')
        multiplier = 1
        if text.endswith('K'):
            multiplier, text = 1_000, text[:-1]
        elif text.endswith('M'):
            multiplier, text = 1_000_000, text[:-1]
        try:
            # Round instead of truncating: float multiplication can land just below
            # the intended integer, and int() alone would then be off by one.
            return int(round(float(text) * multiplier))
        except (ValueError, OverflowError):
            # Empty, non-numeric, or NaN/inf text is treated as missing.
            return np.nan
    if isinstance(value, (float, np.floating)):
        return int(value) if np.isfinite(value) else np.nan
    if isinstance(value, (int, np.integer)):
        return int(value)
    return np.nan

# Run the converter over both review-count columns, overwriting them in df1
for review_column in ('User Reviews', 'Critic Reviews'):
    df1[review_column] = df1[review_column].apply(convert_to_numeric)

print(df1)

In [None]:
# Save the converted IMDb data on its own
df1.to_csv('converted_data.csv', index=False)

# Left join on 'imdbId': every row of df is kept, and rows with no match in df1
# get NaN in the columns that come from df1
merged_df = df.merge(df1, how='left', on='imdbId')

# Keep only the first row for each imdbId
merged_df_no_duplicates = merged_df.drop_duplicates(subset=['imdbId'], keep='first')
print(merged_df_no_duplicates)

# Save the de-duplicated merge result
merged_df_no_duplicates.to_csv('merged_df_no_duplicates.csv', index=False)

In [1]:
# Handles duplicate columns; not needed now.

# url = "https://raw.githubusercontent.com/Vatsal328/Movie-Recommendation-System/main/Preprocessing%20code/IMDB%20data.csv"


# df = pd.read_csv(url)

# print(df.head())

# # Remove duplicated columns and rename them
# df = df.rename(columns={'movieId_x': 'movieId', 'tmdbId_x': 'tmdbId'})

# # Drop movieId_y and tmdbId_y columns
# df.drop(['movieId_y', 'tmdbId_y'], axis=1, inplace=True)

# print(df)