### Installing Required Packages

In [None]:
!pip install thefuzz
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


### Loading Datasets

In [None]:
import kagglehub
import pandas as pd
from thefuzz import fuzz, process
from fuzzywuzzy import fuzz, process

#########################
####### Download ########
#########################
# dataset_path_metacritic = kagglehub.dataset_download("kashifsahil/16000-movies-1910-2024-metacritic")
# dataset_path_TMDB = kagglehub.dataset_download("asaniczka/tmdb-movies-dataset-2023-930k-movies")

# print("Path to Metacritic files:", dataset_path_metacritic)
# print("Path to TMDB files:", dataset_path_TMDB)



### Mounting Datasets

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Single definition of the Drive folder holding both CSV files, so the location
# only has to be changed in one place.
DATASET_DIR = '/content/drive/MyDrive/UCSD_Fall_2024/ECE143/ECE143 - Final Project/Datasets'

Path_Nikola_Metacritic = f'{DATASET_DIR}/16k_Movies.csv'
Path_Nikola_TMDB = f'{DATASET_DIR}/TMDB_movie_dataset_v11.csv'

# Read both CSV files into DataFrames (df_01: Metacritic file, df_02: TMDB file)
df_01 = pd.read_csv(Path_Nikola_Metacritic)
df_02 = pd.read_csv(Path_Nikola_TMDB)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Data Processing - Merging the two datasets by matching lower-cased movie titles (exact match)

In [None]:
# Build a lower-cased join key on each frame so titles compare case-insensitively
df_01['Title_norm'] = df_01['Title'].str.lower()
df_02['Title_norm'] = df_02['title'].str.lower()

# Per frame: discard rows without a key, then keep the first row for each key
df_01 = df_01.dropna(subset=['Title_norm']).drop_duplicates(subset=['Title_norm'])
df_02 = df_02.dropna(subset=['Title_norm']).drop_duplicates(subset=['Title_norm'])

# Inner-join on the key, remove the helper/unwanted columns in one call and
# expose the remaining lower-case 'genres' column under the name 'Genres'
merged_df = (
    pd.merge(df_01, df_02, on='Title_norm', suffixes=('_df1', '_df2'))
    .drop(columns=['Title_norm', 'Unnamed: 0', 'Genres'])
    .rename(columns={'genres': 'Genres'})
)

# Parse 'Release Date' as datetime; values that cannot be parsed become NaT
merged_df['Release Date'] = pd.to_datetime(merged_df['Release Date'], errors='coerce')

# Drop rows that are exact duplicates across every column
merged_df = merged_df.drop_duplicates()

# Report how many rows were matched and preview the first ten
print(f"Merged {len(merged_df)} movies based on exact title matching")
print("Merged Movies DataFrame:\n", merged_df.head(10))

Merged 14010 movies based on exact title matching
Merged Movies DataFrame:
                Title Release Date  \
0  Three Colors: Red   1994-11-23   
1     The Conformist   1970-10-22   
2        Tokyo Story   1972-03-13   
3      The Godfather   1972-03-24   
4            Boyhood   2014-07-11   
5           Playtime   1973-06-27   
6    Army of Shadows   2006-04-28   
7          Moonlight   2016-10-21   
8    Pan's Labyrinth   2006-12-29   
9        Hoop Dreams   1994-10-14   

                                         Description  Rating  \
0  Krzysztof Kieslowski closes his Three Colors t...     8.3   
1  Set in Rome in the 1930s, this re-release of B...     7.3   
2  Yasujiro Ozu’s Tokyo Story follows an aging co...     8.1   
3  Francis Ford Coppola's epic features Marlon Br...     9.3   
4  Filmed over 12 years with the same cast, Richa...     7.5   
5  Monsieur Hulot curiously wanders around a high...     7.7   
6  Making its U.S. debut, Jean-Pierre Melville's ...     7.7   
7  M

In [None]:
class MovieDataset:
    """
    Convenience wrapper around a DataFrame of movies.

    The methods read the 'Title' and 'Release Date' columns of the wrapped
    frame. None of the methods modifies the wrapped frame.
    """

    def __init__(self, dataset):
        """
        Wrap an already-loaded dataset.

        :param dataset: DataFrame holding the movie records; it is stored by
                        reference (no copy is made).
        """
        self.df = dataset

    def get_column_names(self):
        """
        Get a list of column names in the dataset.

        :return: list of column names
        """
        return self.df.columns.tolist()

    def get_movie_titles(self, num_titles=10):
        """
        Get a specified number of movie titles.

        :param num_titles: int, number of movie titles to return
        :return: list with the first ``num_titles`` values of the 'Title' column
        """
        return self.df['Title'].head(num_titles).tolist()

    def filter_by_year(self, start_year, end_year):
        """
        Filter movies within a specified year range (both ends inclusive).

        The wrapped DataFrame is left untouched: the 'Release Year' column is
        added only to the returned frame, so filtering never leaks an extra
        column into data the caller may later export.

        :param start_year: int, start of the year range
        :param end_year: int, end of the year range
        :return: new DataFrame with the movies in the specified year range,
                 including a 'Release Year' column
        """
        # Coerce so the method also works when 'Release Date' has not been
        # converted to datetime yet; unparseable values become NaT, whose year
        # is NaN and therefore never falls inside the range.
        release_year = pd.to_datetime(self.df['Release Date'], errors='coerce').dt.year

        # Series.between is inclusive on both ends by default
        in_range = release_year.between(start_year, end_year)

        # assign() returns a new frame, leaving self.df unchanged
        return self.df[in_range].assign(**{'Release Year': release_year[in_range]})

    def get_movie_info(self, title):
        """
        Get detailed information about a movie by its title.

        :param title: str, the title of the movie (exact, case-sensitive match)
        :return: Series with the first matching row, or None if not found
        """
        movie = self.df[self.df['Title'] == title]
        return movie.iloc[0] if not movie.empty else None


In [None]:
    # Example usage
    movies = MovieDataset(merged_df)

    # Display column names
    print("Column Names:", movies.get_column_names())

    # Display first 10 movie titles
    print("First 10 Movie Titles:", movies.get_movie_titles(num_titles=10))

    # Filter movies by release year (bounds are inclusive). The printed label is
    # built from the same bounds so it cannot disagree with the actual filter.
    start_year, end_year = 2000, 2005
    filtered_movies = movies.filter_by_year(start_year, end_year)
    print(f"Movies from {start_year} to {end_year}:\n", filtered_movies.head())

    # Get details for a specific movie (prints None when the title is not present)
    movie_info = movies.get_movie_info("Inception")
    print("Movie Information:\n", movie_info)

### Save the merged data into csv

In [None]:
    # Write the merged frame to 'Movies_Merged.csv' (relative path, so it lands in
    # the current working directory); index=False leaves the row index out of the file.
    merged_df.to_csv('Movies_Merged.csv', index=False)