This cell processes the Netflix dataset's combined_data files (1 to 4) and merges them into a single CSV file, writing one `MovieID,User_ID,Rating,Date` row per rating.

In [1]:

import pandas as pd

# Define the output file name and the column names.
output_file = "merged_netflix_data.csv"
# NOTE: column_names is not used by this cell; no header row is written to the output file.
column_names = ["User_ID", "Rating", "Date"]

# Each input file is read as a sequence of sections: a "<movie_id>:" line followed by
# "<user_id>,<rating>,<date>" lines. Every rating line is written to the output file as
# "<movie_id>,<user_id>,<rating>,<date>", using the most recently seen movie ID.
with open(output_file, "w") as output:
    for i in range(1, 5):
        file_name = f"combined_data_{i}.txt"
        print(f"Processing {file_name}")
        with open(file_name, "r") as file:
            # The current movie ID is reset for every input file.
            movie_id = None
            for line_number, line in enumerate(file, start=1):
                record = line.strip()

                # Blank lines carry no data; skipping them avoids a failed unpack below.
                if not record:
                    continue

                # A line ending in a colon is a movie ID. Remove the colon and store the movie ID.
                if record.endswith(":"):
                    movie_id = record[:-1]
                    continue

                # Any other line is a rating line; it must belong to a movie section,
                # otherwise the row would be written with "None" as its movie ID.
                if movie_id is None:
                    raise ValueError(
                        f"{file_name}:{line_number}: rating line found before any movie ID line"
                    )

                # Split the line by commas into user ID, rating, and date
                # (raises ValueError if the line does not have exactly three fields).
                user_id, rating, date = record.split(",")
                output.write(f"{movie_id},{user_id},{rating},{date}\n")

print("Merging completed!")

Processing combined_data_1.txt
Processing combined_data_2.txt
Processing combined_data_3.txt
Processing combined_data_4.txt
Merging completed!


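Preprocess movie_titles.csv so that every row has exactly three fields (titles that contain commas are re-joined into a single title field), and write the result, with a header row, to a new file, movie_titles_processed.csv.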

In [5]:
import csv

# Open the original file and create a new file for writing.
# Both files use ISO-8859-1, the encoding with which movie_titles_processed.csv is read
# back later in this notebook; since every byte is valid ISO-8859-1, title bytes pass
# through unchanged instead of depending on the platform's default encoding.
# newline='' is what the csv module expects for files handed to csv.reader / csv.writer.
with open('movie_titles.csv', 'r', encoding='ISO-8859-1', newline='') as csv_file, \
        open('movie_titles_processed.csv', 'w', encoding='ISO-8859-1', newline='') as new_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    csv_writer = csv.writer(new_file, delimiter=',')

    # Write the header to the new file
    csv_writer.writerow(['MovieID', 'Year', 'Title'])

    # Loop through each row in the original file
    for row in csv_reader:

        # Blank lines are parsed as empty rows; do not copy them to the output
        if not row:
            continue

        # If the row has more than three fields, the title itself contained commas:
        # re-join everything after the second field into a single title field.
        # (csv.writer quotes the title on output so it is read back as one field.)
        if len(row) > 3:
            row = [row[0], row[1], ','.join(row[2:])]

        # Write the row to the new file
        csv_writer.writerow(row)


This cell loads and preprocesses the movie titles and the scraped IMDb data, and then merges them into a single DataFrame. The purpose is to create a comprehensive dataset that combines movie titles from the Netflix dataset with additional information from the IMDb dataset (genre, keywords, and aggregateRating).

In [23]:
import pandas as pd
import json

# Load movie_titles_processed.csv (the header row in the file is replaced by the names given here)
movie_titles = pd.read_csv("movie_titles_processed.csv", encoding="ISO-8859-1", header=0, names=["MovieID", "Year", "Title"])

# Load imdbscrapeddata.json (JSON text is UTF-8, so do not rely on the platform default encoding)
with open("imdbscrapeddata.json", "r", encoding="utf-8") as file:
    imdb_scraped_data = json.load(file)

# Convert JSON data to a DataFrame with one row per movie record.
# The file may hold either a single movie object or a list of movie objects; wrapping a
# list in another list would yield a single row with no "name"/"genre"/... columns.
imdb_records = imdb_scraped_data if isinstance(imdb_scraped_data, list) else [imdb_scraped_data]
imdb_scraped_data_df = pd.DataFrame(imdb_records)

# Preprocessing and extracting relevant features from the IMDb data

# Extracting genre, keywords, and aggregateRating (plus the name, which is the merge key).
# Raises KeyError if any of these fields is missing from the scraped data.
imdb_scraped_data_df = imdb_scraped_data_df[["name", "genre", "keywords", "aggregateRating"]]

# Keep one IMDb record per name: a repeated name on the right side of the left merge below
# would duplicate the matching title rows (and, later, every rating joined on MovieID).
imdb_scraped_data_df = imdb_scraped_data_df.drop_duplicates(subset="name")

# Merge movie_titles DataFrame with imdb_scraped_data_df. The result contains the movie titles from the
# Netflix dataset with additional information from the IMDb dataset (genre, keywords, and aggregateRating).
# Titles without an exact name match keep NaN in the IMDb columns.
merged_data = movie_titles.merge(imdb_scraped_data_df, left_on="Title", right_on="name", how="left")


This cell loads the merged Netflix ratings data, converts the 'Date' column to datetime values, merges the ratings with the previously created 'merged_data' DataFrame (movie titles along with IMDb information), and prints the first few rows of the final merged dataset. It requires the cell that builds 'merged_data' to have been run first.

In [22]:
# Read the merged ratings file (it is read with header=None, so the column names are
# supplied here) and convert the 'Date' strings to datetime values in the same chain.
netflix_data = pd.read_csv(
    "merged_netflix_data.csv",
    header=None,
    names=["MovieID", "User_ID", "Rating", "Date"],
).assign(Date=lambda ratings: pd.to_datetime(ratings["Date"]))

# Left-join the columns of merged_data onto every rating row, matching on MovieID
final_data = pd.merge(netflix_data, merged_data, on="MovieID", how="left")

# Show the first rows of the final merged dataset
print(final_data.head())


   MovieID  User_ID  Rating       Date    Year            Title name genre  \
0        1  1488844       3 2005-09-06  2003.0  Dinosaur Planet  NaN   NaN   
1        1   822109       5 2005-05-13  2003.0  Dinosaur Planet  NaN   NaN   
2        1   885013       4 2005-10-19  2003.0  Dinosaur Planet  NaN   NaN   
3        1    30878       4 2005-12-26  2003.0  Dinosaur Planet  NaN   NaN   
4        1   823519       3 2004-05-03  2003.0  Dinosaur Planet  NaN   NaN   

  keywords aggregateRating  
0      NaN             NaN  
1      NaN             NaN  
2      NaN             NaN  
3      NaN             NaN  
4      NaN             NaN  
