## Data Preprocessing

In [None]:
import pandas as pd
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
import pymysql
from os import environ
from dotenv import load_dotenv
from flask_sqlalchemy import SQLAlchemy
from flask import Flask
import sys
sys.path.append('Website')  # Add the Website directory to the system path

# Website is a plain source directory (it has no setup.py / pyproject.toml), so an
# editable `pip install` of it cannot succeed. The package is imported directly
# instead, which requires the notebook's working directory to contain the
# Website folder.

# Import models
from Website.models import Movie, Genre, Actor, Director, MovieGenre, MovieActor, MovieDirector

Obtaining file:///C:/Users/user/Desktop/UOW/Internet%20and%20Web%20Development/Assignment/WebsiteNote: you may need to restart the kernel to use updated packages.



ERROR: file:///C:/Users/user/Desktop/UOW/Internet%20and%20Web%20Development/Assignment/Website does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.

[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


**Helper Functions**

In [None]:
# Load environment variables from .env file; the code below reads DB_HOST, DB_USER,
# DB_PASSWORD and DATABASE_URI from the environment.
load_dotenv()

# Define the clean_and_split function
def clean_and_split(input_string):
    """Split a stringified list such as "['Action', 'Drama']" into clean names.

    Square brackets and single/double quotes are removed, the remainder is
    split on commas, and each piece is stripped of surrounding whitespace.

    Args:
        input_string: The raw cell value (a string), or a missing value
            (None / NaN).

    Returns:
        list[str]: The non-empty names in their original order. Missing
        values, empty strings and empty lists all yield an empty list.
    """
    if pd.isna(input_string):
        return []
    # Drop list punctuation and quoting in a single pass.
    unquoted = input_string.translate(str.maketrans('', '', "[]'\""))
    # Skip blank fragments so an empty cell, an empty list or a trailing comma
    # never produces an empty-string name that would later be stored as a record.
    return [item.strip() for item in unquoted.split(',') if item.strip()]

# Define the insert_unique_records function
def insert_unique_records(session, model, items):
    """Insert one `model` row per distinct name in `items` that is not stored yet.

    Args:
        session: SQLAlchemy session used for the lookup and the inserts.
        model: Mapped class exposing a `name` column; rows are built with
            `model(name=...)`.
        items: Iterable of candidate names; duplicates are allowed.

    Each row is committed on its own so that a failing insert is rolled back
    and reported without discarding the others.
    """
    # Normalise before de-duplicating: names are stored stripped, so "A" and
    # "A " must count as the same item. Non-string and blank entries are skipped.
    unique_items = {
        item.strip() for item in items if isinstance(item, str) and item.strip()
    }
    # Only the name column is needed, so avoid loading full ORM objects.
    existing_items = {row.name for row in session.query(model.name).all()}
    # Sorted so the insertion order is deterministic across runs.
    for item_name in sorted(unique_items - existing_items):
        try:
            session.add(model(name=item_name))
            session.commit()
        except Exception as e:
            session.rollback()
            print(f"Error inserting {item_name}: {e}")

# Define the create_database_if_not_exists function
def create_database_if_not_exists():
    """Create the `moviedb` MySQL database if it is not already present.

    Connection settings are read from the DB_HOST, DB_USER and DB_PASSWORD
    environment variables after loading the .env file. No database is selected
    when connecting, since the target database may not exist yet.
    """
    # Load environment variables from .env file
    load_dotenv()
    connection = pymysql.connect(
        host=environ.get("DB_HOST"),
        user=environ.get("DB_USER"),
        password=environ.get("DB_PASSWORD")
    )
    # Release the cursor and the connection even if the statement fails.
    try:
        with connection.cursor() as cursor:
            cursor.execute("CREATE DATABASE IF NOT EXISTS moviedb")
    finally:
        connection.close()

# Make sure the target database exists before anything connects to it
create_database_if_not_exists()

# Build the Flask application and attach a SQLAlchemy instance to it
app = Flask(__name__)
app.config.update({
    'SQLALCHEMY_DATABASE_URI': environ.get('DATABASE_URI'),
    'SQLALCHEMY_TRACK_MODIFICATIONS': False,
})
db = SQLAlchemy(app)

In [None]:
# Create the necessary tables if they do not exist
# NOTE(review): `db` is a SQLAlchemy instance created in this notebook; confirm the
# models imported from Website.models are registered on it, otherwise create_all()
# has no tables to create.
with app.app_context():
    db.create_all()

# Create a new session bound to the same database the Flask app is configured
# for, instead of a hard-coded URI with embedded credentials that could point
# at a different database than the one the tables were created in.
database_uri = app.config['SQLALCHEMY_DATABASE_URI']
if not database_uri:
    raise RuntimeError("DATABASE_URI is not set; define it in the environment or the .env file")
engine = create_engine(database_uri)
Session = sessionmaker(bind=engine)
session = Session()

In [None]:
# Load the CSV file; the raw frame is kept untouched so this cell can be re-run.
movies_raw = pd.read_csv("IMDb_Dataset_Edited.csv")

# Flag rows whose genre list or director contains the "Unknown" placeholder.
# str.contains(..., na=False) treats a missing cell as "not Unknown", whereas
# `"Unknown" not in x` raises TypeError when x is NaN.
has_unknown_genre = movies_raw["genres_list"].str.contains("Unknown", regex=False, na=False)
has_unknown_director = movies_raw["Director"].str.contains("Unknown", regex=False, na=False)

# Drop flagged rows, rows without an overview, and repeated overviews (first
# occurrence wins). Chaining avoids in-place mutation of a filtered slice.
movies_df = (
    movies_raw[~has_unknown_genre & ~has_unknown_director]
    .dropna(subset=["overview"])
    .drop_duplicates(subset=["overview"])
    .reset_index(drop=True)
)
movies_df.info()

1. Insert Genres, Actors, and Directors (for deduplication)

In [None]:

# Replace NaN values with None so missing values reach the database as NULL.
# The cast to object comes first: on numeric columns pandas would otherwise
# coerce the None fill value straight back to NaN.
movies_df = movies_df.astype(object).where(movies_df.notna(), None)

# Build (title, name) pairs for every relationship, in row order.
# zip over the two columns avoids the per-row Series construction of iterrows().
movie_genres = [
    (title, genre)
    for title, genres_cell in zip(movies_df['title'], movies_df['genres_list'])
    for genre in clean_and_split(genres_cell)
]
movie_actors = [
    (title, actor)
    for title, cast_cell in zip(movies_df['title'], movies_df['Cast_list'])
    for actor in clean_and_split(cast_cell)
]
movie_directors = [
    (title, director)
    for title, director_cell in zip(movies_df['title'], movies_df['Director'])
    for director in clean_and_split(director_cell)
]

# Prepare data for genres, actors, and directors (names only, duplicates kept;
# insert_unique_records de-duplicates them).
genres = [genre for _, genre in movie_genres]
actors = [actor for _, actor in movie_actors]
directors = [director for _, director in movie_directors]

# Insert unique records for each model
insert_unique_records(session, Genre, genres)
insert_unique_records(session, Actor, actors)
insert_unique_records(session, Director, directors)



2. Insert Movies and link relationships

In [None]:
# Insert Movies and link relationships
movies_to_insert = []
movie_genres_to_insert = []
movie_actors_to_insert = []
movie_directors_to_insert = []

# Titles queued during this run. The database check below cannot see movies
# that are queued but not yet committed, so without this set a title repeated
# in the CSV would be queued (and inserted) more than once.
queued_titles = set()
skipped_duplicate_titles = 0

for index, row in movies_df.iterrows():
    try:
        movie_title = row['title']
        if movie_title in queued_titles:
            skipped_duplicate_titles += 1
            continue
        existing_movie = session.query(Movie).filter_by(title=movie_title).first()
        if not existing_movie:
            movie = Movie(
                title=row['title'],
                overview=row['overview'],
                status=row['status'],
                release_year=row['release_year'],
                popularity=row['popularity'],
                vote_average=row['vote_average'],
                vote_count=row['vote_count'],
                adult=row['adult'],
                overview_sentiment=row['overview_sentiment'],
                all_combined_keywords=row['all_combined_keywords'],
                runtime=row['runtime'],
                production_countries=row['production_countries'],  # Corrected column name
                Star1=row['Star1'],
                Star2=row['Star2'],
                Star3=row['Star3'],
                Star4=row['Star4']
            )
            movies_to_insert.append(movie)
            queued_titles.add(movie_title)
    except Exception as e:
        print(f"Error processing row {index}: {e}")

if skipped_duplicate_titles:
    print(f"Skipped {skipped_duplicate_titles} rows whose title was already queued for insertion")

# Bulk insert movies; roll back on failure so the session stays usable.
try:
    session.bulk_save_objects(movies_to_insert)
    session.commit()
except Exception:
    session.rollback()
    raise

# One entry per relationship kind:
# (label used in messages, CSV column, model, id attribute / association key, target list)
link_specs = [
    ('Genre', 'genres_list', Genre, 'genre_id', movie_genres_to_insert),
    ('Actor', 'Cast_list', Actor, 'actor_id', movie_actors_to_insert),
    ('Director', 'Director', Director, 'director_id', movie_directors_to_insert),
]
# Per-kind memo of name -> id (None when absent). Nothing is written during the
# linking loop, so each distinct name needs only one database lookup.
id_caches = {label: {} for label, *_ in link_specs}

# Link relationships
for index, row in movies_df.iterrows():
    movie = session.query(Movie).filter_by(title=row['title']).first()
    if movie:
        try:
            for label, column, model, id_key, target in link_specs:
                cache = id_caches[label]
                for related_name in clean_and_split(row.get(column, '')):
                    if related_name not in cache:
                        match = session.query(model).filter_by(name=related_name).first()
                        cache[related_name] = getattr(match, id_key) if match else None
                    related_id = cache[related_name]
                    if related_id is not None:
                        target.append({'movie_id': movie.movie_id, id_key: related_id})
                    else:
                        print(f"{label} '{related_name}' not found in database")
        except Exception as e:
            print(f"Error linking relationships for movie {movie.title}: {e}")

# Function to insert unique records for association tables
def insert_unique_association_records(session, association_table, records):
    """Insert the association rows in `records` that are not stored yet.

    Args:
        session: SQLAlchemy session used for the lookup and the insert.
        association_table: The association table; either an object exposing
            `.insert()` directly or a mapped class exposing it via `__table__`.
        records: List of dicts mapping column name to value, e.g.
            {'movie_id': 1, 'genre_id': 2}. Duplicates are allowed.

    Rows already present in the table, and repeats within `records`, are
    skipped. The remaining rows are inserted in first-seen order and committed
    together; on failure the transaction is rolled back and the error re-raised.
    """
    table = getattr(association_table, '__table__', association_table)

    # New and existing rows must be keyed identically, as sorted
    # (column, value) pairs, or the membership test below can never match
    # and every existing row would be inserted again.
    column_sets = {tuple(sorted(record)) for record in records}
    existing_keys = {
        tuple((column, getattr(row, column, None)) for column in columns)
        for row in session.query(association_table).all()
        for columns in column_sets
    }

    # A dict keeps first-seen order while de-duplicating.
    pending = {}
    for record in records:
        key = tuple(sorted(record.items()))
        if key not in existing_keys and key not in pending:
            pending[key] = dict(record)

    try:
        if pending:
            # A single executemany call instead of one statement per row.
            session.execute(table.insert(), list(pending.values()))
        session.commit()
    except Exception:
        session.rollback()
        raise

try:
    # Insert unique records for association tables
    insert_unique_association_records(session, MovieGenre, movie_genres_to_insert)
    insert_unique_association_records(session, MovieActor, movie_actors_to_insert)
    insert_unique_association_records(session, MovieDirector, movie_directors_to_insert)

    # Verify data insertion by row count; printing every row of every table
    # would flood the notebook output.
    verification_targets = (
        ("Movies", Movie),
        ("Genres", Genre),
        ("Actors", Actor),
        ("Directors", Director),
        ("MovieGenres", MovieGenre),
        ("MovieActors", MovieActor),
        ("MovieDirectors", MovieDirector),
    )
    for label, entity in verification_targets:
        print(f"{label}:", session.query(entity).count())
finally:
    # Close the session even if an insert or a count query fails
    session.close()
