## Data Preprocessing

In [2]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from os import path, environ
from dotenv import load_dotenv
from flask_sqlalchemy import SQLAlchemy
from Website import models


In [3]:
#Load CSV file 
movies_df = pd.read_csv("IMDb_Dataset_Edited.csv")
movies_df.info()

# Replace NaN values with appropriate defaults
movies_df.fillna({
    'overview': '',
    'status': '',
    'release_year': 0,
    'popularity': 0.0,
    'vote_average': 0.0,
    'vote_count': 0,
    'adult': False,
    'overview_sentiment': 0.0,
    'all_combined_keywords': '',
    'Star1': '',
    'Star2': '',
    'Star3': '',
    'Star4': ''
}, inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89538 entries, 0 to 89537
Data columns (total 39 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       89538 non-null  int64  
 1   title                    89538 non-null  object 
 2   vote_average             89538 non-null  float64
 3   vote_count               89538 non-null  int64  
 4   status                   89538 non-null  object 
 5   release_date             89538 non-null  object 
 6   revenue                  89538 non-null  int64  
 7   runtime                  89538 non-null  int64  
 8   adult                    89538 non-null  bool   
 9   budget                   89538 non-null  int64  
 10  imdb_id                  40169 non-null  object 
 11  original_language        89538 non-null  object 
 12  original_title           89538 non-null  object 
 13  overview                 85545 non-null  object 
 14  popularity            

In [4]:
#Connect to Database
load_dotenv()
engine = create_engine(environ.get("DATABASE_URI"))
Session = sessionmaker(bind=engine)
session = Session()

**Helper Functions**

In [5]:
# Clean and split data
def clean_and_split(data, delimiter=','):
    if pd.isna(data):
        return []
    return [item.strip() for item in data.replace('[', '').replace(']', '').replace("'", "").split(delimiter)]

# Insert unique Genres, Actors, and Directors
def insert_unique_records(session, model, names):
    unique_names = set(names)
    for name in unique_names:
        if name:  # Avoid inserting empty strings
            instance = model(name=name)
            session.add(instance)
    session.commit()

1. Insert Genres and Actors (for deduplication)

In [6]:
# # Insert Genres
# unique_genres = set(genre.strip() for genres in movies_df['genres_list'] if pd.notna(genres) for genre in genres.replace('[', '').replace(']', '').replace("'", "").split(', '))
# for genre_name in unique_genres:
#     genre = models.Genre(name=genre_name.strip())
#     session.add(genre)
# session.commit()

# # Clean and insert Actors
# unique_actors = set(actor.strip() for cast_list in movies_df['Cast_list'] if pd.notna(cast_list) for actor in cast_list.replace('[', '').replace(']', '').replace("'", "").split(', '))
# for actor_name in unique_actors:
#     actor = models.Actor(name=actor_name.strip())
#     session.add(actor)
# session.commit()


# # Insert Directors
# unique_directors = set(director.strip() for directors in movies_df['Director'] if pd.notna(directors) for director in directors.split(','))
# for director_name in unique_directors:
#     director = models.Director(name=director_name)
#     session.add(director)
# session.commit()

# Prepare data for genres, actors, and directors
genres = [genre for genres in movies_df['genres_list'] for genre in clean_and_split(genres)]
actors = [actor for cast_list in movies_df['Cast_list'] for actor in clean_and_split(cast_list)]
directors = [director for director_list in movies_df['Director'] for director in clean_and_split(director_list)]

# Insert unique records for each model
insert_unique_records(session, models.Genre, genres)
insert_unique_records(session, models.Actor, actors)
insert_unique_records(session, models.Director, directors)

# After inserting unique genres, actors, and directors, the code retrieves all entries and creates dictionaries (genre_map, actor_map, director_map) 
# These dictionaries allow quick lookups for each genre, actor, or director without querying the database for each row to improve performance
genre_map = {g.name: g for g in session.query(models.Genre).all()}
actor_map = {a.name: a for a in session.query(models.Actor).all()}
director_map = {d.name: d for d in session.query(models.Director).all()}

2. Insert Movies and link relationship

In [7]:
# Insert Movies and link relationships
for index, row in movies_df.iterrows():
    try:
        movie = models.Movie(
            title=row['title'],
            overview=row['overview'],
            status=row['status'],
            release_year=row['release_year'],
            popularity=row['popularity'],
            vote_average=row['vote_average'],
            vote_count=row['vote_count'],
            adult=row['adult'],
            overview_sentiment=row['overview_sentiment'],
            all_combined_keywords=row['all_combined_keywords'],
            Star1=row['Star1'],
            Star2=row['Star2'],
            Star3=row['Star3'],
            Star4=row['Star4']
        )
        session.add(movie)
        session.commit()

    #FOR ASSOCIATION RELATIONSHIP TABLES 
        # Link Genres
        for genre_name in clean_and_split(row['genres_list']):
            genre = genre_map.get(genre_name)
            if genre:
                movie.genres.append(genre)

        # Link Actors
        for actor_name in clean_and_split(row['Cast_list']):
            actor = actor_map.get(actor_name)
            if actor:
                movie.actors.append(actor)

        # Link Directors
        for director_name in clean_and_split(row['Director']):
            director = director_map.get(director_name)
            if director:
                movie.directors.append(director)
        
        session.bulk_save_objects(movie)
        session.commit()
        
    except Exception as e:
        session.rollback()
        print(f"Error processing row {index}: {e}")

# Close the session
session.close()


Error processing row 0: 'Movie' object is not iterable
Error processing row 1: 'Movie' object is not iterable
Error processing row 2: 'Movie' object is not iterable
Error processing row 3: 'Movie' object is not iterable
Error processing row 4: 'Movie' object is not iterable
Error processing row 5: 'Movie' object is not iterable
Error processing row 6: 'Movie' object is not iterable
Error processing row 7: 'Movie' object is not iterable
Error processing row 8: 'Movie' object is not iterable
Error processing row 9: 'Movie' object is not iterable
Error processing row 10: 'Movie' object is not iterable
Error processing row 11: 'Movie' object is not iterable
Error processing row 12: 'Movie' object is not iterable
Error processing row 13: 'Movie' object is not iterable
Error processing row 14: 'Movie' object is not iterable
Error processing row 15: 'Movie' object is not iterable
Error processing row 16: 'Movie' object is not iterable
Error processing row 17: 'Movie' object is not iterable
Er

KeyboardInterrupt: 