## Abstract
##### In this project, I explored the fascinating intersection of machine learning and film. <br> I built machine learning models and trained them on a collection of movies to assess how likely I am to re-watch some of my favorite films.

### Initial Data Load and Setup
##### Initialize Missing Values For the Following Columns: Re-Watch Desire, Wins, Losses, and Ties

In [1]:
# Import Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
import numpy as np
from imdb import IMDb

In [None]:
# Define Initialize Data Function
def Initialize_Data(filename):
    """Read the movie list CSV and attach the starting ranking columns.

    Every row gets a 'Re-Watch Desire' of 5.0 and zeroed 'Wins', 'Losses',
    and 'Ties' counters (existing columns with those names are overwritten).

    Returns the resulting DataFrame.
    """
    Starting_Values = {'Re-Watch Desire': 5.0, 'Wins': 0, 'Losses': 0, 'Ties': 0}
    return pd.read_csv(filename).assign(**Starting_Values)

In [None]:
# Build the starting rankings table from the raw movie list and display it
Movies = Initialize_Data(filename='Data/Movies List.csv')
Movies

### Pairwise Movie Comparison and Feature Engineering
##### Utilizing pairwise comparison and feature engineering, I created preference data and fields encompassing the following themes: <br> Genre, Runtime, Budget, and More!

In [None]:
# Define Pairwise Movie Comparison Function
def compare_movies(Movies, Start, End, Comparison_Results):
    """Interactively rank every pair of movies taken from rows Start:End.

    The user is prompted for each pair until they answer 1, 2, or 3. A win
    raises the winner's 'Re-Watch Desire' by 0.1, lowers the loser's by 0.1,
    and increments 'Wins'/'Losses'; a tie only increments 'Ties' for both
    movies. After each recorded answer the scores are clipped to 0-10 and a
    one-line summary is appended to Comparison_Results.

    Returns the updated Movies DataFrame and the Comparison_Results list.
    """
    Candidates = list(Movies['Movie'][Start:End])

    # Visit each unordered pair exactly once
    for Position in range(len(Candidates) - 1):
        First = Candidates[Position]
        for Second in Candidates[Position + 1:]:
            # Row selectors for the two titles (the 'Movie' column never changes here)
            Is_First = Movies['Movie'] == First
            Is_Second = Movies['Movie'] == Second

            while True:
                print(f'Comparison: {First} vs {Second}')
                Answer = input('Which movie do you prefer? Enter 1 for the first, 2 for the second, or 3 for a tie: ')

                # Re-prompt until a recognised answer is given
                if Answer not in ('1', '2', '3'):
                    print('Invalid input. Please enter 1, 2, or 3.')
                    continue

                if Answer == '1':
                    Movies.loc[Is_First, 'Re-Watch Desire'] += .1
                    Movies.loc[Is_Second, 'Re-Watch Desire'] -= .1
                    Movies.loc[Is_First, 'Wins'] += 1
                    Movies.loc[Is_Second, 'Losses'] += 1
                    Outcome = f'{First} beat {Second}'
                    print(f'Result: {First} wins this round!')
                elif Answer == '2':
                    Movies.loc[Is_First, 'Re-Watch Desire'] -= .1
                    Movies.loc[Is_Second, 'Re-Watch Desire'] += .1
                    Movies.loc[Is_First, 'Losses'] += 1
                    Movies.loc[Is_Second, 'Wins'] += 1
                    Outcome = f'{Second} beat {First}'
                    print(f'Result: {Second} wins this round!')
                else:
                    Movies.loc[Is_First, 'Ties'] += 1
                    Movies.loc[Is_Second, 'Ties'] += 1
                    Outcome = f'{First} tied {Second}'
                    print(f'Result: tie between {First} and {Second}')

                # Keep every score inside the 0-10 scale
                Movies['Re-Watch Desire'] = Movies['Re-Watch Desire'].clip(0, 10)
                Comparison_Results.append(Outcome)
                break

    return Movies, Comparison_Results

#### Call Pairwise Function and Evaluate the First Ten Movies

In [None]:
# Run the pairwise comparison over the first ten rows, starting with an empty log
Movies_Df, Comparison_Log = compare_movies(Movies, Start=0, End=10, Comparison_Results=[])

#### Save Progress Made to Movies Dataset and Comparison Results

In [None]:
def save_progress(Movies, Comparison_Results, Movie_Filename='Data/movie_comparisons.csv', Results_Filename='Data/comparison_results.csv'):
    """Write the rankings table and the comparison log to CSV files.

    Movies is saved to Movie_Filename without its index; Comparison_Results
    is saved to Results_Filename as a single 'Comparison' column.
    """
    # Rankings first, then the log
    Movies.to_csv(Movie_Filename, index=False)

    Log_Frame = pd.DataFrame({'Comparison': Comparison_Results})
    Log_Frame.to_csv(Results_Filename, index=False)

    print(f'Progress saved to {Movie_Filename} and {Results_Filename}.')

In [None]:
# Persist the rankings and comparison log to the default save files
save_progress(Movies=Movies_Df, Comparison_Results=Comparison_Log)

#### Load Progress Made to Movies Dataset and Comparison Results

In [None]:
def load_progress(Movie_Filename='Data/movie_comparisons.csv', Results_Filename='Data/comparison_results.csv'):
    """Load previously saved rankings and the comparison log.

    Returns (Movies, Comparison_Results): the rankings DataFrame read from
    Movie_Filename and the 'Comparison' column of Results_Filename as a list.
    If either file does not exist, prints a notice and returns (None, []).

    Raises:
        KeyError: if the results file has no 'Comparison' column.
        Other read errors (e.g. pandas parsing errors) propagate unchanged.
    """
    try:
        Movies = pd.read_csv(Movie_Filename)
        Results_Df = pd.read_csv(Results_Filename)
    except FileNotFoundError:
        # Only a missing save file means "start fresh"; anything else
        # (corrupt file, interrupted run) should surface to the user instead
        # of being reported as a missing save.
        print('No Save file found. Starting Fresh.')
        return None, []

    return Movies, Results_Df['Comparison'].tolist()

In [None]:
# Load the Movie_Comparisons and Comparison Results
# NOTE(review): the returned (Movies, Comparison_Results) tuple is not assigned
# here, so this call only exercises the load; the trailing semicolon presumably
# just suppresses the notebook's output display - confirm whether the result
# was meant to be kept.
load_progress();

#### Define How Movies will be Evaluated in Batches

In [None]:
def evaluate_movies_in_batches(Movies, Batch_Size, Comparison_Results):
    """Rank the next batch of not-yet-evaluated movies.

    A movie counts as evaluated once it has at least one win, loss, or tie.
    The first Batch_Size unevaluated movies are compared against each other
    (via compare_movies) and then against every movie that was already
    evaluated before this call. Each answer adjusts 'Re-Watch Desire' by 0.1
    for a win/loss, updates the 'Wins'/'Losses'/'Ties' counters, clips the
    scores to 0-10, and appends a summary line to Comparison_Results.

    If no unevaluated movies remain, nothing is prompted.

    NOTE(review): Start/End are taken from the batch's index labels and used
    by compare_movies as a row slice, so this presumably relies on a default
    0..n-1 index with the unevaluated rows being contiguous - confirm.

    Returns the updated Movies DataFrame and the Comparison_Results list.
    """
    # Separate Evaluated and Non-Evaluated Movies Based on Whether They Have Any Wins, Losses or Ties
    Evaluated_Movies = Movies[(Movies['Wins'] > 0) | (Movies['Losses'] > 0) | (Movies['Ties'] > 0)]
    Non_Evaluated_Movies = Movies[(Movies['Wins'] == 0) & (Movies['Losses'] == 0) & (Movies['Ties'] == 0)]

    # Process Only One Batch of Non-Evaluated Movies
    Batch = Non_Evaluated_Movies.head(Batch_Size)
    if Batch.empty:
        return Movies, Comparison_Results

    Start = Batch.index[0]
    End = Batch.index[-1] + 1
    Movies, Comparison_Results = compare_movies(Movies, Start, End, Comparison_Results)

    # Compare Evaluated Movies with Non-Evaluated Movies in the Current Batch.
    # Evaluated_Movies was captured before the in-batch comparisons above, so
    # batch members are not paired with each other a second time.
    for _, Row in Evaluated_Movies.iterrows():
        for _, Batch_Row in Batch.iterrows():
            # Pull the titles into locals: quoting Row['Movie'] inside a
            # single-quoted f-string is a syntax error before Python 3.12.
            Seen_Title = Row['Movie']
            New_Title = Batch_Row['Movie']

            Valid_Input = False
            while not Valid_Input:
                print(f'Comparison: {Seen_Title} vs {New_Title}')
                Choice = input('Which movie do you prefer? Enter 1 for the first, 2 for the second, or 3 for a tie: ')

                if Choice not in ('1', '2', '3'):
                    print('Invalid input. Please enter 1, 2, or 3.')
                    continue
                Valid_Input = True

                if Choice == '1':
                    Movies.loc[Movies['Movie'] == Seen_Title, 'Re-Watch Desire'] += .1
                    Movies.loc[Movies['Movie'] == New_Title, 'Re-Watch Desire'] -= .1
                    Movies.loc[Movies['Movie'] == Seen_Title, 'Wins'] += 1
                    Movies.loc[Movies['Movie'] == New_Title, 'Losses'] += 1
                    Comparison_Result = f'{Seen_Title} beat {New_Title}'
                    print(f'Result: {Seen_Title} wins this round!')

                elif Choice == '2':
                    Movies.loc[Movies['Movie'] == Seen_Title, 'Re-Watch Desire'] -= .1
                    Movies.loc[Movies['Movie'] == New_Title, 'Re-Watch Desire'] += .1
                    Movies.loc[Movies['Movie'] == Seen_Title, 'Losses'] += 1
                    Movies.loc[Movies['Movie'] == New_Title, 'Wins'] += 1
                    Comparison_Result = f'{New_Title} beat {Seen_Title}'
                    print(f'Result: {New_Title} wins this round!')

                else:
                    Movies.loc[Movies['Movie'] == Seen_Title, 'Ties'] += 1
                    Movies.loc[Movies['Movie'] == New_Title, 'Ties'] += 1
                    Comparison_Result = f'{Seen_Title} tied {New_Title}'
                    print(f'Result: tie between {Seen_Title} and {New_Title}')

                # Keep every score inside the 0-10 scale
                Movies['Re-Watch Desire'] = Movies['Re-Watch Desire'].clip(0, 10)
                Comparison_Results.append(Comparison_Result)

    return Movies, Comparison_Results

#### Compare Movies in Batches, Evaluating Five Movies at a Time

In [None]:
# Processing with Batch Size of 5
Batch_Size = 5
Movies, Comparison_Results = load_progress('Data/movie_comparisons.csv', 'Data/comparison_results.csv')
if Movies is None:
    # No saved progress yet: start from the original movie list.
    # (The previous fallback, pd.DataFrame(Data), referenced an undefined name.)
    Movies = Initialize_Data('Data/Movies List.csv')
if not Comparison_Results:
    Comparison_Results = []

# Rank one batch, then persist the updated rankings and log
Movies, Comparison_Results = evaluate_movies_in_batches(Movies, Batch_Size=Batch_Size, Comparison_Results=Comparison_Results)
save_progress(Movies, Comparison_Results, 'Data/movie_comparisons.csv', 'Data/comparison_results.csv')

#### Scrape Movie Details from IMDb and Create New Features

In [None]:
def _budget_in_millions(Budget_Str):
    """Convert a '$'-prefixed budget string to a float in millions, or 'N/A'.

    Handles plain amounts (e.g. '$63,000,000 (estimated)') as well as values
    spelled with 'million' or 'thousand'. Returns 'N/A' when the budget is
    missing, does not start with '$', or cannot be parsed as a number.
    """
    if Budget_Str == 'N/A' or not Budget_Str.startswith('$'):
        return 'N/A'

    # Drop the currency sign, spaces, thousands separators, and any '(...)' suffix
    Budget_Value = Budget_Str.replace('$', '').replace(' ', '').replace(',', '').split('(')[0].lower()
    try:
        if 'million' in Budget_Value:
            return float(Budget_Value.replace('million', '').strip())
        if 'thousand' in Budget_Value:
            return float(Budget_Value.replace('thousand', '').strip()) / 1000
        return float(Budget_Value) / 1e6
    except ValueError:
        return 'N/A'


def _mpaa_numeric(Certificates, MPAA_Scale):
    """Return the numeric code of the first recognised United States rating (0 if none)."""
    for Certificate in Certificates:
        if 'United States' not in Certificate:
            continue
        # Entries are assumed to look like 'Country:Rating', optionally followed
        # by '::(note)' - so the rating is the field right after the country,
        # not the last field. TODO confirm against the IMDb library's output.
        Parts = Certificate.split(':')
        if len(Parts) > 1 and MPAA_Scale.get(Parts[1], 0):
            return MPAA_Scale[Parts[1]]
    return 0


def fetch_movie_details(input_file, output_file):
    """Look up each movie from input_file on IMDb and save its details to output_file.

    input_file must be a CSV with a 'Movie' column. For every title, the first
    IMDb search hit is used to collect: title, IMDb rating, worldwide gross,
    a numeric MPAA rating, award and foreign-filming flags, runtime, budget
    (in millions), and genres. Genres are additionally expanded into one
    0/1 column per genre. Titles with no search hit are reported and skipped.

    The result is written to output_file as CSV; nothing is returned.
    """
    # Create an Instance of the IMDb class
    ia = IMDb()

    # Load the CSV file with the Movie Comparisons and extract the titles
    Movie_Comparisons = pd.read_csv(input_file)
    Movies = Movie_Comparisons['Movie'].tolist()

    # Initialize List to Store Movie Details
    Details = []

    # MPAA Rating to Numeric Scale Mapping
    MPAA_Scale = {
        'G': 1,
        'PG': 2,
        'PG-13': 3,
        'R': 4,
        'N/A': 0 # Absence of Rating
    }

    for Movie in Movies:
        Search = ia.search_movie(Movie)
        if not Search:
            # Make skipped titles visible instead of dropping them silently
            print(f'No IMDb match found for {Movie}; skipping.')
            continue

        Movie_Info = ia.get_movie(Search[0].movieID)
        Box_Office_Info = Movie_Info.get('box office', {})

        # Extract Details
        Title = Movie_Info.get('title', 'N/A')
        Rating = Movie_Info.get('rating', 'N/A')
        Runtime = Movie_Info.get('runtime', ['N/A'])[0] # Runtime in Minutes
        Box_Office = Box_Office_Info.get('Cumulative Worldwide Gross', 'N/A')
        # The default must be a list: joining the bare string 'N/A' would
        # yield 'N, /, A' and three bogus genre columns.
        Genre = ', '.join(Movie_Info.get('genres', ['N/A']))

        # Extract MPAA Rating (Certificates); 0 if no recognised rating is found
        MPAA_Numeric = _mpaa_numeric(Movie_Info.get('certificates', []), MPAA_Scale)

        # Extract Budget and Convert to Millions
        Budget_Millions = _budget_in_millions(Box_Office_Info.get('Budget', 'N/A'))

        # Determine if the Movie Won Any Awards
        # NOTE(review): the flag is 1 whenever an 'awards' entry is present;
        # verify that get_movie() actually populates this key by default.
        Award_Winner = int(bool(Movie_Info.get('awards', {})))

        # Flag the movie when any listed filming location mentions neither USA nor Canada
        # NOTE(review): likewise confirm 'filming locations' is populated by default.
        Locations = Movie_Info.get('filming locations', [])
        Foreign_Film = int(any('USA' not in Location and 'Canada' not in Location
                               for Location in Locations))

        # Append the Movie Details to the List
        Details.append([Title, Rating, Box_Office, MPAA_Numeric, Award_Winner, Foreign_Film, Runtime,
                        Budget_Millions, Genre])

    # Create a DataFrame with the Movie Details
    Movie_Details = pd.DataFrame(Details, columns=['Movie', 'IMDb Rating', 'Box Office', 'MPAA Numeric',
                                                   'Award Winner', 'Foreign Film', 'Runtime (min)', 'Budget (In Millions)',
                                                   'Genre'])

    # Split Genre into Separate Columns
    Genres_Split = Movie_Details['Genre'].str.get_dummies(sep=', ')

    # Combine Movie Details with the New Columns
    Modified_Data = pd.concat([Movie_Details, Genres_Split], axis=1)

    # Save the DataFrame to CSV file
    Modified_Data.to_csv(output_file, index=False)
    print(f'Movie details saved to {output_file}')

# Scrape IMDb details for every movie in the saved rankings
fetch_movie_details(input_file='Data/movie_comparisons.csv', output_file='Data/movie_details.csv')

##### Additional Data Cleaning was Done Afterwards to Ensure Movie Details are Accurate

#### Merge Movie Details Data with Rankings Data

In [None]:
# Read the scraped details and the pairwise rankings
Movie_Details = pd.read_csv('Data/movie_details.csv')
Movie_Comparisons = pd.read_csv('Data/movie_comparisons.csv')

# Keep only movies present in both tables, matched on the Movie field
Merged_Data = Movie_Comparisons.merge(Movie_Details, on='Movie', how='inner')

### Machine Learning
##### I prepared Random Forest and XGBoost models on the newly created preference data, evaluating their predictive accuracy. <br> I then evaluated these models to identify the best model to generate new predictions on additional movies, assessing the re-watch potential of each film.

In [None]:
# Candidate predictors: movie attributes first, then one indicator column per genre
Initial_Features = [
    'IMDb Rating',
    'Box Office (In Millions)',
    'MPAA Numeric',
    'Award Winner',
    'Foreign Film',
    'Runtime (min)',
    'Budget (In Millions)',
    'Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
    'Drama', 'Family', 'Fantasy', 'History', 'Music', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War',
]

# Feature matrix and target
Y = Merged_Data['Re-Watch Desire']
X = Merged_Data[Initial_Features]

#### Evaluate the Correlation between Features

In [None]:
# Define a function to identify strongly correlated features
def find_strong_correlations(Data, Threshold=0.7):
    """List the feature pairs whose absolute correlation exceeds Threshold.

    Each pair is reported once (lower triangle of the correlation matrix)
    as a row of a DataFrame with columns 'Feature 1', 'Feature 2', and
    'Correlation'.
    """
    Corr_Matrix = Data.corr()
    Labels = list(Corr_Matrix.columns)

    # Pair every column with the columns that precede it
    Pairs = [
        (Later, Earlier, Corr_Matrix.iloc[Row_Pos, Col_Pos])
        for Row_Pos, Later in enumerate(Labels)
        for Col_Pos, Earlier in enumerate(Labels[:Row_Pos])
        if abs(Corr_Matrix.iloc[Row_Pos, Col_Pos]) > Threshold
    ]

    return pd.DataFrame(Pairs, columns=['Feature 1', 'Feature 2', 'Correlation'])

# Report every feature pair correlated above 0.7
Strong_Correlations = find_strong_correlations(Data=X, Threshold=0.7)
print(Strong_Correlations)

![Alt text](Images/Correlations.png)

#### Update Features to Remove Strong Correlations

In [None]:
# Old Features (removed 'Adventure', 'Animation', and 'Family')

''' 
Initial_Features = [
    'IMDb Rating', 'Box Office (In Millions)', 'MPAA Numeric', 'Award Winner', 'Foreign Film', 'Runtime (min)', 
    'Budget (In Millions)', 'Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Drama', 
    'Family', 'Fantasy', 'History', 'Music', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 
    'Thriller', 'War'
]
'''

# Select Relevant Features (22 Features in Total:
# the 25 initial features minus 'Adventure', 'Animation', and 'Family')
Relevant_Features = [
    'IMDb Rating', 'Box Office (In Millions)', 'MPAA Numeric', 'Award Winner', 'Foreign Film', 'Runtime (min)',
    'Budget (In Millions)', 'Action', 'Biography', 'Comedy', 'Crime', 'Drama', 'Fantasy', 'History', 'Music',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War'
]

# Feature matrix restricted to the relevant features, and the prediction target
X_Updated = Merged_Data[Relevant_Features]
Y = Merged_Data['Re-Watch Desire']

#### Prepare Random Forest Machine Learning Model

In [None]:
# Split the Data into Training and Testing sets
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X_Updated, Y, test_size=0.2, random_state=42)

# Set up the hyperparameter grid for tuning the Random Forest model.
# max_features=1.0 (consider every feature) replaces the former 'auto' option:
# for regressors it meant the same thing, and newer scikit-learn releases
# reject 'auto', which would make a third of the grid fail to fit.
Param_Grid = {
    'n_estimators': [100, 500, 1000],
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize the Random Forest model
RF_Model = RandomForestRegressor(random_state=42)

# Use GridSearchCV to find the best parameters
Grid_Search = GridSearchCV(estimator=RF_Model, param_grid=Param_Grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=2)
Grid_Search.fit(X_Train, Y_Train)

# Select the best estimator. GridSearchCV (refit=True by default) has already
# retrained it on the full training set, so no additional fit is needed.
Best_RF_Model = Grid_Search.best_estimator_

# Make Predictions on the Test Set
Y_Pred = Best_RF_Model.predict(X_Test)

# Evaluate the Model
MAE = mean_absolute_error(Y_Test, Y_Pred)
RMSE = np.sqrt(mean_squared_error(Y_Test, Y_Pred))

print(f'Mean Absolute Error: {MAE}')
print(f'Root Mean Squared Error: {RMSE}')

# Evaluate if Overfitting Occurred (Training Data)
Y_Train_Pred = Best_RF_Model.predict(X_Train)
Train_MAE = mean_absolute_error(Y_Train, Y_Train_Pred)
Train_RMSE = np.sqrt(mean_squared_error(Y_Train, Y_Train_Pred))

print(f'Training MAE: {Train_MAE}')
print(f'Training RMSE: {Train_RMSE}')

# Cross-validation on training data to assess overfitting
# (scores are negated MAE, so flip the sign of the mean)
CV_Scores = cross_val_score(Best_RF_Model, X_Train, Y_Train, cv=5, scoring='neg_mean_absolute_error')
Mean_CV_Score = -np.mean(CV_Scores)

print(f'Cross-Validation MAE: {Mean_CV_Score}')

##### Best Hyperparameters: {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}

In [None]:
'''
Random Forest:
Mean Absolute Error (MAE): 1.9467821317003164
Root Mean Squared Error (RMSE): 2.372680695943936
Training MAE: 1.8986799415670812
Training RMSE: 2.259468144727309
Cross-Validation MAE: 2.3795859173805205
'''

##### Save RF Model for Easy Access

In [None]:
# Make sure the output folder exists, then persist the tuned Random Forest
os.makedirs('Models', exist_ok=True)
joblib.dump(Best_RF_Model, 'Models/best_rf_model.pkl')

#### Prepare XGBoost Machine Learning Model

In [None]:
# Split the Data into Training and Testing sets
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X_Updated, Y, test_size=0.2, random_state=42)

# Set up the hyperparameter grid for tuning the XGBoost model
# (3 values for each of 8 parameters = 6,561 combinations, each fit 5 times)
Param_Grid = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

# Initialize the XGBoost model
XGB_Model = xgb.XGBRegressor(random_state=42, objective='reg:squarederror')

# Use GridSearchCV to find the best parameters
Grid_Search = GridSearchCV(estimator=XGB_Model, param_grid=Param_Grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=2)
Grid_Search.fit(X_Train, Y_Train)

# Select the best estimator. GridSearchCV (refit=True by default) has already
# retrained it on the full training set, so no additional fit is needed.
Best_XGB_Model = Grid_Search.best_estimator_

# Make Predictions on the Test Set
Y_Pred = Best_XGB_Model.predict(X_Test)

# Evaluate the Model
MAE = mean_absolute_error(Y_Test, Y_Pred)
RMSE = np.sqrt(mean_squared_error(Y_Test, Y_Pred))

print(f'Mean Absolute Error: {MAE}')
print(f'Root Mean Squared Error: {RMSE}')

# Evaluate if Overfitting Occurred (Training Data)
Y_Train_Pred = Best_XGB_Model.predict(X_Train)
Train_MAE = mean_absolute_error(Y_Train, Y_Train_Pred)
Train_RMSE = np.sqrt(mean_squared_error(Y_Train, Y_Train_Pred))

print(f'Training MAE: {Train_MAE}')
print(f'Training RMSE: {Train_RMSE}')

# Cross-validation on training data to assess overfitting
# (scores are negated MAE, so flip the sign of the mean)
CV_Scores = cross_val_score(Best_XGB_Model, X_Train, Y_Train, cv=5, scoring='neg_mean_absolute_error')
Mean_CV_Score = -np.mean(CV_Scores)

print(f'Cross-Validation MAE: {Mean_CV_Score}')

##### Best Hyperparameters: {'colsample_bytree': 1.0, 'gamma': 5, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 1, 'reg_lambda': 2, 'subsample': 0.6}

In [None]:
'''
XGBoost Results:
Mean Absolute Error: 1.9968243885040287
Root Mean Squared Error: 2.5532722662428746
Training MAE: 1.7757338047027589
Training RMSE: 2.107246647250733
Cross-Validation MAE: 2.343489589180265
'''

##### Save XGB Model for Easy Access

In [None]:
# Make sure the output folder exists, then persist the tuned XGBoost model
# (Best_XGB_Model is the grid search's best estimator)
os.makedirs('Models', exist_ok=True)
joblib.dump(Best_XGB_Model, 'Models/best_xgb_model.pkl')

#### Model Comparison

In [None]:
'''
Random Forest:
Mean Absolute Error (MAE): 1.9467821317003164
Root Mean Squared Error (RMSE): 2.372680695943936
Training MAE: 1.8986799415670812
Training RMSE: 2.259468144727309
Cross-Validation MAE: 2.3795859173805205

XGBoost Results:
Mean Absolute Error (MAE): 1.9968243885040287
Root Mean Squared Error (RMSE): 2.5532722662428746
Training MAE: 1.7757338047027589
Training RMSE: 2.107246647250733
Cross-Validation MAE: 2.343489589180265
'''

#### Takeaways From Model Comparison:

##### MAE & RMSE <br> Random Forest is performing better on the test set in terms of MAE and RMSE, suggesting it is better at generalizing to unseen data. <br> XGBoost has better training performance but might be slightly overfitting the data. <br><br> Cross-Validation MAE <br> XGBoost has a marginally better Cross-Validation MAE, indicating that it has better performance across different splits of the data.

##### Given that Random Forest has lower test error (MAE & RMSE), Random Forest is the better model in this case for generalization on unseen data. 

#### Scrape Details from IMDb for New Movies

In [None]:
# Scrape IMDb details for the new movies: DiDi, Forrest Gump, American Psycho, Iron Man, American Beauty
fetch_movie_details(input_file='Data/new_movies.csv', output_file='Data/new_details.csv')

##### Additional Data Cleaning was Done Afterwards to Ensure Movie Details are Accurate

#### Apply the Random Forest Model to New Movies

In [None]:
# Load the trained Random Forest model from the saved file
Best_RF_Model = joblib.load('Models/best_rf_model.pkl')

# Load New Movie Details
New_Details = pd.read_csv('Data/new_details.csv')

# Feed the model exactly the columns it was trained on, in training order, so
# extra or reordered columns in the CSV cannot break or skew the prediction.
# Models fitted without column names fall back to "everything except Movie".
Trained_Features = getattr(Best_RF_Model, 'feature_names_in_', None)
if Trained_Features is not None:
    Model_Input = New_Details[list(Trained_Features)]
else:
    Model_Input = New_Details.drop(columns=['Movie'])

# Use the model to predict the 'Re-Watch Desire' for the new movies
Rewatch_Predictions = Best_RF_Model.predict(Model_Input)

# Create a new dataframe with only 'Movie' and 'Predicted Re-Watch Desire'
Results = New_Details[['Movie']].copy()
Results['Predicted Re-Watch Desire'] = Rewatch_Predictions

# Print the results
print(Results)

# Save the results to 'RF_Predictions.csv'
Results.to_csv('Data/RF_Predictions.csv', index=False)

![Alt text](Images/RF_Predictions.png)

#### Perform Pairwise Comparison on New Movies to Figure out Actual Re-Watch Desire

In [None]:
# Processing with Batch Size of 5
Batch_Size = 5
Movies, Comparison_Results = load_progress('Data/new_movie_comparisons.csv', 'Data/comparison_results.csv')
if Movies is None:
    # No saved progress for the new movies yet: start from the new movie list.
    # (The previous fallback, pd.DataFrame(Data), referenced an undefined name.)
    Movies = Initialize_Data('Data/new_movies.csv')
if not Comparison_Results:
    Comparison_Results = []

# NOTE(review): the comparison log is written to the same file used for the
# original movie set; on a first run (no new_movie_comparisons.csv yet) the
# log starts empty and the save below replaces that file - confirm this
# sharing is intended.
Movies, Comparison_Results = evaluate_movies_in_batches(Movies, Batch_Size=Batch_Size, Comparison_Results=Comparison_Results)
save_progress(Movies, Comparison_Results, 'Data/new_movie_comparisons.csv', 'Data/comparison_results.csv')

#### Comparison Between Predicted Re-Watch Desire and Actual

In [None]:
# Load the Random Forest Predictions
Predictions_Data = pd.read_csv('Data/RF_Predictions.csv')

# Load the New Movie Comparisons Dataset
New_Movie_Comparisons = pd.read_csv('Data/new_movie_comparisons.csv')
New_Movie_Comparisons.rename(columns={'Re-Watch Desire': 'Actual Re-Watch Desire'}, inplace=True)

# Merge the New Movie Comparisons Dataset with the RF Predictions
Merged_Data = pd.merge(Predictions_Data, New_Movie_Comparisons, on='Movie')

# Round Both Predicted and Actual Values to one decimal place
Merged_Data['Actual Re-Watch Desire (Rounded)'] = Merged_Data['Actual Re-Watch Desire'].round(1)
Merged_Data['Predicted Re-Watch Desire (Rounded)'] = Merged_Data['Predicted Re-Watch Desire'].round(1)

# Calculate the Difference (predicted minus actual) and round it to one decimal place
Merged_Data['Difference'] = (Merged_Data['Predicted Re-Watch Desire (Rounded)'] -
                             Merged_Data['Actual Re-Watch Desire (Rounded)']).round(1)

# Adjust the Comparison Columns in a Pandas Table Format
Results_Comparison = Merged_Data[['Movie', 'Actual Re-Watch Desire (Rounded)', 'Predicted Re-Watch Desire (Rounded)', 'Difference']]

# Save the Comparison Result to a CSV File
Results_Comparison.to_csv('RF_Results_Comparison.csv', index=False)

# Display the formatted table. This must be the last expression in the cell:
# a bare expression earlier in the cell is evaluated but never shown.
Results_Comparison.head()

![Alt text](Images/RF_Comparison.png)

#### Findings

##### Overall, the model performs reasonably well for most movies, with minimal differences in predictions for DiDi, Iron Man, and American Beauty. <br><br> The model struggles slightly with movies like Forrest Gump and American Psycho, where it underestimates or overestimates the re-watch desire by about one point. <br><br> This indicates that the model may benefit from exposure to more movies like these to improve its prediction accuracy, especially for certain genres or types of films.

### Data Visualizations

#### Explore What Keeps Me Coming Back to my Favorite Films

In [None]:
# Restore the saved Random Forest model and the merged dataset
Best_RF_Model = joblib.load('Models/best_rf_model.pkl')
Merged_Data = pd.read_csv('Data/Merged_Data.csv')

# Columns the plot labels are taken from
Relevant_Features = [
    'IMDb Rating', 'Box Office (In Millions)', 'MPAA Numeric', 'Award Winner', 'Foreign Film', 'Runtime (min)',
    'Budget (In Millions)', 'Action', 'Biography', 'Comedy', 'Crime', 'Drama', 'Fantasy', 'History', 'Music',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War'
]
Relevant_Data = Merged_Data[Relevant_Features]

# Importance scores and the matching column labels
Feature_Names = Relevant_Data.columns
Importances = Best_RF_Model.feature_importances_

# Ascending order, so the most important feature ends up as the top bar
Indices = np.argsort(Importances)
Bar_Positions = list(range(len(Indices)))
Sorted_Labels = [Feature_Names[i] for i in Indices]

# Horizontal bar chart of the sorted importances
Figure, Axis = plt.subplots(figsize=(10, 6))
Axis.set_title('Feature Importance in Re-Watch Predictions')
Axis.barh(Bar_Positions, Importances[Indices], color='#000080', align='center')
Axis.set_yticks(Bar_Positions)
Axis.set_yticklabels(Sorted_Labels)
Axis.set_xlabel('Relative Importance')
plt.show()

![Alt text](Images/Feature_Importance_in_Re-Watch_Predictions.png)

#### Visualization Findings

* **IMDb Rating is the most significant factor in predicting Re-Watch Desire, suggesting high ratings align with high Re-Watch Desire**
* **Runtime plays a significant role, suggesting a potential interest in watching longer movies rather than shorter movies**
* **Box Office Earnings suggests a stronger Re-Watch Desire for movies that performed well in theaters and had mass appeal**
* **The Comedy genre is a significant feature, suggesting that humor is a key driver of Re-Watch Desire**
* **Award Wins and Budget show moderate importance, suggesting critically acclaimed high-budget films are more likely to be re-watched**
<br><br>

In [None]:
# Load Final_Movies_Data
Final_Movies_Data = pd.read_csv('Data/Final_Movies_Data.csv')

# Define genre columns
genre_columns = ['Action', 'Biography', 'Comedy', 'Crime', 'Drama', 'Fantasy', 'History',
                 'Music', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War']

# Calculate the count of movies in each genre (sum of the 0/1 indicator columns)
genre_counts = Final_Movies_Data[genre_columns].sum()

# We melt the data to transform it into a format where we can plot genre vs. re-watch desire
melted_data = Final_Movies_Data.melt(id_vars=['Re-Watch Desire'], value_vars=genre_columns,
                                     var_name='Genre', value_name='Is_Genre')

# Filter only for rows where the genre is relevant (Is_Genre == 1).
# Take an explicit copy so the column assignment below modifies an independent
# frame instead of a slice of melted_data (avoids SettingWithCopyWarning).
filtered_data = melted_data[melted_data['Is_Genre'] == 1].copy()

# Ensure genres in 'filtered_data' are sorted in the same order as genre_counts
filtered_data['Genre'] = pd.Categorical(filtered_data['Genre'], categories=genre_counts.index, ordered=True)

# Create the boxplot
plt.figure(figsize=(10, 6))
sns.boxplot(x='Genre', y='Re-Watch Desire', data=filtered_data, color='#000080', order=genre_counts.index)

# Add the number of movies per genre to the x-axis labels
new_labels = [f'{genre} ({int(genre_counts[genre])})' for genre in genre_counts.index]
plt.xticks(range(len(new_labels)), new_labels, rotation=45)

# Customizing the plot
plt.title('Re-Watch Desire Distribution by Genre', fontsize=14)
plt.xlabel('Genre', fontsize=12)
plt.ylabel('Re-Watch Desire', fontsize=12)
plt.show()

![Alt text](Images/Re-Watch_Desire_Distribution_by_Genre.png)

#### Visualization Findings

**Note: One Movie Can Fit into Multiple Genres**

* **Romance, Crime, and Drama genres exhibit higher medians, suggesting these genres are more likely to be re-watched**
* **Mystery has the highest median, but there are limited movies fitting this genre type in my collection**
* **History and Biography films have lower medians, suggesting these genres are less likely to be re-watched**
* **Biography, Fantasy, and Sci-Fi films have wide ranges in Re-Watch Desire, indicating these genres evoke a wide range of preferences**
* **Mystery and Sport genres have the least variability, but there are also fewer movies fitting these genre types**
<br><br>

In [None]:
# Load the dataset
Final_Movies_Data = pd.read_csv('Data/Final_Movies_Data.csv')

# Define Pie Chart Colors
Blue_Shades = ['#73C2FB', '#C6E6FB', '#007FFF', '#CCCCFF']

# Group by 'MPAA Numeric' once; the averages and the movie counts both come
# from this grouping, so they always share the same categories and order.
Re_Watch_By_Rating = Final_Movies_Data.groupby('MPAA Numeric')['Re-Watch Desire']

# Calculate the mean of 'Re-Watch Desire' for each MPAA Numeric category
MPAA_AVG_Re_Watch = Re_Watch_By_Rating.mean()

# Replace the MPAA Numeric values with their corresponding labels (G, PG, PG-13, R);
# 0 marks a movie without a rating
MPAA_Labels = {0: 'N/A', 1: 'G', 2: 'PG', 3: 'PG-13', 4: 'R'}
MPAA_AVG_Re_Watch_Labeled = MPAA_AVG_Re_Watch.rename(index=MPAA_Labels)

# Calculate how many movies fit into each MPAA category, using the same order as MPAA_AVG_Re_Watch
MPAA_Movie_Count = Re_Watch_By_Rating.size().rename(index=MPAA_Labels)

# The wedges are sized by each category's share of the summed averages; autopct
# receives that share as a percentage, so scaling it by the total recovers the
# category's average for display.
Total_Of_Averages = MPAA_AVG_Re_Watch_Labeled.sum()

# Create the pie chart with numeric values for the average Re-Watch Desire
plt.figure(figsize=(8, 6))
plt.pie(MPAA_AVG_Re_Watch_Labeled,
        labels=[f'{label} ({count})' for label, count in MPAA_Movie_Count.items()],
        autopct=lambda p: f'{p * Total_Of_Averages / 100:.2f}',
        colors=Blue_Shades)

plt.title('Average Re-Watch Desire by MPAA Rating', fontsize=14)
plt.show()

![Alt text](Images/Average_Re-Watch_Desire_by_MPAA_Rating.png)

#### Visualization Findings

* **R-Rated films have highest Re-Watch Desire, suggesting more mature content tends to be re-watched**
* **G-Rated films have second highest Re-Watch Desire, indicating family-friendly or universally accessible films are still enjoyed**
* **PG-13 movies have a moderate Re-Watch Desire, possibly explained by these movies catering to audiences of a wider variety**
* **PG movies have the lowest Re-Watch Desire; although intended for a broad audience, these films might be less engaging**
* **General Implications: this data shows that more mature content tends to draw higher Re-Watch Desire**
<br><br>

![Alt text](Images/Final_Movie_Re_Watch.png)