In [25]:
import pandas as pd



# Load the prepared dataset (path is relative to the notebook's working directory).
file_path = '../preped.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Display the column labels of the loaded DataFrame as the cell output.
df.columns

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import joblib

# Predictor columns that are always selected from df.
base_features = [
    'Hidden Gem Score', 'Runtime', 'Awards Received', 'Awards Nominated For',
    'Boxoffice', 'IMDb Votes', 'Minimum Age',
]

# Genre-named columns; only those actually present in df end up in the model input.
genre_names = {
    'Action', 'Adventure', 'Animation',
    'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family',
    'Fantasy', 'History', 'Horror', 'Music', 'Musical', 'Mystery',
    'News', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western',
}

# Genre columns are appended after the base columns, in the order they occur in df.columns.
features = base_features + [col for col in df.columns if col in genre_names]

target = 'IMDb Score'

# Feature matrix used by the modelling cells, and the scaler object they fit.
X = df[features]
scaler = StandardScaler()

In [None]:
# Columns holding the three ratings; each one gets its own SVR model.
target_im, target_rt, target_mc = 'IMDb Score', 'Rotten Tomatoes Score', 'Metacritic Score'

# Arguments shared by all three splits and all three models.
split_kwargs = dict(test_size=0.2, random_state=42)
svr_kwargs = dict(kernel='rbf', C=1.0, epsilon=0.1)


def _print_metrics(heading, mae, mse, r2):
    """Print a heading line followed by the MAE, MSE and R^2 values."""
    print(heading)
    print(f'Mean Absolute Error: {mae}')
    print(f'Mean Squared Error: {mse}')
    print(f'R^2 Score: {r2}')


# NOTE: the single `scaler` object is re-fitted on each target's training split
# in turn, so after this cell it holds the fit from the Metacritic split.

# --- IMDb Score: split, scale, fit, predict, score ---
y_im = df[target_im]
X_train_im, X_test_im, y_train_im, y_test_im = train_test_split(X, y_im, **split_kwargs)
X_train_im_scaled = scaler.fit_transform(X_train_im)
X_test_im_scaled = scaler.transform(X_test_im)

# --- Rotten Tomatoes Score ---
y_rt = df[target_rt]
X_train_rt, X_test_rt, y_train_rt, y_test_rt = train_test_split(X, y_rt, **split_kwargs)
X_train_rt_scaled = scaler.fit_transform(X_train_rt)
X_test_rt_scaled = scaler.transform(X_test_rt)

# --- Metacritic Score ---
y_mc = df[target_mc]
X_train_mc, X_test_mc, y_train_mc, y_test_mc = train_test_split(X, y_mc, **split_kwargs)
X_train_mc_scaled = scaler.fit_transform(X_train_mc)
X_test_mc_scaled = scaler.transform(X_test_mc)

# One RBF-kernel SVR per target, all with the same hyperparameters.
model_im = SVR(**svr_kwargs)
model_rt = SVR(**svr_kwargs)
model_mc = SVR(**svr_kwargs)

model_im.fit(X_train_im_scaled, y_train_im)
model_rt.fit(X_train_rt_scaled, y_train_rt)
model_mc.fit(X_train_mc_scaled, y_train_mc)

# Predictions on each target's held-out split.
y_pred_im = model_im.predict(X_test_im_scaled)
y_pred_rt = model_rt.predict(X_test_rt_scaled)
y_pred_mc = model_mc.predict(X_test_mc_scaled)

# Error metrics per target, kept in individually named variables.
mae_im = mean_absolute_error(y_test_im, y_pred_im)
mse_im = mean_squared_error(y_test_im, y_pred_im)
r2_im = r2_score(y_test_im, y_pred_im)

mae_rt = mean_absolute_error(y_test_rt, y_pred_rt)
mse_rt = mean_squared_error(y_test_rt, y_pred_rt)
r2_rt = r2_score(y_test_rt, y_pred_rt)

mae_mc = mean_absolute_error(y_test_mc, y_pred_mc)
mse_mc = mean_squared_error(y_test_mc, y_pred_mc)
r2_mc = r2_score(y_test_mc, y_pred_mc)

# Report all three targets in the same order they were trained.
_print_metrics("IMDb Metrics:", mae_im, mse_im, r2_im)
_print_metrics("\nRotten Tomatoes Score Metrics:", mae_rt, mse_rt, r2_rt)
_print_metrics("\nMetacritic Score Metrics:", mae_mc, mse_mc, r2_mc)

In [None]:
import matplotlib.pyplot as plt

# One wide figure that holds three side-by-side panels.
plt.figure(figsize=(20, 6))

# (actual values, predictions, x-label, y-label, title) per panel, left to right.
panels = [
    (y_test_im, y_pred_im,
     'Actual IMDb Score', 'Predicted IMDb Score',
     'Actual vs. Predicted IMDb Scores'),
    (y_test_rt, y_pred_rt,
     'Actual Rotten Tomatoes Score', 'Predicted Rotten Tomatoes Score',
     'Actual vs. Predicted Rotten Tomatoes Scores'),
    (y_test_mc, y_pred_mc,
     'Actual Metacritic Score', 'Predicted Metacritic Score',
     'Actual vs. Predicted Metacritic Scores'),
]

for position, (actual, predicted, x_label, y_label, heading) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.scatter(actual, predicted, alpha=0.6)
    # Dashed red diagonal spanning the range of actual values (the y = x line).
    low, high = actual.min(), actual.max()
    plt.plot([low, high], [low, high], 'r--', lw=2)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(heading)

plt.tight_layout()

In [None]:
# HalvingGridSearchCV is an experimental scikit-learn estimator: importing it
# from sklearn.model_selection raises ImportError unless this enabling import
# has been executed first.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.model_selection import HalvingGridSearchCV, train_test_split

# Define target columns
target_im = 'IMDb Score'
target_rt = 'Rotten Tomatoes Score'
target_mc = 'Metacritic Score'

# Per-target storage. Each inner dict is filled with the scaled splits, the
# fitted scaler, and (after the search below) the best model and search object.
targets = {
    target_im: {},
    target_rt: {},
    target_mc: {}
}

# Stage 1: split and scale the shared feature matrix X once per target.
for target_name, target_data in targets.items():
    y = df[target_name]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # A fresh scaler per target, fitted on the training split only and stored
    # so new data points can later be transformed the same way.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    target_data['X_train_scaled'] = X_train_scaled
    target_data['X_test_scaled'] = X_test_scaled
    target_data['y_train'] = y_train
    target_data['y_test'] = y_test
    target_data['scaler'] = scaler


# Parameter grid explored by HalvingGridSearchCV.
# NOTE(review): gamma is not used by the 'linear' kernel, so the linear
# candidates differing only in gamma are redundant; the grid is left
# unchanged here so the searched candidate set stays the same.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.01, 0.1, 1],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto'] + [0.001, 0.01, 0.1, 1]
}

# Stage 2: run a successive-halving search per target and evaluate the winner.
for target_name, target_data in targets.items():
    model = SVR()
    halving_search = HalvingGridSearchCV(
        model,
        param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=1,
        n_jobs=-1,
        factor=2,
        min_resources=50,
        # Successive halving subsamples the training rows at random; fixing
        # the seed makes the selected parameters reproducible across runs.
        random_state=42,
    )
    halving_search.fit(target_data['X_train_scaled'], target_data['y_train'])
    best_model = halving_search.best_estimator_

    # Make predictions using the best model
    y_pred = best_model.predict(target_data['X_test_scaled'])

    # Evaluate the best model on the held-out split
    mae = mean_absolute_error(target_data['y_test'], y_pred)
    mse = mean_squared_error(target_data['y_test'], y_pred)
    r2 = r2_score(target_data['y_test'], y_pred)

    # Print results
    print(f"\n{target_name} Metrics:")
    print(f'Best parameters: {halving_search.best_params_}')
    print(f'Mean Absolute Error: {mae}')
    print(f'Mean Squared Error: {mse}')
    print(f'R^2 Score: {r2}')

    target_data['best_model'] = best_model
    target_data['grid_search'] = halving_search  # kept for more detailed analysis of the search

# Example of using the trained models (new data points must first be scaled
# with the scaler stored for the same target):
# targets[target_im]['best_model'].predict(targets[target_im]['scaler'].transform(new_rows))

In [None]:
import matplotlib.pyplot as plt

# One wide figure that holds a panel per target, side by side.
plt.figure(figsize=(20, 6))

# Targets in the order their panels appear, left to right.
target_names = [target_im, target_rt, target_mc]

for panel_idx, target_name in enumerate(target_names, start=1):
    # Look up everything stored for this target by the search cell.
    entry = targets[target_name]
    best_model = entry['best_model']
    y_test = entry['y_test']
    X_test_scaled = entry['X_test_scaled']

    # Predict on the held-out split with the tuned model.
    y_pred = best_model.predict(X_test_scaled)

    # Scatter of actual vs. predicted plus a dashed red y = x diagonal.
    plt.subplot(1, 3, panel_idx)
    plt.scatter(y_test, y_pred, alpha=0.6)
    low, high = y_test.min(), y_test.max()
    plt.plot([low, high], [low, high], 'r--', lw=2)
    plt.xlabel(f'Actual {target_name}')
    plt.ylabel(f'Predicted {target_name}')
    plt.title(f'Actual vs. Predicted {target_name}s')

plt.tight_layout()
plt.show()