# Data Preparation

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Read the CSV file (the path is relative to the notebook's working directory)
file_path = '../../preped.csv'
df = pd.read_csv(file_path)

# Select the features and target variable
features = df[['Is Series', 'Hidden Gem Score', 'Runtime', 'Minimum Age', 'Awards Received', 'Awards Nominated For', 'Boxoffice', 'Release Date', 'IMDb Votes', 'Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']]
target = df['IMDb Score']

# Drop rows with missing feature values and keep the target aligned to the
# surviving rows. `features` and `target` keep the same index so later cells
# that reuse `features` see exactly the rows selected here.
features = features.dropna()
target = target.loc[features.index]

# A row can have complete features but no IMDb score. Such rows must not reach
# the model (the estimator and the metrics reject NaN targets), so they are
# masked out for the split only, leaving `features` / `target` untouched.
has_target = target.notna()

# Split the data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    features[has_target],
    target[has_target],
    test_size=0.2,
    random_state=42,
)

# Scale the features: the scaler is fitted on the training split only and then
# applied to the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


# Fit a linear regression on the standardized training features
# (`fit` returns the estimator itself, so it can be chained on construction)
model = LinearRegression().fit(X_train_scaled, y_train)

# Predict IMDb scores for the held-out test rows
y_pred = model.predict(X_test_scaled)

# Score the predictions against the held-out targets
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both metrics, one per line, rounded to three decimals
print(
    f'Mean Squared Error: {mse:.3f}',
    f'R-squared: {r2:.3f}',
    sep='\n',
)

In [None]:
# Define the target columns for Metacritic Score and Rotten Tomatoes Score
target_column_mc = 'Metacritic Score'
target_column_rt = 'Rotten Tomatoes Score'


def _aligned_xy(feature_frame, target_series):
    """Pair a feature frame with a target, keeping only usable rows.

    Parameters
    ----------
    feature_frame : pandas.DataFrame
        Feature rows; its index must be a subset of ``target_series``' index.
    target_series : pandas.Series
        Target values for the full dataset.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        Features and target sharing the same index, restricted to rows that
        exist in ``feature_frame`` and whose target value is not missing.
    """
    y = target_series.loc[feature_frame.index].dropna()
    return feature_frame.loc[y.index], y


# `features` has already had incomplete rows removed, whereas `df` still holds
# every row. Taking the target straight from `df` would give X and y different
# lengths (train_test_split rejects that), so each target is aligned to the
# feature index and rows where that target is missing are dropped.
X_mc, y_mc = _aligned_xy(features, df[target_column_mc])
X_rt, y_rt = _aligned_xy(features, df[target_column_rt])

# Split the data into training and testing sets for Metacritic Score
X_train_mc, X_test_mc, y_train_mc, y_test_mc = train_test_split(
    X_mc, y_mc, test_size=0.2, random_state=42
)

# Split the data into training and testing sets for Rotten Tomatoes Score
X_train_rt, X_test_rt, y_train_rt, y_test_rt = train_test_split(
    X_rt, y_rt, test_size=0.2, random_state=42
)

# Create and train one regression model per target.
# Note: unlike the IMDb model, these are fitted on the unscaled features.
model_mc = LinearRegression()
model_mc.fit(X_train_mc, y_train_mc)

model_rt = LinearRegression()
model_rt.fit(X_train_rt, y_train_rt)

# Make predictions on the respective test sets
y_pred_mc = model_mc.predict(X_test_mc)
y_pred_rt = model_rt.predict(X_test_rt)

# Evaluate the model for Metacritic Score
mse_mc = mean_squared_error(y_test_mc, y_pred_mc)
r2_mc = r2_score(y_test_mc, y_pred_mc)

# Evaluate the model for Rotten Tomatoes Score
mse_rt = mean_squared_error(y_test_rt, y_pred_rt)
r2_rt = r2_score(y_test_rt, y_pred_rt)

print(f'Mean Squared Error for Rotten Tomatoes Score: {mse_rt:.3f}')
print(f'R-squared for Rotten Tomatoes Score: {r2_rt:.3f}')
print("\n")
print(f'Mean Squared Error for Metacritic Score: {mse_mc:.3f}')
print(f'R-squared for Metacritic Score: {r2_mc:.3f}')

# Visualize Default Models

In [None]:
import matplotlib.pyplot as plt

# One row of three panels: actual vs. predicted score for each rating source.
# The explicit Axes interface is used so each panel is addressed directly
# instead of relying on pyplot's implicit "current axes".
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# (label used in titles/axis text, actual test values, model predictions)
panels = [
    ('IMDb', y_test, y_pred),
    ('Metacritic', y_test_mc, y_pred_mc),
    ('Rotten Tomatoes', y_test_rt, y_pred_rt),
]

for ax, (label, actual, predicted) in zip(axes, panels):
    ax.scatter(actual, predicted, alpha=0.6)

    # Dashed red diagonal marks perfect prediction (predicted == actual),
    # spanning the range of the actual values.
    lowest, highest = actual.min(), actual.max()
    ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)

    ax.set_xlabel(f'Actual {label} Score')
    ax.set_ylabel(f'Predicted {label} Score')
    ax.set_title(f'{label} Scores')

fig.tight_layout()
plt.show()