In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# --- 1. The Mock Artifacts: Creating our data ---

# Seed NumPy's global generator so every run of this cell draws the same sample.
np.random.seed(42)

# 100 movie runtimes in minutes, drawn uniformly between 60 and 180.
duration = np.random.uniform(60, 180, 100)

# One Gaussian noise term per movie (mean 0, standard deviation 0.8).
# Drawn after the durations so the random stream is consumed in a fixed order.
noise = np.random.normal(0, 0.8, 100)

# Scores follow the straight line 4.0 + 0.03 * duration, perturbed by the noise.
imdb_score = 4.0 + 0.03 * duration + noise

In [None]:
# Bundle the two arrays into a single table: one row per movie, one column
# per variable.
movies_df = pd.DataFrame(dict(duration=duration, imdb_score=imdb_score))

# Preview the first five rows, then print a divider line.
print("--- The First 5 Movies in Our Dataset ---")
print(movies_df.head(5))
print("\n" + "="*40 + "\n")

--- The First 5 Movies in Our Dataset ---
     duration  imdb_score
0  104.944814    7.217982
1  174.085717    8.983366
2  147.839273    8.508587
3  131.839018    6.365115
4   78.722237    6.185930




In [None]:
# --- 2. Preparing the Ritual: Splitting our data ---
# Target: the score column as a one-dimensional Series.
y = movies_df['imdb_score']
# Features: selected with a list of column names so the result stays a
# two-dimensional, one-column DataFrame rather than collapsing to a Series.
X = movies_df[['duration']]

In [18]:
# Split features and target together so their rows stay aligned.
# test_size=0.2 holds out 20% of the movies for testing; random_state=42
# fixes the shuffle so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many movies ended up in each split, then print a divider line.
print(f"Training set size: {X_train.shape[0]} movies")
print(f"Testing set size: {X_test.shape[0]} movies")
print("\n" + "="*40 + "\n")

Training set size: 80 movies
Testing set size: 20 movies




In [None]:
# --- 3. The Grand Ritual: Training the Linear Regression Model ---
# Create an unfitted ordinary-least-squares regressor. fit_intercept=True is
# the library default; it is spelled out here because a later cell reads
# model.intercept_ when printing the fitted formula.
model = LinearRegression(fit_intercept=True)

In [None]:
# Fit the regressor on the training split only; the test split is kept
# unseen so it can be used for evaluation afterwards.
# Fitting estimates the intercept and the slope of the best-fit line that
# relates movie duration to IMDB score.

print("Training the model...")
model.fit(X=X_train, y=y_train)
print("Training complete! The model has found its prophecy line.")
print("\n" + "="*40 + "\n")

Training the model...
Training complete! The model has found its prophecy line.




In [None]:
# --- 4. The First Prophecy: Making a prediction ---
# Predict a score for every held-out movie. The predictions come back in the
# same row order as X_test, so they can be compared with y_test by position.
y_pred = model.predict(X_test)

print("--- The Model's Prophecies vs. The Real Scores ---")
# Show at most the first five test movies. Slices clamp to the available
# length, so this cell no longer raises IndexError when the test split holds
# fewer than five rows (the old range(5) indexing did).
# X_test.iloc[:5, 0] reads the first (and only) feature column by position.
preview = zip(X_test.iloc[:5, 0], y_pred[:5], y_test.iloc[:5])
for minutes, predicted, actual in preview:
    print(f"Movie duration: {minutes:.2f} min")
    print(f"  Model's Prophecy (Predicted Score): {predicted:.2f}")
    print(f"  True Score: {actual:.2f}\n")

--- The Model's Prophecies vs. The Real Scores ---
Movie duration: 67.63 min
  Model's Prophecy (Predicted Score): 6.12
  True Score: 6.55

Movie duration: 167.38 min
  Model's Prophecy (Predicted Score): 8.85
  True Score: 8.62

Movie duration: 152.67 min
  Model's Prophecy (Predicted Score): 8.45
  True Score: 7.59

Movie duration: 139.50 min
  Model's Prophecy (Predicted Score): 8.09
  True Score: 7.46

Movie duration: 91.05 min
  Model's Prophecy (Predicted Score): 6.76
  True Score: 7.36



In [27]:
# --- 5. Evaluating the Prophecy's Accuracy ---
# Mean Squared Error (MSE) quantifies how far the predictions are from the
# true scores: it is the average of the squared differences, so lower is better.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"The model's overall prediction error (Mean Squared Error): {mse:.2f}")

# The fitted line is fully described by two numbers: the intercept and the
# single coefficient (slope) learned for the duration feature.
print(f"The model's formula is approximately: IMDB_Score = {model.intercept_:.2f} + {model.coef_[0]:.2f} * duration")

The model's overall prediction error (Mean Squared Error): 0.42
The model's formula is approximately: IMDB_Score = 4.27 + 0.03 * duration
