In [1]:
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Step 1: Load Data

In [3]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Step 1: Fetch the Web Page
url = "https://www.the-numbers.com/movie/budgets/all"
# The HTTP request headers get their own name so they are not clobbered by the
# table's column names (`headers`) assigned further down in this cell.
request_headers = {"User-Agent": "Mozilla/5.0"}
# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url, headers=request_headers, timeout=30)
# Fail here on a 4xx/5xx response instead of trying to parse an error page.
response.raise_for_status()

# Step 2: Parse HTML
soup = BeautifulSoup(response.text, "html.parser")
table = soup.find("table")  # first <table> element in the page
if table is None:
    raise ValueError(f"No <table> element found at {url}")

# Step 3: Extract Column Names
headers = [th.text.strip() for th in table.find_all("th")]

# Step 4: Extract Table Rows
data = []
for row in table.find_all("tr")[1:]:  # Skip header row
    cols = [td.text.strip() for td in row.find_all("td")]
    # Keep only rows with exactly one cell per column name.
    if len(cols) == len(headers):
        data.append(cols)

# Step 5: Convert to DataFrame
# Every value is still text at this point; numeric conversion happens later.
df = pd.DataFrame(data, columns=headers)

# Display first few rows
df.head()


Unnamed: 0,Unnamed: 1,ReleaseDate,Movie,ProductionBudget,DomesticGross,WorldwideGross
0,1,"Dec 16, 2015",Star Wars Ep. VII: The Force Awakens,"$533,200,000","$936,662,225","$2,056,046,835"
1,2,"Apr 23, 2019",Avengers: Endgame,"$400,000,000","$858,373,000","$2,748,242,781"
2,3,"Dec 9, 2022",Avatar: The Way of Water,"$400,000,000","$684,075,767","$2,320,250,281"
3,4,"May 21, 2025",Mission: Impossible—The Final Reckoning,"$400,000,000",$0,$0
4,5,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$379,000,000","$241,071,802","$1,045,713,802"


In [12]:
# Step 6: Data Cleaning
# The money columns are scraped as text (e.g. "$533,200,000"). Remove the "$"
# and "," characters, then cast what remains to float.
currency_chars = '[\\$,]'
budget_text = df["ProductionBudget"]
gross_text = df["WorldwideGross"]

production_budget = budget_text.replace(currency_chars, '', regex=True).astype(float)
worldwide_gross = gross_text.replace(currency_chars, '', regex=True).astype(float)

# Profit is worldwide gross minus production budget, row by row.
profit = worldwide_gross - production_budget

print(worldwide_gross)

0     2.056047e+09
1     2.748243e+09
2     2.320250e+09
3     0.000000e+00
4     1.045714e+09
          ...     
95    2.104698e+08
96    2.195355e+08
97    2.714675e+08
98    1.858842e+08
99    4.071508e+08
Name: WorldwideGross, Length: 100, dtype: float64


In [16]:
# Step 7: Define X and Y
X = production_budget  # Feature: the cleaned production budget
y = profit  # Target: worldwide gross minus production budget

# Step 8: Split into Training and Test Sets
# 20% of the rows are held out for testing; random_state pins the shuffle so
# the same split is produced on every run.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Show the first rows of each of the four pieces.
print(*(part.head() for part in splits))

55    210000000.0
88    200000000.0
26    250000000.0
42    225000000.0
69    200000000.0
Name: ProductionBudget, dtype: float64 83    200000000.0
53    210000000.0
70    200000000.0
45    225000000.0
44    225000000.0
Name: ProductionBudget, dtype: float64 55    1.124590e+08
88    1.935833e+08
26    5.955558e+08
42    1.290100e+09
69    7.522250e+08
dtype: float64 83    3.601554e+08
53    8.940541e+08
70    1.014631e+09
45    1.923413e+08
44    4.429995e+08
dtype: float64


In [None]:
# Step 9: Train Linear Regression Model
# scikit-learn estimators expect a 2-D feature matrix of shape
# (n_samples, n_features). X_train / X_test carry a single feature as 1-D data,
# so turn each into a one-column array before fitting and predicting.
X_train_2d = np.asarray(X_train, dtype=float).reshape(-1, 1)
X_test_2d = np.asarray(X_test, dtype=float).reshape(-1, 1)

model = LinearRegression()
model.fit(X_train_2d, y_train)

# Step 10: Evaluate Model
y_pred = model.predict(X_test_2d)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")

# Step 11: Predict Profit for a $500,000 Budget
# NOTE(review): the budgets printed earlier in the notebook are in the hundreds
# of millions, so a $500,000 budget lies far outside the data the line was
# fitted on -- treat this prediction as an extrapolation.
budget = np.array([[500000]])  # 2-D: one sample, one feature
predicted_profit = model.predict(budget)[0]
print(f"Predicted Profit for $500,000 budget: ${predicted_profit:,.2f}")

# Step 12: Visualization
# Plot from plain 1-D arrays so actual and predicted points pair up by position
# with the test budgets, independent of the shuffled pandas index.
budget_test = X_test_2d.ravel()
plt.figure(figsize=(10, 6))
sns.scatterplot(x=budget_test, y=np.asarray(y_test), label="Actual", color="blue")
sns.scatterplot(x=budget_test, y=y_pred, label="Predicted", color="red")
plt.xlabel("Production Budget")
plt.ylabel("Profit")
plt.title("Production Budget vs Profit")
plt.legend()
plt.show()