In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf


In [14]:
# --- Domestic box office: load, filter, engineer features, fit OLS ---

# Read the movie table from the CSV in the working directory.
df = pd.read_csv("movie_summary.csv")

# Row filter. A `<` comparison against NaN evaluates to False, so rows with a
# missing domestic or international box office are excluded by the caps too;
# `!=` against NaN evaluates to True, hence the explicit genre null check.
keep_rows = (
    df['production_year'].gt(2000)
    & df['genre'].ne("Reality")
    & df['domestic_box_office'].lt(500_000_000)
    & df['international_box_office'].lt(500_000_000)
    & df['genre'].notna()
    & df['domestic_box_office'].notna()
)

# Every column the regression below reads; rows missing any of them are dropped.
required_cols = [
    'log_domestic_box_office',
    'log_budget',
    'genre',
    'production_year',
    'theatrical_engagements',
    'creative_type',
]

# Filter, add the log(1 + x) transformed response and budget, then drop
# incomplete rows in a single chain.
df = (
    df.loc[keep_rows]
    .assign(
        log_domestic_box_office=lambda frame: np.log1p(frame['domestic_box_office']),
        log_budget=lambda frame: np.log1p(frame['production_budget']),
    )
    .dropna(subset=required_cols)
)

# Model specification (patsy formula): log domestic gross regressed on log
# budget, genre (explicitly categorical), production year, theatrical
# engagements and creative type.
formula = 'log_domestic_box_office ~ log_budget + C(genre) + production_year + theatrical_engagements + creative_type'

# Ordinary least squares fit on the filtered frame.
model = smf.ols(formula, data=df).fit()

# Plain-text regression report.
print(model.summary())

                               OLS Regression Results                              
Dep. Variable:     log_domestic_box_office   R-squared:                       0.604
Model:                                 OLS   Adj. R-squared:                  0.603
Method:                      Least Squares   F-statistic:                     1117.
Date:                     Tue, 27 May 2025   Prob (F-statistic):               0.00
Time:                             14:09:29   Log-Likelihood:                -49120.
No. Observations:                    16876   AIC:                         9.829e+04
Df Residuals:                        16852   BIC:                         9.847e+04
Df Model:                               23                                         
Covariance Type:                 nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------

In [15]:
# --- International box office: same regressors, different response ---
# Relies on `df` (including its `log_budget` column) already being defined by
# an earlier cell; `formula` and `model` are overwritten here.

# Every column the regression below reads; rows missing any of them are dropped.
intl_required_cols = [
    'log_international_box_office',
    'log_budget',
    'genre',
    'production_year',
    'theatrical_engagements',
    'creative_type',
]

# Add the log(1 + x) transformed international gross, then drop incomplete rows.
df = (
    df.assign(
        log_international_box_office=lambda frame: np.log1p(frame['international_box_office'])
    )
    .dropna(subset=intl_required_cols)
)

# Model specification (patsy formula): identical right-hand side to the
# domestic model, with log international gross as the response.
formula = 'log_international_box_office ~ log_budget + C(genre) + production_year + theatrical_engagements + creative_type'

# Ordinary least squares fit.
model = smf.ols(formula, data=df).fit()

# Plain-text regression report.
print(model.summary())

                                 OLS Regression Results                                 
Dep. Variable:     log_international_box_office   R-squared:                       0.328
Model:                                      OLS   Adj. R-squared:                  0.328
Method:                           Least Squares   F-statistic:                     358.4
Date:                          Tue, 27 May 2025   Prob (F-statistic):               0.00
Time:                                  14:09:34   Log-Likelihood:                -51666.
No. Observations:                         16876   AIC:                         1.034e+05
Df Residuals:                             16852   BIC:                         1.036e+05
Df Model:                                    23                                         
Covariance Type:                      nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-