In [6]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf


In [7]:
# Read the movie table, keep titles with production_year > 2000 whose genre is
# present and not "Reality" and whose domestic box office is present, then add
# the log-scale columns used by the regression. Written as one chain so `df`
# is bound exactly once in this cell.
df = (
    pd.read_csv("movie_summary.csv")
    .loc[
        lambda frame: (
            (frame['production_year'] > 2000)
            & (frame['genre'] != "Reality")
            & frame['genre'].notna()
            & frame['domestic_box_office'].notna()
        )
    ]
    # log1p(x) = log(1 + x), so a value of 0 maps to 0 rather than -inf.
    .assign(
        log_domestic_box_office=lambda frame: np.log1p(frame['domestic_box_office']),
        log_budget=lambda frame: np.log1p(frame['production_budget']),
    )
    # A missing production_budget yields a NaN log_budget; such rows go here.
    .dropna(subset=['log_domestic_box_office', 'log_budget', 'genre'])
)

# OLS specification: log domestic box office explained by log budget, with
# genre entered as a categorical term through C(...).
formula = 'log_domestic_box_office ~ log_budget + C(genre)'
model = smf.ols(formula=formula, data=df).fit()

# Emit the fitted model's summary table as plain text.
print(model.summary())

                               OLS Regression Results                              
Dep. Variable:     log_domestic_box_office   R-squared:                       0.430
Model:                                 OLS   Adj. R-squared:                  0.429
Method:                      Least Squares   F-statistic:                     1101.
Date:                     Mon, 26 May 2025   Prob (F-statistic):               0.00
Time:                             13:02:09   Log-Likelihood:                -58650.
No. Observations:                    19014   AIC:                         1.173e+05
Df Residuals:                        19000   BIC:                         1.174e+05
Df Model:                               13                                         
Covariance Type:                 nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------

In [8]:
# Extend the working frame with the log-scale international box office and
# discard rows where that value, log_budget, or genre is missing. This cell
# expects `df` to already carry `log_budget` and `genre`.
df = (
    df
    # log1p(x) = log(1 + x), so a value of 0 maps to 0 rather than -inf.
    .assign(
        log_international_box_office=lambda frame: np.log1p(frame['international_box_office'])
    )
    .dropna(subset=['log_international_box_office', 'log_budget', 'genre'])
)

# Same right-hand side as the domestic specification, with the international
# response. Note that `formula` and `model` are rebound here.
formula = 'log_international_box_office ~ log_budget + C(genre)'
model = smf.ols(formula=formula, data=df).fit()

# Emit the fitted model's summary table as plain text.
print(model.summary())

                                 OLS Regression Results                                 
Dep. Variable:     log_international_box_office   R-squared:                       0.217
Model:                                      OLS   Adj. R-squared:                  0.217
Method:                           Least Squares   F-statistic:                     405.2
Date:                          Mon, 26 May 2025   Prob (F-statistic):               0.00
Time:                                  13:02:09   Log-Likelihood:                -59725.
No. Observations:                         19014   AIC:                         1.195e+05
Df Residuals:                             19000   BIC:                         1.196e+05
Df Model:                                    13                                         
Covariance Type:                      nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
----------