In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf


In [None]:
# Load the raw movie table (path is relative to the notebook's working directory).
df = pd.read_csv("movie_summary.csv")

# Filter as previously done: titles produced after 2000, excluding the
# "Reality" genre, whose domestic and international grosses are each below the
# cap, with a non-zero budget and a non-zero domestic gross.
# Ordering comparisons against NaN evaluate to False, so the `>` / `<` tests
# also discard rows where those columns are missing.
BOX_OFFICE_CAP = 50_000_000
keep_rows = (
    (df['production_year'] > 2000)
    & (df['genre'] != "Reality")
    & (df['domestic_box_office'] < BOX_OFFICE_CAP)
    & (df['international_box_office'] < BOX_OFFICE_CAP)
    & (df['production_budget'] != 0)
    & (df['domestic_box_office'] != 0)
    & df['genre'].notna()
    & df['domestic_box_office'].notna()
)
# .copy() so the column assignments below write to an independent frame
# instead of a view of the unfiltered data.
df = df[keep_rows].copy()

# Feature Engineering
df['log_domestic_box_office'] = np.log1p(df['domestic_box_office'])
df['log_budget'] = np.log1p(df['production_budget'])
df['total_box_office'] = df['domestic_box_office'] + df['international_box_office']
df['log_total_box_office'] = np.log1p(df['total_box_office'])
# NOTE(review): profit is computed from the domestic gross only, not from
# total_box_office -- confirm this is intended.
df['profit'] = df['domestic_box_office'] - df['production_budget']
# profit is negative whenever a film grossed less than its budget. np.log1p
# returns NaN for inputs below -1 (with a RuntimeWarning) and -inf at -1, so a
# plain log1p silently blanks out every loss-making film. Use a sign-preserving
# log instead: sign(x) * log1p(|x|). For profit >= 0 this is exactly
# log1p(profit); for losses it is the mirrored, finite negative value.
df['log_profit'] = np.sign(df['profit']) * np.log1p(df['profit'].abs())

# Drop rows missing any column the models rely on. A missing production_budget
# passes the `!= 0` test above (NaN != 0 is True), so log_budget can still be
# NaN at this point.
df = df.dropna(subset=['log_domestic_box_office', 'log_budget', 'genre','production_year','theatrical_engagements','creative_type'])

# Linear regression formula: raw (un-logged) domestic gross explained by genre,
# creative type and theatrical engagements. creative_type is not wrapped in
# C(); presumably it is a string column, which the formula interface encodes as
# categorical automatically -- TODO confirm its dtype.
formula = 'domestic_box_office ~  C(genre) + creative_type + theatrical_engagements '

# Fit the model by ordinary least squares
model = smf.ols(formula, data=df).fit()

# Print the summary
print(model.summary())

                             OLS Regression Results                            
Dep. Variable:     domestic_box_office   R-squared:                       0.086
Model:                             OLS   Adj. R-squared:                  0.078
Method:                  Least Squares   F-statistic:                     11.41
Date:                 Tue, 27 May 2025   Prob (F-statistic):           3.87e-32
Time:                         18:24:15   Log-Likelihood:                -39285.
No. Observations:                 2203   AIC:                         7.861e+04
Df Residuals:                     2184   BIC:                         7.872e+04
Df Model:                           18                                         
Covariance Type:             nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [39]:
# Columns that must be present for a row to stay in the sample. Note that
# 'log_total_box_office' is required here even though this cell's formula
# does not use it.
required_columns = [
    'log_total_box_office',
    'log_international_box_office',
    'log_budget',
    'genre',
    'production_year',
    'theatrical_engagements',
    'creative_type',
]

# Add the response variable, log(1 + international gross), then discard rows
# with a missing value in any required column.
df = (
    df
    .assign(log_international_box_office=np.log1p(df['international_box_office']))
    .dropna(subset=required_columns)
)

# Model specification: log international gross explained by log budget, genre
# (explicitly categorical), production year, theatrical engagements and
# creative type.
formula = (
    'log_international_box_office ~ log_budget + C(genre) + production_year'
    ' + theatrical_engagements + creative_type'
)

# Estimate by ordinary least squares and show the regression table.
model = smf.ols(formula=formula, data=df).fit()
print(model.summary())

                                 OLS Regression Results                                 
Dep. Variable:     log_international_box_office   R-squared:                       0.261
Model:                                      OLS   Adj. R-squared:                  0.254
Method:                           Least Squares   F-statistic:                     36.76
Date:                          Tue, 27 May 2025   Prob (F-statistic):          6.05e-127
Time:                                  17:37:23   Log-Likelihood:                -6731.8
No. Observations:                          2203   AIC:                         1.351e+04
Df Residuals:                              2181   BIC:                         1.363e+04
Df Model:                                    21                                         
Covariance Type:                      nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-

In [43]:
# Model specification: log total gross explained by genre, creative type
# (both explicitly categorical via C()) and theatrical engagements.
formula = 'log_total_box_office ~  C(genre) + C(creative_type)  + theatrical_engagements'

# Estimate by ordinary least squares on the current `df`.
model = (
    smf.ols(formula=formula, data=df)
    .fit()
)

# Show the regression table.
print(model.summary())

                             OLS Regression Results                             
Dep. Variable:     log_total_box_office   R-squared:                       0.503
Model:                              OLS   Adj. R-squared:                  0.498
Method:                   Least Squares   F-statistic:                     116.0
Date:                  Tue, 27 May 2025   Prob (F-statistic):          1.88e-313
Time:                          17:38:28   Log-Likelihood:                -4156.9
No. Observations:                  2203   AIC:                             8354.
Df Residuals:                      2183   BIC:                             8468.
Df Model:                            19                                         
Covariance Type:              nonrobust                                         
                                                  coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------