In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf


In [36]:
# Load dataset
df = pd.read_csv("movie_summary.csv")

# Exclusive upper bound applied to both the domestic and the international gross.
BOX_OFFICE_CAP = 50_000_000

# Filter as previously done: productions after 2000, non-"Reality" genre,
# both grosses under the cap, and a non-zero budget and domestic gross.
# The explicit notna() on genre is required because `NaN != "Reality"` is True.
df = df[
    (df['production_year'] > 2000) &
    (df['genre'] != "Reality") &
    (df['domestic_box_office'] < BOX_OFFICE_CAP) &
    (df['international_box_office'] < BOX_OFFICE_CAP) &
    (df['production_budget'] != 0) &
    (df['domestic_box_office'] != 0) &
    df['genre'].notna() &
    df['domestic_box_office'].notna()
].copy()

# Feature Engineering
df['log_domestic_box_office'] = np.log1p(df['domestic_box_office'])
df['log_budget'] = np.log1p(df['production_budget'])
df['profit'] = df['domestic_box_office'] - df['production_budget']

# profit is negative whenever the budget exceeds the domestic gross, and
# log1p is only defined for values > -1 (it yields NaN below -1 and -inf at
# exactly -1, both with a RuntimeWarning). Mask those rows to NaN *before*
# taking the log so the exclusion is explicit, warning-free, and cannot leak
# an -inf into the regression.
df['log_profit'] = np.log1p(df['profit'].where(df['profit'] > -1))

# Build the estimation sample in its own frame. Rows with an undefined
# log_profit (i.e. films whose profit is <= -1) or any other missing model
# input are excluded here. `df` itself is deliberately left un-narrowed so
# that later cells reusing it are not silently restricted to profitable films.
profit_model_df = df.dropna(
    subset=['log_profit', 'log_budget', 'genre', 'production_year',
            'theatrical_engagements', 'creative_type']
)

# Linear regression formula
# NOTE(review): log_domestic_box_office is computed from the same column that
# profit is derived from, so part of this fit is mechanical - confirm intended.
formula = 'log_profit ~  C(genre) + creative_type + theatrical_engagements + log_domestic_box_office'

# Fit the model
model = smf.ols(formula, data=profit_model_df).fit()

# Print the summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             log_profit   R-squared:                       0.649
Model:                            OLS   Adj. R-squared:                  0.639
Method:                 Least Squares   F-statistic:                     66.79
Date:                Tue, 27 May 2025   Prob (F-statistic):          3.46e-149
Time:                        16:20:39   Log-Likelihood:                -942.61
No. Observations:                 744   AIC:                             1927.
Df Residuals:                     723   BIC:                             2024.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [23]:
# Feature engineering: add log(1 + international gross) as a new column on df.
df['log_international_box_office'] = np.log1p(df['international_box_office'])

# Discard every row that has a missing value in any of the columns listed
# here (the response, the regressors used below, and log_budget which is
# expected to already exist on df from an earlier cell).
df = df.dropna(
    subset=[
        'log_international_box_office',
        'log_budget',
        'genre',
        'production_year',
        'theatrical_engagements',
        'creative_type',
    ]
)

# Model specification: log international gross explained by log budget,
# genre (forced categorical via C()), production year, theatrical
# engagements and creative type.
formula = (
    'log_international_box_office ~ '
    'log_budget + C(genre) + production_year + theatrical_engagements + creative_type'
)

# Estimate by ordinary least squares and report the fit.
model = smf.ols(formula=formula, data=df).fit()
print(model.summary())

                                 OLS Regression Results                                 
Dep. Variable:     log_international_box_office   R-squared:                       0.253
Model:                                      OLS   Adj. R-squared:                  0.246
Method:                           Least Squares   F-statistic:                     35.42
Date:                          Tue, 27 May 2025   Prob (F-statistic):          2.22e-128
Time:                                  16:02:58   Log-Likelihood:                -7080.7
No. Observations:                          2327   AIC:                         1.421e+04
Df Residuals:                              2304   BIC:                         1.434e+04
Df Model:                                    22                                         
Covariance Type:                      nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-