# **Statistics(I)(2) - Final Project Code**
## **Task 1**

In [72]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
import statsmodels.stats.api as sms
from scipy import stats
import seaborn as sns
import math

In [73]:
def f_test_variances(x1, x2, sides, alpha):
    a1 = np.array(x1)
    a2 = np.array(x2)

    result = np.full((7, 2), None, dtype=float)
    
    result[0] = [np.mean(a1), np.mean(a2)]                    # Means
    result[1] = [np.std(a1, ddof=1), np.std(a2, ddof=1)]      # Sample standard deviations
    result[2] = [a1.size, a2.size]                            # Sample sizes

    dfn, dfd = a1.size - 1, a2.size - 1
    result[3] = [dfn, dfd]

    # F-statistic: ratio of sample variances
    f_stat = np.var(a1, ddof=1) / np.var(a2, ddof=1)
    result[4, 0] = f_stat

    # Critical F-values
    if sides > 1:
        result[5] = [
            stats.f.isf(alpha / 2, dfn, dfd),  # Upper critical value
            stats.f.ppf(alpha / 2, dfn, dfd)   # Lower critical value
        ]
    else:
        result[5] = [
            stats.f.isf(alpha, dfn, dfd),      # One-sided critical value
            stats.f.ppf(alpha, dfn, dfd)
        ]

    if f_stat > 1:
        p_value = 1 - stats.f.cdf(f_stat, dfn, dfd)
    else:
        p_value = stats.f.cdf(f_stat, dfn, dfd)
    if sides > 1:
        p_value *= 2
    result[6, 0] = p_value

    labels = ['Mean', 'Std Dev', 'Size', 'Degrees of Freedom', 'F-statistic', 'F-critical', 'p-value']
    df = pd.DataFrame(result, index=labels)

    return df

In [74]:
def t_test_summary(x1, x2, usevar='pooled', alpha=0.05):
    """
    Runs a t-test using statsmodels' CompareMeans and prints a summary.

    Parameters:
    - df: DataFrame containing the columns
    - col1, col2: Column names to compare
    - usevar: 'pooled' (equal variances) or 'unequal' (Welch)
    - alpha: Significance level
    """
    group1 = sms.DescrStatsW(x1)
    group2 = sms.DescrStatsW(x2)
    t_test = sms.CompareMeans(group1, group2)
    
    return t_test.summary(usevar=usevar, alpha=alpha)

In [75]:
df = pd.read_csv("movie_summary.csv")

filtered_df = df[
    (df['production_budget'] != 0) &
    (df['production_year'] > 2000) &
    df['genre'].notna() &
    df['domestic_box_office'].notna() &
    df['international_box_office'].notna()
].copy()

filtered_df["total_box_office"] = filtered_df["domestic_box_office"] + filtered_df["international_box_office"]
filtered_df["flattened_production_budget"] = np.pow(filtered_df["production_budget"], 1/8)
filtered_df["flattened_total_box_office"] = np.pow(filtered_df["total_box_office"], 1/8)

filtered_df['z_score'] = stats.zscore(filtered_df['flattened_total_box_office'])
filtered_df = filtered_df[filtered_df['z_score'].abs() <= 3].copy()

filtered_df['budget_category'] = pd.cut(
    filtered_df['production_budget'], 
    bins=[0, 15000000, np.inf], 
    labels=['Low', 'High']
)

budget_low = filtered_df[filtered_df['budget_category'] == 'Low']['flattened_total_box_office']
budget_high = filtered_df[filtered_df['budget_category'] == 'High']['flattened_total_box_office']

In [76]:
f_test_variances(budget_low, budget_high, sides=2, alpha=0.05)

Unnamed: 0,0,1
Mean,6.873283,9.823617
Std Dev,2.046081,1.699519
Size,1520.0,2184.0
Degrees of Freedom,1519.0,2183.0
F-statistic,1.449417,
F-critical,1.096643,0.911162
p-value,2.442491e-15,


In [77]:
t_test_summary(budget_low, budget_high, usevar='unequal', alpha=0.05)

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
subset #1,-2.9503,0.064,-46.208,0.000,-3.076,-2.825
