In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler


In [2]:
# Read the merged dataset (CSV, path relative to the working directory) into a DataFrame
file_path = 'merged_data/final_data.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Score cut-offs: a movie scoring at or above the first value is flagged a
# masterpiece, one scoring at or below the second is flagged a flop.
masterpiece_threshold, flop_threshold = 8.0, 4.0

In [4]:
# Flag every movie by comparing its score against the two cut-offs
quality_scores = data['average_quality_score']
data['is_masterpiece'] = quality_scores.ge(masterpiece_threshold)
data['is_flop'] = quality_scores.le(flop_threshold)

# Per-country counts of flagged movies (summing booleans counts the True values)
movies_by_country = data.groupby('Country_Code')
masterpieces_per_country = movies_by_country['is_masterpiece'].sum().reset_index()
flops_per_country = movies_by_country['is_flop'].sum().reset_index()

# Attach the per-country counts to every movie row. Both sides carry a column with
# the same name, so the right-hand suffix produces the new column names
# 'is_masterpiece_total_masterpieces' and 'is_flop_total_flops'.
data = data.merge(
    masterpieces_per_country,
    on='Country_Code',
    suffixes=('', '_total_masterpieces'),
)
data = data.merge(
    flops_per_country,
    on='Country_Code',
    suffixes=('', '_total_flops'),
)

# Per-country variance of the movie scores, stored as 'rating_variance'
variance_per_country = (
    data.groupby('Country_Code')['average_quality_score']
    .var()
    .reset_index()
    .rename(columns={'average_quality_score': 'rating_variance'})
)

# Attach the variance to every movie row of the matching country
data = data.merge(variance_per_country, on='Country_Code')

In [5]:
# Analyze the top X movies (e.g., top 10) for each country
top_x = 10

# Select each country's `top_x` best-scored rows without `groupby(...).apply(...)`.
# Passing the grouping column into `apply` is deprecated in pandas; once it is
# excluded, 'Country_Code' is no longer available for the second groupby below.
# A stable descending sort followed by `groupby().head()` keeps 'Country_Code' as an
# ordinary column and picks the same rows (ties keep their original order).
top_x_movies_per_country = (
    data.sort_values('average_quality_score', ascending=False, kind='stable')
    .groupby('Country_Code')
    .head(top_x)
    # Restore the "grouped by country, best score first" layout of the result
    .sort_values('Country_Code', kind='stable')
    .reset_index(drop=True)
)

# Mean score of those top rows, one value per country
average_top_x_ratings = (
    top_x_movies_per_country.groupby('Country_Code')['average_quality_score']
    .mean()
    .reset_index()
    .rename(columns={'average_quality_score': 'average_top_x_ratings'})
)

# Merge the top X ratings feature back into the main dataframe
data = pd.merge(data, average_top_x_ratings, on='Country_Code')

  top_x_movies_per_country = data.groupby('Country_Code').apply(lambda x: x.nlargest(top_x, 'average_quality_score')).reset_index(drop=True)


In [6]:
# Columns to standardize with StandardScaler
numerical_features = [
    'total_votes',
    'average_quality_score',
    'GDP',
    'Population',
    'GDP_per_Capital',
    'population_rank',
    'gdp_rank',
    'gdp_per_capita_rank',
    'total_votes_rank',
    'average_quality_rank',
    'is_masterpiece_total_masterpieces',
    'is_flop_total_flops',
    'rating_variance',
    'average_top_x_ratings',
]

# NOTE(review): the scaler is fit on every row of `data`, i.e. before any
# train/test split, so held-out rows contribute to the scaling statistics.
# Confirm this is acceptable for the evaluation.
scaler = StandardScaler()
scaler.fit(data[numerical_features])
data[numerical_features] = scaler.transform(data[numerical_features])

In [7]:
# Feature matrix: the scaled numeric columns plus the raw country code
X = data[[*numerical_features, 'Country_Code']]

# Two regression targets, modelled separately
y_strong, y_weak = data['strong_hegemony'], data['weak_hegemony']


In [8]:
# Expand 'Country_Code' into one indicator column per country
X = pd.get_dummies(X, columns=['Country_Code'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_options = {'test_size': 0.2, 'random_state': 42}

# One split per target variable
X_train_strong, X_test_strong, y_train_strong, y_test_strong = train_test_split(
    X, y_strong, **split_options
)
X_train_weak, X_test_weak, y_train_weak, y_test_weak = train_test_split(
    X, y_weak, **split_options
)


In [9]:
# Define a function to train and evaluate models
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training data and score it on the test data.

    Args:
        model: Estimator exposing `fit(X, y)` and `predict(X)`; it is fitted in place.
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.

    Returns:
        A `(mse, r2)` tuple: the mean squared error and the R^2 score of the
        model's predictions on `X_test`, compared against `y_test`.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return (
        mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

In [10]:
# Initialize the models.
# The tree-based and boosting estimators are seeded so that a fresh
# "Restart & Run All" reproduces the same metrics; the value matches the seed
# used for the train/test split. LinearRegression takes no seed.
model_seed = 42
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=model_seed),
    'Gradient Boosting': GradientBoostingRegressor(random_state=model_seed),
    'XGBoost': XGBRegressor(random_state=model_seed)
}


In [11]:
# Names under which the two values returned by train_and_evaluate are stored
metric_names = ('MSE', 'R2')

# Fit and score every model on the strong-hegemony target
results_strong = {
    name: dict(zip(
        metric_names,
        train_and_evaluate(estimator, X_train_strong, X_test_strong, y_train_strong, y_test_strong),
    ))
    for name, estimator in models.items()
}

# Then refit the same estimator objects and score them on the weak-hegemony target
results_weak = {
    name: dict(zip(
        metric_names,
        train_and_evaluate(estimator, X_train_weak, X_test_weak, y_train_weak, y_test_weak),
    ))
    for name, estimator in models.items()
}

In [12]:
# Print one section per target: a heading followed by one line per model
report_sections = (
    ("Strong Hegemony Results:", results_strong),
    ("\nWeak Hegemony Results:", results_weak),
)
for heading, section_results in report_sections:
    print(heading)
    for model_name, metrics in section_results.items():
        print(f"{model_name} - MSE: {metrics['MSE']}, R2: {metrics['R2']}")

Strong Hegemony Results:
Linear Regression - MSE: 4.231419054340598e-23, R2: 1.0
Decision Tree - MSE: 10189.3024012006, R2: 0.9993923611737815
Gradient Boosting - MSE: 34792.805161973294, R2: 0.9979251318238449
XGBoost - MSE: 9162.681912291193, R2: 0.999453583664222

Weak Hegemony Results:
Linear Regression - MSE: 2.6311781124122382e-23, R2: 1.0
Decision Tree - MSE: 10826.441970985492, R2: 0.997596166216431
Gradient Boosting - MSE: 23961.970079035887, R2: 0.994679637746988
XGBoost - MSE: 8039.906323355653, R2: 0.9982148707314363
