In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

# Train a small neural network that predicts concurrent users (CCU) for Steam
# titles and save two diagnostic plots:
#   1. price vs. CCU for the raw data
#   2. actual vs. predicted CCU for every row in the dataset
# The held-out R² score is printed at the end.

# --- Configuration: everything a reader might want to tune ------------------
DATA_PATH = 'merged_steam_data.csv'
OUTPUT_DIR = Path('visualizations')
TARGET_COLUMN = 'ccu'
TEST_SIZE = 0.2
SEED = 42

# savefig() does not create missing directories, so make sure the folder exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Load data --------------------------------------------------------------
df = pd.read_csv(DATA_PATH)

# Predictors only. The target column must never be listed here: feeding 'ccu'
# in as a feature lets the model read the answer directly (target leakage).
feature_columns = [
    'price_x',
]
assert TARGET_COLUMN not in feature_columns, "target column leaked into the features"

X = df[feature_columns].values
y = df[TARGET_COLUMN].values

# --- Split first, then scale ------------------------------------------------
# The scaler is fitted on the training rows only so that no statistics from
# the test rows influence training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Whole dataset in the same scaled space; used for the full-dataset plot below.
X_scaled = scaler.transform(X)

# --- Define and train the ANN -----------------------------------------------
model = MLPRegressor(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    alpha=0.01,
    batch_size=32,
    learning_rate_init=0.001,
    max_iter=200000,
    random_state=SEED,
)

print("\nTraining the MLPRegressor...")
model.fit(X_train, y_train)

# --- Evaluate on the held-out split -----------------------------------------
y_prediction = model.predict(X_test)
# A user count cannot be negative, so clamp the raw network output at zero.
y_prediction_postprocessed = np.maximum(0, y_prediction)
r2 = r2_score(y_test, y_prediction_postprocessed)

# --- Plot 1: Price vs CCU scatter plot --------------------------------------
plt.figure(figsize=(12, 6))
plt.scatter(df['price_x'], df['ccu'], alpha=0.5, c='blue', label='Actual Data')
plt.xlabel('Price ($)')
plt.ylabel('CCU')
plt.title('Price vs CCU Relationship (Logarithmic CCU Scale)')
plt.yscale('log')
plt.grid(True)
plt.legend()
plt.savefig(OUTPUT_DIR / 'price_vs_ccu.png')
plt.close()
print("Saved 'price_vs_ccu.png'")

# --- Plot 2: Actual vs Predicted CCU ----------------------------------------
# Predictions for every row (training rows included), clamped like the test
# predictions above.
all_data_predictions = model.predict(X_scaled)
all_data_predictions_postprocessed = np.maximum(0, all_data_predictions)

plt.figure(figsize=(12, 6))

actual_ccu_all = df['ccu'].values

plt.scatter(
    actual_ccu_all,
    all_data_predictions_postprocessed,
    alpha=0.5,
    c='red',
    label='Model Predictions',
)

# Diagonal y = x: points on this line are predicted exactly.
min_val = min(actual_ccu_all.min(), all_data_predictions_postprocessed.min())
max_val = max(actual_ccu_all.max(), all_data_predictions_postprocessed.max())
plt.plot([min_val, max_val], [min_val, max_val], 'k--', lw=2, label='Perfect Prediction Line')

plt.xlabel('Actual CCU (Log Scale)')
plt.ylabel('Predicted CCU (Log Scale)')
plt.title('Actual vs Predicted CCU on Entire Dataset (Logarithmic Scale)')
plt.xscale('log')
plt.yscale('log')
plt.grid(True)
plt.legend()
plt.savefig(OUTPUT_DIR / 'actual_vs_predicted_ccu.png')
plt.close()

# --- Report -----------------------------------------------------------------
print("\nVisualizations have been saved in the 'visualizations' folder!")
# R² is measured on the held-out test split only, not on the full-dataset plot.
print(f"R² Score: {r2:.8f}")

Available columns in the dataset:
['appid', 'name_x', 'price_x', 'ccu', 'type', 'name_y', 'required_age', 'is_free', 'controller_support', 'dlc', 'detailed_description', 'about_the_game', 'short_description', 'fullgame', 'supported_languages', 'header_image', 'website', 'pc_requirements', 'mac_requirements', 'linux_requirements', 'legal_notice', 'drm_notice', 'ext_user_account_notice', 'developers', 'publishers', 'demos', 'price_y', 'packages', 'package_groups', 'platforms', 'metacritic', 'reviews', 'categories', 'genres', 'screenshots', 'movies', 'recommendations', 'achievements', 'release_date', 'support_info', 'background', 'content_descriptors']

Training the MLPClassifier...

Visualizations have been saved in the 'visualizations' folder!
R² Score: -0.13745265
