In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score


# Load merged dataset
df = pd.read_csv('merged_steam_data.csv')


# Print available columns
print("Available columns in the dataset:")
print(df.columns.tolist())


# Create binary target: popular if ccu > 5000
df['popular'] = (df['ccu'] > 5000).astype(int)


# Select features for training - using only price
feature_columns = ['price_x']
X = df[feature_columns].values
y = df['popular'].values


# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)


# Define and train the ANN
model = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    alpha=0.01,
    batch_size=32,
    learning_rate_init=0.001,
    max_iter=200,
    random_state=42
)


print("\nTraining the MLPClassifier...")
model.fit(X_train, y_train)


# Get predictions
y_pred = model.predict(X_test)


# Create visualization directory if it doesn't exist
import os
if not os.path.exists('visualizations'):
    os.makedirs('visualizations')


# Plot 1: Price vs CCU scatter plot
plt.figure(figsize=(12, 6))
plt.scatter(df['price_x'], df['ccu'], alpha=0.5, c='blue', label='Actual Data')
plt.xlabel('Price ($)')
plt.ylabel('CCU')
plt.title('Price vs CCU Relationship')
plt.yscale('log')  # Logarithmic scale for CCU
plt.grid(True)
plt.legend()
plt.savefig('visualizations/price_vs_ccu.png')
plt.close()


# Plot 2: Actual vs Predicted CCU
plt.figure(figsize=(12, 6))


# Get the test set indices
test_indices = X_test.shape[0]
actual_ccu = df['ccu'].iloc[-test_indices:].values
predicted_ccu = np.where(y_pred == 1, df['ccu'].mean() * 1.5, df['ccu'].mean() * 0.5)


plt.scatter(actual_ccu, predicted_ccu, alpha=0.5, c='red', label='Predictions')
plt.plot([min(actual_ccu), max(actual_ccu)], [min(predicted_ccu), max(predicted_ccu)],
         'k--', label='Perfect Prediction')


plt.xlabel('Actual CCU')
plt.ylabel('Predicted CCU')
plt.title('Actual vs Predicted CCU (Logarithmic Scale)')
plt.xscale('log')
plt.yscale('log')
plt.grid(True)
plt.legend()


# Calculate and display R² score
r2 = r2_score(actual_ccu, predicted_ccu)
plt.text(0.05, 0.95, f'R² Score: {r2:.8f}', transform=plt.gca().transAxes)


plt.savefig('visualizations/actual_vs_predicted_ccu.png')
plt.close()


print("\nVisualizations have been saved in the 'visualizations' folder!")
print(f"R² Score: {r2:.8f}")







Available columns in the dataset:
['appid', 'name_x', 'price_x', 'ccu', 'type', 'name_y', 'required_age', 'is_free', 'controller_support', 'dlc', 'detailed_description', 'about_the_game', 'short_description', 'fullgame', 'supported_languages', 'header_image', 'website', 'pc_requirements', 'mac_requirements', 'linux_requirements', 'legal_notice', 'drm_notice', 'ext_user_account_notice', 'developers', 'publishers', 'demos', 'price_y', 'packages', 'package_groups', 'platforms', 'metacritic', 'reviews', 'categories', 'genres', 'screenshots', 'movies', 'recommendations', 'achievements', 'release_date', 'support_info', 'background', 'content_descriptors']

Training the MLPClassifier...

Visualizations have been saved in the 'visualizations' folder!
R² Score: -0.13745265
