# ML Project : Predicting Game Success on Steam using Machine Learning

# Noor Ul Ain Afaq
# FA24-MSDS-0011

# Objective :For this project, we will predict video game success using a composite metric that combines financial performance (estimated revenue calculated from owners and price), player satisfaction (percentage of positive reviews and overall review count), and engagement metrics (average playtime and peak concurrent users). This multi-dimensional approach provides a more nuanced understanding of success than any single metric alone, acknowledging that truly successful games tend to perform well across commercial, critical, and engagement dimensions.

In [1]:
import pandas as pd
import numpy as np
import ast
from datetime import datetime

In [2]:
df = pd.read_csv('games_march2025_cleaned.csv')

In [3]:
df.shape

(89618, 47)

In [4]:
df.head(5)

Unnamed: 0,appid,name,release_date,required_age,price,dlc_count,detailed_description,about_the_game,short_description,reviews,...,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent
0,730,Counter-Strike 2,2012-08-21,0,0.0,1,"For over two decades, Counter-Strike has offer...","For over two decades, Counter-Strike has offer...","For over two decades, Counter-Strike has offer...",,...,879,5174,350,0,1212356,"{'FPS': 90857, 'Shooter': 65397, 'Multiplayer'...",86,8632939,82,96473
1,578080,PUBG: BATTLEGROUNDS,2017-12-21,0,0.0,0,"LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ...","LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ...",Play PUBG: BATTLEGROUNDS for free. Land on str...,,...,0,0,0,0,616738,"{'Survival': 14838, 'Shooter': 12727, 'Battle ...",59,2513842,68,16720
2,570,Dota 2,2013-07-09,0,0.0,2,"The most-played game on Steam. Every day, mill...","The most-played game on Steam. Every day, mill...","Every day, millions of players worldwide enter...",“A modern multiplayer masterpiece.” 9.5/10 – D...,...,1536,898,892,0,555977,"{'Free to Play': 59933, 'MOBA': 20158, 'Multip...",81,2452595,80,29366
3,271590,Grand Theft Auto V Legacy,2015-04-13,17,0.0,0,"When a young street hustler, a retired bank ro...","When a young street hustler, a retired bank ro...",Grand Theft Auto V for PC offers players the o...,,...,771,7101,74,0,117698,"{'Open World': 32644, 'Action': 23539, 'Multip...",87,1803832,92,17517
4,359550,Tom Clancy's Rainbow Six® Siege,2015-12-01,17,3.99,9,Edition Comparison Ultimate Edition The Tom Cl...,“One of the best first-person shooters ever ma...,"Tom Clancy's Rainbow Six® Siege is an elite, t...",,...,682,2434,306,80,89916,"{'FPS': 9831, 'PvP': 9162, 'e-sports': 9072, '...",84,1168020,76,12608


In [5]:
df.columns


Index(['appid', 'name', 'release_date', 'required_age', 'price', 'dlc_count',
       'detailed_description', 'about_the_game', 'short_description',
       'reviews', 'header_image', 'website', 'support_url', 'support_email',
       'windows', 'mac', 'linux', 'metacritic_score', 'metacritic_url',
       'achievements', 'recommendations', 'notes', 'supported_languages',
       'full_audio_languages', 'packages', 'developers', 'publishers',
       'categories', 'genres', 'screenshots', 'movies', 'user_score',
       'score_rank', 'positive', 'negative', 'estimated_owners',
       'average_playtime_forever', 'average_playtime_2weeks',
       'median_playtime_forever', 'median_playtime_2weeks', 'discount',
       'peak_ccu', 'tags', 'pct_pos_total', 'num_reviews_total',
       'pct_pos_recent', 'num_reviews_recent'],
      dtype='object')

In [6]:
duplicates_specific = df.duplicated(subset=['name', 'release_date'])

In [7]:
df = df.drop_duplicates(subset=['name', 'release_date'])

In [8]:
df.shape

(89586, 47)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89586 entries, 0 to 89617
Data columns (total 47 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   appid                     89586 non-null  int64  
 1   name                      89586 non-null  object 
 2   release_date              89586 non-null  object 
 3   required_age              89586 non-null  int64  
 4   price                     89586 non-null  float64
 5   dlc_count                 89586 non-null  int64  
 6   detailed_description      89389 non-null  object 
 7   about_the_game            89366 non-null  object 
 8   short_description         89466 non-null  object 
 9   reviews                   10390 non-null  object 
 10  header_image              89586 non-null  object 
 11  website                   41083 non-null  object 
 12  support_url               44082 non-null  object 
 13  support_email             78785 non-null  object 
 14  windows    

In [10]:
columns_to_drop = [
    'appid', 'header_image', 'website', 'support_url', 'support_email',
    'notes', 'packages', 'screenshots', 'movies', 'metacritic_url'
]

df = df.drop(columns=columns_to_drop)


In [11]:
print(df.columns)


Index(['name', 'release_date', 'required_age', 'price', 'dlc_count',
       'detailed_description', 'about_the_game', 'short_description',
       'reviews', 'windows', 'mac', 'linux', 'metacritic_score',
       'achievements', 'recommendations', 'supported_languages',
       'full_audio_languages', 'developers', 'publishers', 'categories',
       'genres', 'user_score', 'score_rank', 'positive', 'negative',
       'estimated_owners', 'average_playtime_forever',
       'average_playtime_2weeks', 'median_playtime_forever',
       'median_playtime_2weeks', 'discount', 'peak_ccu', 'tags',
       'pct_pos_total', 'num_reviews_total', 'pct_pos_recent',
       'num_reviews_recent'],
      dtype='object')


In [12]:
columns_to_drop = [
    'name', 'detailed_description', 'about_the_game', 'short_description',
    'reviews', 'full_audio_languages', 'score_rank'
]


In [13]:

df = df.drop(columns=columns_to_drop)
print(df.columns)

Index(['release_date', 'required_age', 'price', 'dlc_count', 'windows', 'mac',
       'linux', 'metacritic_score', 'achievements', 'recommendations',
       'supported_languages', 'developers', 'publishers', 'categories',
       'genres', 'user_score', 'positive', 'negative', 'estimated_owners',
       'average_playtime_forever', 'average_playtime_2weeks',
       'median_playtime_forever', 'median_playtime_2weeks', 'discount',
       'peak_ccu', 'tags', 'pct_pos_total', 'num_reviews_total',
       'pct_pos_recent', 'num_reviews_recent'],
      dtype='object')


In [14]:
df.tail(5)

Unnamed: 0,release_date,required_age,price,dlc_count,windows,mac,linux,metacritic_score,achievements,recommendations,...,average_playtime_2weeks,median_playtime_forever,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent
89613,2024-02-29,0,3.99,0,True,False,False,0,6,0,...,0,0,0,0,0,"{'2D Platformer': 91, 'Runner': 85, 'Platforme...",-1,-1,-1,-1
89614,2020-01-07,0,10.0,0,True,False,False,0,0,0,...,0,0,0,0,0,"{'Action': 43, 'Adventure': 41, 'Indie': 41, '...",-1,-1,-1,-1
89615,2019-10-10,0,1.99,0,True,False,False,0,0,0,...,0,0,0,0,0,"{'Indie': 31, 'Early Access': 21}",-1,-1,-1,-1
89616,2025-01-25,0,29.99,0,True,False,False,0,34,0,...,0,0,0,0,0,"{'RPG': 117, 'Action-Adventure': 111, 'Strateg...",-1,-1,-1,-1
89617,2018-03-01,0,9.99,0,True,False,False,0,12,0,...,0,0,0,0,0,"{'Action': 22, 'Indie': 21}",-1,-1,-1,-1


In [15]:
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')


In [16]:
df['release_date']

0       2012-08-21
1       2017-12-21
2       2013-07-09
3       2015-04-13
4       2015-12-01
           ...    
89613   2024-02-29
89614   2020-01-07
89615   2019-10-10
89616   2025-01-25
89617   2018-03-01
Name: release_date, Length: 89586, dtype: datetime64[ns]

In [17]:
df['estimated_owners']

0        100000000 - 200000000
1         50000000 - 100000000
2        200000000 - 500000000
3         50000000 - 100000000
4          20000000 - 50000000
                 ...          
89613                0 - 20000
89614                0 - 20000
89615                0 - 20000
89616                0 - 20000
89617                0 - 20000
Name: estimated_owners, Length: 89586, dtype: object

In [18]:
# Create revenue estimate feature
# As estimated_owners is a range (e.g., "1,000,000-2,000,000"), we will use the midpoint:
df['estimated_owners_numeric'] = df['estimated_owners'].apply(lambda x: sum(map(int, x.split('-')))/2)
df['revenue_estimate'] = df['price'] * df['estimated_owners_numeric']

In [19]:
df['revenue_estimate']

0                0.0
1                0.0
2                0.0
3                0.0
4        139650000.0
            ...     
89613        39900.0
89614       100000.0
89615        19900.0
89616       299900.0
89617        99900.0
Name: revenue_estimate, Length: 89586, dtype: float64

In [20]:
# Convert platform availability to numeric (0/1)
df['windows_numeric'] = df['windows'].astype(int)
df['mac_numeric'] = df['mac'].astype(int)
df['linux_numeric'] = df['linux'].astype(int)

# Create platform count feature (how many platforms is the game available on)
df['platform_count'] = df['windows_numeric'] + df['mac_numeric'] + df['linux_numeric']

In [21]:
df['platform_count']

0        2
1        1
2        3
3        1
4        1
        ..
89613    1
89614    1
89615    1
89616    1
89617    1
Name: platform_count, Length: 89586, dtype: int32

In [22]:
current_year = datetime.now().year
df['years_since_release'] = current_year - df['release_date'].dt.year
df['years_since_release']

0        13
1         8
2        12
3        10
4        10
         ..
89613     1
89614     5
89615     6
89616     0
89617     7
Name: years_since_release, Length: 89586, dtype: int32

In [23]:
# Price and quality interactions
df['price_metacritic_ratio'] = df['price'] / (df['metacritic_score'] + 1)  # +1 to avoid division by zero
df['price_user_score_ratio'] = df['price'] / (df['user_score'] + 0.1)  # +0.1 to avoid division by zero

# Content amount indicators
df['achievements_per_price'] = df['achievements'] / (df['price'] + 0.01)  # +0.01 to avoid division by zero
df['dlc_price_interaction'] = df['dlc_count'] * df['price']

In [24]:
# Calculate price-per-hour
df['price_per_hour'] = df['price'] / (df['median_playtime_forever'] / 60 + 0.1)  # Convert minutes to hours, avoid div by 0

# Content richness metrics
df['engagement_score'] = df['median_playtime_forever'] * df['peak_ccu'] / 1000  # Scale down for numerical stability

# Review efficiency (reviews per owner)
df['review_rate'] = df['num_reviews_total'] / df['estimated_owners_numeric']

In [25]:
df['price_per_hour']

0          0.000000
1          0.000000
2          0.000000
3          0.000000
4          0.098115
            ...    
89613     39.900000
89614    100.000000
89615     19.900000
89616    299.900000
89617     99.900000
Name: price_per_hour, Length: 89586, dtype: float64

In [26]:
df['tags']

0        {'FPS': 90857, 'Shooter': 65397, 'Multiplayer'...
1        {'Survival': 14838, 'Shooter': 12727, 'Battle ...
2        {'Free to Play': 59933, 'MOBA': 20158, 'Multip...
3        {'Open World': 32644, 'Action': 23539, 'Multip...
4        {'FPS': 9831, 'PvP': 9162, 'e-sports': 9072, '...
                               ...                        
89613    {'2D Platformer': 91, 'Runner': 85, 'Platforme...
89614    {'Action': 43, 'Adventure': 41, 'Indie': 41, '...
89615                    {'Indie': 31, 'Early Access': 21}
89616    {'RPG': 117, 'Action-Adventure': 111, 'Strateg...
89617                          {'Action': 22, 'Indie': 21}
Name: tags, Length: 89586, dtype: object

In [27]:
# Step 1: Convert string to real dictionary (if needed)
df['tags'] = df['tags'].fillna('{}').apply(ast.literal_eval)

# Step 2: Flatten and count all tags
from collections import Counter

tag_counter = Counter()
df['tags'].apply(lambda x: tag_counter.update(x))

# Step 3: (Optional) Keep only most common tags
top_tags = [tag for tag, count in tag_counter.most_common(10)]
print("Top 10 tags:", top_tags)


# --- Step 4: Extract vote count for each of those top tags ---
def extract_tag_votes(tag_dict, tag_list):
    if not isinstance(tag_dict, dict):
        return [0] * len(tag_list)
    return [tag_dict.get(tag, 0) for tag in tag_list]

tag_vectors = df['tags'].apply(lambda x: extract_tag_votes(x, top_tags))

# --- Step 5: Create new DataFrame with tag columns ---
tag_df = pd.DataFrame(tag_vectors.tolist(), columns=[f'tag_{tag}' for tag in top_tags], index=df.index)

# --- Step 6: Merge tag features into your main dataframe ---
df = pd.concat([df, tag_df], axis=1)


Top 10 tags: ['Action', 'Adventure', 'Singleplayer', 'Casual', 'Indie', '2D', 'Strategy', 'Simulation', 'RPG', 'Exploration']


In [28]:
df

Unnamed: 0,release_date,required_age,price,dlc_count,windows,mac,linux,metacritic_score,achievements,recommendations,...,tag_Action,tag_Adventure,tag_Singleplayer,tag_Casual,tag_Indie,tag_2D,tag_Strategy,tag_Simulation,tag_RPG,tag_Exploration
0,2012-08-21,0,0.00,1,True,False,True,0,1,4401572,...,47512,0,0,0,0,0,30111,0,0,0
1,2017-12-21,0,0.00,0,True,False,False,0,37,1732007,...,6013,0,0,0,0,0,3121,1390,0,0
2,2013-07-09,0,0.00,2,True,True,True,90,0,14337,...,7920,0,0,0,0,0,14252,1979,3792,0
3,2015-04-13,17,0.00,0,True,False,False,96,77,1803063,...,23539,13643,12512,0,0,0,0,0,0,0
4,2015-12-01,17,3.99,9,True,False,False,0,0,1165929,...,8928,0,0,0,0,0,4921,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89613,2024-02-29,0,3.99,0,True,False,False,0,6,0,...,0,0,62,70,48,79,0,0,0,0
89614,2020-01-07,0,10.00,0,True,False,False,0,0,0,...,43,41,0,0,41,0,40,0,0,0
89615,2019-10-10,0,1.99,0,True,False,False,0,0,0,...,0,0,0,0,31,0,0,0,0,0
89616,2025-01-25,0,29.99,0,True,False,False,0,34,0,...,0,75,43,0,0,0,107,102,117,0


In [29]:
df.columns

Index(['release_date', 'required_age', 'price', 'dlc_count', 'windows', 'mac',
       'linux', 'metacritic_score', 'achievements', 'recommendations',
       'supported_languages', 'developers', 'publishers', 'categories',
       'genres', 'user_score', 'positive', 'negative', 'estimated_owners',
       'average_playtime_forever', 'average_playtime_2weeks',
       'median_playtime_forever', 'median_playtime_2weeks', 'discount',
       'peak_ccu', 'tags', 'pct_pos_total', 'num_reviews_total',
       'pct_pos_recent', 'num_reviews_recent', 'estimated_owners_numeric',
       'revenue_estimate', 'windows_numeric', 'mac_numeric', 'linux_numeric',
       'platform_count', 'years_since_release', 'price_metacritic_ratio',
       'price_user_score_ratio', 'achievements_per_price',
       'dlc_price_interaction', 'price_per_hour', 'engagement_score',
       'review_rate', 'tag_Action', 'tag_Adventure', 'tag_Singleplayer',
       'tag_Casual', 'tag_Indie', 'tag_2D', 'tag_Strategy', 'tag_Simulation

In [30]:
# Select final features for modeling
feature_cols = [
    # Basic game info
    'price', 'required_age', 'dlc_count', 
    
    # Platform availability
    'windows_numeric', 'mac_numeric', 'linux_numeric', 'platform_count',
    
    # Temporal features
    'years_since_release',
    
    # Quality indicators
    'metacritic_score', 'achievements', 'recommendations',
    
    # Engagement metrics
    'average_playtime_forever', 'median_playtime_forever', 'peak_ccu',
    
    # Review metrics
    'pct_pos_total', 'num_reviews_total',
    
    # Engineered features
    'revenue_estimate', 'price_metacritic_ratio', 'price_per_hour', 'engagement_score',
    
    # tag columns(Most Common)
    'tag_Action', 'tag_Adventure', 'tag_Singleplayer', 'tag_Casual', 'tag_Indie', 'tag_2D', 'tag_Strategy', 'tag_Simulation', 'tag_RPG', 'tag_Exploration'
]

# Final feature set
X = df[feature_cols]

In [31]:
X

Unnamed: 0,price,required_age,dlc_count,windows_numeric,mac_numeric,linux_numeric,platform_count,years_since_release,metacritic_score,achievements,...,tag_Action,tag_Adventure,tag_Singleplayer,tag_Casual,tag_Indie,tag_2D,tag_Strategy,tag_Simulation,tag_RPG,tag_Exploration
0,0.00,0,1,1,0,1,2,13,0,1,...,47512,0,0,0,0,0,30111,0,0,0
1,0.00,0,0,1,0,0,1,8,0,37,...,6013,0,0,0,0,0,3121,1390,0,0
2,0.00,0,2,1,1,1,3,12,90,0,...,7920,0,0,0,0,0,14252,1979,3792,0
3,0.00,17,0,1,0,0,1,10,96,77,...,23539,13643,12512,0,0,0,0,0,0,0
4,3.99,17,9,1,0,0,1,10,0,0,...,8928,0,0,0,0,0,4921,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89613,3.99,0,0,1,0,0,1,1,0,6,...,0,0,62,70,48,79,0,0,0,0
89614,10.00,0,0,1,0,0,1,5,0,0,...,43,41,0,0,41,0,40,0,0,0
89615,1.99,0,0,1,0,0,1,6,0,0,...,0,0,0,0,31,0,0,0,0,0
89616,29.99,0,0,1,0,0,1,0,0,34,...,0,75,43,0,0,0,107,102,117,0


In [32]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89586 entries, 0 to 89617
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   price                     89586 non-null  float64
 1   required_age              89586 non-null  int64  
 2   dlc_count                 89586 non-null  int64  
 3   windows_numeric           89586 non-null  int32  
 4   mac_numeric               89586 non-null  int32  
 5   linux_numeric             89586 non-null  int32  
 6   platform_count            89586 non-null  int32  
 7   years_since_release       89586 non-null  int32  
 8   metacritic_score          89586 non-null  int64  
 9   achievements              89586 non-null  int64  
 10  recommendations           89586 non-null  int64  
 11  average_playtime_forever  89586 non-null  int64  
 12  median_playtime_forever   89586 non-null  int64  
 13  peak_ccu                  89586 non-null  int64  
 14  pct_pos_tot

In [33]:
X.columns

Index(['price', 'required_age', 'dlc_count', 'windows_numeric', 'mac_numeric',
       'linux_numeric', 'platform_count', 'years_since_release',
       'metacritic_score', 'achievements', 'recommendations',
       'average_playtime_forever', 'median_playtime_forever', 'peak_ccu',
       'pct_pos_total', 'num_reviews_total', 'revenue_estimate',
       'price_metacritic_ratio', 'price_per_hour', 'engagement_score',
       'tag_Action', 'tag_Adventure', 'tag_Singleplayer', 'tag_Casual',
       'tag_Indie', 'tag_2D', 'tag_Strategy', 'tag_Simulation', 'tag_RPG',
       'tag_Exploration'],
      dtype='object')

In [34]:
# Create composite success score based on multiple dimensions
df['financial_success'] = df['revenue_estimate'].rank(pct=True)
df['engagement_success'] = (df['median_playtime_forever'].rank(pct=True) + 
                          df['peak_ccu'].rank(pct=True)) / 2
df['critical_success'] = (df['metacritic_score'].rank(pct=True) + 
                        df['pct_pos_total'].rank(pct=True)) / 2

# Combine into final success score (weighted as you prefer)
df['success_score'] = (0.4 * df['financial_success'] + 
                       0.3 * df['engagement_success'] + 
                       0.3 * df['critical_success'])

In [35]:
from sklearn.model_selection import train_test_split

# Define X (features) and y (target)
X = df[['price', 'required_age', 'dlc_count', 'windows_numeric', 'mac_numeric',
       'linux_numeric', 'platform_count', 'years_since_release',
       'metacritic_score', 'achievements', 'recommendations',
       'average_playtime_forever', 'median_playtime_forever', 'peak_ccu',
       'pct_pos_total', 'num_reviews_total', 'revenue_estimate',
       'price_metacritic_ratio', 'price_per_hour', 'engagement_score',
       'tag_Action', 'tag_Adventure', 'tag_Singleplayer', 'tag_Casual',
       'tag_Indie', 'tag_2D', 'tag_Strategy', 'tag_Simulation', 'tag_RPG',
       'tag_Exploration']]
y = df['success_score']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [37]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.1),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    # Train model
    model.fit(X_train_scaled, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_scaled)
    
    # Evaluate
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    # Store results
    results[name] = {'RMSE': rmse, 'R²': r2}
    
    print(f"{name} - RMSE: {rmse:.4f}, R²: {r2:.4f}")

Linear Regression - RMSE: 0.1073, R²: 0.5470
Ridge - RMSE: 0.1073, R²: 0.5470
Lasso - RMSE: 0.1594, R²: -0.0001
Random Forest - RMSE: 0.0035, R²: 0.9995
Gradient Boosting - RMSE: 0.0059, R²: 0.9986


In [38]:
# Get feature importance from Random Forest
importances = models['Random Forest'].feature_importances_
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values('Importance', ascending=False)

print("Top 10 Most Important Features for Game Success:")
print(feature_importance.head(10))

Top 10 Most Important Features for Game Success:
                     Feature  Importance
16          revenue_estimate    0.744108
13                  peak_ccu    0.148239
14             pct_pos_total    0.077451
19          engagement_score    0.008594
8           metacritic_score    0.008044
12   median_playtime_forever    0.005213
11  average_playtime_forever    0.004350
10           recommendations    0.001394
15         num_reviews_total    0.001175
17    price_metacritic_ratio    0.000786


In [39]:
# For each top feature, create an insight statement
top_features = feature_importance.head(10)['Feature'].tolist()
print(top_features)
for feature in top_features:
    # Get correlation direction (positive/negative)
    correlation = np.corrcoef(X[feature], y)[0,1]
    direction = "positively" if correlation > 0 else "negatively"
    
    print(f"Feature '{feature}' impacts game success {direction}.")
    print(f"A one standard deviation change in {feature} changes the success score by approximately {importances[X.columns.get_loc(feature)]:.4f} standard deviations.\n")

['revenue_estimate', 'peak_ccu', 'pct_pos_total', 'engagement_score', 'metacritic_score', 'median_playtime_forever', 'average_playtime_forever', 'recommendations', 'num_reviews_total', 'price_metacritic_ratio']
Feature 'revenue_estimate' impacts game success positively.
A one standard deviation change in revenue_estimate changes the success score by approximately 0.7441 standard deviations.

Feature 'peak_ccu' impacts game success positively.
A one standard deviation change in peak_ccu changes the success score by approximately 0.1482 standard deviations.

Feature 'pct_pos_total' impacts game success positively.
A one standard deviation change in pct_pos_total changes the success score by approximately 0.0775 standard deviations.

Feature 'engagement_score' impacts game success positively.
A one standard deviation change in engagement_score changes the success score by approximately 0.0086 standard deviations.

Feature 'metacritic_score' impacts game success positively.
A one standard 

In [40]:
# Function to predict success probability for a game scenario
def predict_success_probability(feature_values, model=models['Random Forest'], scaler=scaler):
    # Convert input features to array
    features = np.array([feature_values])
    
    # Scale features
    features_scaled = scaler.transform(features)
    
    # Make prediction
    success_score = model.predict(features_scaled)[0]
    
    return success_score

# Example: Predict success for a hypothetical game
example_game = {
    'price': 19.99,
    'required_age': 0,
    'dlc_count': 2,
    'windows_numeric': 1,
    'mac_numeric': 1,
    'linux_numeric': 1,
    'platform_count': 3,
    'years_since_release': 0.5,
    'metacritic_score': 85,
    'achievements': 50,
    'recommendations': 10000,
    'average_playtime_forever': 500,
    'median_playtime_forever': 300,
    'peak_ccu': 5000,
    'pct_pos_total': 0.85,
    'num_reviews_total': 8000,
    'revenue_estimate': 1000000,
    'price_metacritic_ratio': 0.235,
    'price_per_hour': 0.067,
    'engagement_score': 1500000,
    'tag_Action': 1,
    'tag_Adventure': 1,
    'tag_Singleplayer': 1,
    'tag_Casual': 0,
    'tag_Indie': 0,
    'tag_2D': 0,
    'tag_Strategy': 0,
    'tag_Simulation': 0,
    'tag_RPG': 1,
    'tag_Exploration': 1,
    'tag_Multiplayer': 0
}

# Convert to the same order as your X columns
example_values = [example_game[feature] for feature in X.columns]
predicted_success = predict_success_probability(example_values)
print(f"Predicted success score: {predicted_success:.4f}")

Predicted success score: 0.8967




In [41]:
# Define success tiers based on your composite score
def categorize_success(success_score):
    if success_score > 0.8:
        return "Blockbuster Success"
    elif success_score > 0.6:
        return "Major Success"
    elif success_score > 0.4:
        return "Moderate Success"
    elif success_score > 0.2:
        return "Minor Success"
    else:
        return "Underperforming"

In [42]:
def generate_recommendations(game_features):
    # Get prediction
    success_score = predict_success_probability([game_features[feature] for feature in X.columns])
    category = categorize_success(success_score)
    
    print(f"Game projected to be: {category} (Score: {success_score:.2f})")
    
    # Generate recommendations based on feature importance
    recommendations = []
    
    # Example logic
    if game_features['price'] > 30 and 'price' in top_features:
        recommendations.append("Consider lowering the price point, as high prices correlate with lower success")
    
    if game_features['tag_Multiplayer'] == 0 and 'tag_Multiplayer' in top_features:
        recommendations.append("Adding multiplayer functionality could significantly boost success potential")
    else:
        recommendations.append("all good")
    for rec in recommendations:
        print("📝", rec)
    return recommendations

In [43]:
recom = generate_recommendations(example_game)

Game projected to be: Blockbuster Success (Score: 0.90)
📝 all good




In [46]:
from openai import OpenAI
prompt = f"""
This game is projected to be moderately successful with a score of 0.65.
Key input features:
- Price: ${example_game['price']}
- Genre tags: Action, RPG, Singleplayer
- Peak CCU: {example_game['peak_ccu']}
- Metacritic Score: {example_game['metacritic_score']}

Based on this, generate a 3-sentence explanation of why this game might succeed or fail and suggest one improvement.
"""

response = openai.ChatCompletion.create(
    model="gpt-4",
    messages=[{"role": "user", "content": prompt}]
)
print(response['choices'][0]['message']['content'])


NameError: name 'openai' is not defined