In [None]:
import pandas as pd

# Load the raw games table; the file name carries no directory, so it is
# resolved against the notebook's current working directory.
df = pd.read_csv("Games List 170.csv")

In [None]:
def normalize_list(col):
  """Normalize a column of comma-separated labels.

  Missing values become "", every value is cast to str, lower-cased and
  stripped of surrounding whitespace, and any whitespace around commas is
  removed, so "Action , RPG " becomes "action,rpg".

  Parameters
  ----------
  col : pandas.Series
      Column holding comma-separated labels.

  Returns
  -------
  pandas.Series
      A new normalized series; the input is not modified.
  """
  return (
      col.fillna("")
      .astype(str)  # matches the inline normalization used later in the notebook
      .str.lower()
      .str.strip()
      .str.replace(r"\s*,\s*", ",", regex=True)
  )


# These assignments must live at cell level: when indented under the function
# they sit after its `return` and are never executed.
df["Genre"] = normalize_list(df["Genre"])
df["Modes"] = normalize_list(df["Modes"])

In [None]:
# Normalize 'Genre' and 'Modes' right here so the counts below never depend on
# what an earlier cell did; the normalization is idempotent, so repeating it is
# harmless.
def _count_labels(text):
    """Number of comma-separated labels in `text` (0 for the empty string)."""
    if text == "":
        return 0
    return len(text.split(","))


for _label_col, _count_col in (("Genre", "genre_count"), ("Modes", "modes_count")):
    _cleaned = (
        df[_label_col]
        .fillna("")
        .astype(str)
        .str.lower()
        .str.strip()
        .str.replace(r"\s*,\s*", ",", regex=True)
    )
    df[_label_col] = _cleaned
    df[_count_col] = _cleaned.apply(_count_labels)

In [None]:
# Spot-check the normalized label columns next to their derived counts.
df.loc[:, ["Genre", "genre_count", "Modes", "modes_count"]].head(10)

Unnamed: 0,Genre,genre_count,Modes,modes_count
0,"shooter,rpg,adventure,action",4,"single player,multiplayer,co-operative",3
1,"shooter,arcade,action",3,"single player,multiplayer,co-operative",3
2,"action,rpg,stealth,parkour",4,"single player,multiplayer,co-operative",3
3,"action,adventure,shooter,fps",4,"single player,multiplayer",2
4,"action,rpg,stealth,parkour",4,single player,1
5,"action,rpg,stealth,parkour,vikings",5,single player,1
6,"action,adventure,co-op,fps",4,"single player,multipllayer,co-operative",3
7,"shooter,racing,action",3,"single player,multiplayer,co-operative",3
8,"action,rpg,stealth,parkour,adventure",5,"single player,multiplayer,co-operative",3
9,"action,adventure,fps",3,single player,1


In [None]:
# Rank individual genre labels by how many games carry them. Blank tokens
# (games with no genre) are excluded so "" can never become a "top genre".
genre_tokens = df["Genre"].str.split(",").explode()
top_genres = genre_tokens[genre_tokens != ""].value_counts().head(5)

top_genres_list = top_genres.index.tolist()
print(top_genres_list)

# One 0/1 indicator column per top genre, named has_<Title_Cased_Genre>.
for genre in top_genres_list:
  col_name = f"has_{genre.replace(' ', '_').title()}"
  # regex=False: the genre name is a literal substring, not a pattern, so
  # names containing characters such as "+" or "(" are matched safely.
  df[col_name] = df["Genre"].str.contains(genre, case = False, regex = False).astype(int)

# Preview the indicator columns. This must sit outside the loop: as the last
# expression of the cell it is rendered, inside the loop it was evaluated on
# every iteration and discarded.
df[[c for c in df.columns if c.startswith("has_")]].head(10)


['action', 'adventure', 'shooter', 'rpg', 'survival']


In [None]:
# Frequency of each individual mode label (kept for inspection).
mode_tokens = df["Modes"].str.split(",").explode()
top_modes = mode_tokens.value_counts().head(10)

# 0/1 indicators: does the Modes string mention the keyword (case-insensitive)?
for _flag_col, _needle in (("has_multiplayer", "Multiplayer"), ("has_coop", "Co-operative")):
    _mentions = df["Modes"].str.contains(_needle, case = False)
    df[_flag_col] = _mentions.astype(int)

In [None]:
# Keyword searched (case-insensitively) in "Player Persepective" for each
# 0/1 indicator column. The column name keeps the spelling used in the CSV.
_PERSPECTIVE_KEYWORDS = {
    "is_first_person": "First",
    "is_third_person": "Third",
    "is_isometric": "Isometric",
    "is_vr": "Virtual Reality",
    "is_side_view": "Side",
}

for _flag_col, _keyword in _PERSPECTIVE_KEYWORDS.items():
    # na=False makes missing perspectives count as "no match" directly, which
    # yields a boolean result without the object-dtype fillna(False) step.
    df[_flag_col] = (
        df["Player Persepective"]
        .str.contains(_keyword, case = False, na = False)
        .astype(int)
    )

In [None]:
# Parse release dates, reading ambiguous values as day-first (DD/MM/YYYY).
release_dates = pd.to_datetime(df["Release Date"], dayfirst = True)
df["Release Date"] = release_dates
df["Release Date"]

Unnamed: 0,Release Date
0,2018-10-26
1,2013-09-17
2,2017-10-27
3,2012-11-29
4,2020-11-10
...,...
164,2003-10-14
165,2012-05-15
166,2018-05-08
167,2018-02-20


In [None]:
# Chronological order (oldest first); the "prior average" features computed
# later rely on row order within each group.
df = df.sort_values(by = "Release Date", ascending = True)
df["Release Date"]

Unnamed: 0,Release Date
107,1994-03-25
61,1996-03-22
108,1996-08-31
74,1997-09-24
113,1997-10-10
...,...
147,2025-06-10
154,2025-09-18
125,2025-10-02
28,2025-11-09


In [None]:
def _franchise_prior_mean(ratings):
    """Running mean of the rows that come before each row in the group."""
    # shift() drops the current row from its own average.
    return ratings.shift().expanding().mean()


# Per-franchise running averages over the preceding rows (df is already in
# release order from the sort above).
franchise_groups = df.groupby("Franchise")
prior_critic = franchise_groups["critics_rating"].transform(_franchise_prior_mean)
prior_user = franchise_groups["user_rating"].transform(_franchise_prior_mean)

# Dataset-wide means, used as the fallback when there is no prior average.
global_critic_mean = df["critics_rating"].mean()
global_user_mean = df["user_rating"].mean()
print(global_critic_mean, global_user_mean)

df["franchise_avg_critic"] = prior_critic.fillna(global_critic_mean)
df["franchise_avg_user"] = prior_user.fillna(global_user_mean)

# Flag rows with no prior critic average; taken from the un-filled series.
df["is_first_franchise_game"] = prior_critic.isna().astype(int)

df.to_csv("Games with avg ratings.csv", index = False)

7.699408284023669 7.367455621301775


In [None]:
# Reuse the shared helper instead of repeating the normalization chain a third
# time: lower-case, trim, and remove whitespace around commas.
df["Studio"] = normalize_list(df["Studio"])

In [None]:
# One row per (game, studio): split the comma-separated Studio string and
# explode it. Exploded rows keep the index label of the game they came from.
# NOTE(review): a game whose Studio is blank yields a single "" token here, so
# all such games would share one "" studio group downstream - confirm whether
# the dataset contains any.
studio_cols = ["Name", "Studio", "Release Date", "critics_rating", "user_rating"]
df_studio = df[studio_cols].copy()
df_studio["Studio"] = df_studio["Studio"].str.split(",")
df_studio = df_studio.explode("Studio")

In [None]:
def _studio_prior_mean(ratings):
    """Running mean of the rows that come before each row in the group."""
    # shift() keeps a game's own rating out of its feature value.
    return ratings.shift().expanding().mean()


df_studio = df_studio.sort_values("Release Date")

# For every (game, studio) row: average rating of that studio's earlier rows.
# The first row of each studio has no history and stays NaN.
studio_groups = df_studio.groupby("Studio")
df_studio["studio_avg_critic"] = studio_groups["critics_rating"].transform(_studio_prior_mean)
df_studio["studio_avg_user"] = studio_groups["user_rating"].transform(_studio_prior_mean)

df_studio.head(25)

Unnamed: 0,Name,Studio,Release Date,critics_rating,user_rating,studio_avg_critic,studio_avg_user
107,The Elder Scrolls: Arena,bethesda softworks,1994-03-25,7.0,6.83,,
61,Resident Evil,capcom planning room 2,1996-03-22,8.9,8.6,,
108,The Elder Scrolls II: Daggerfall,bethesda game studios,1996-08-31,8.25,8.15,,
74,Postal,running with scissors,1997-09-24,2.8,7.2,,
113,Fallout: A Post Nuclear Role Playing Game,interplay entertainment,1997-10-10,9.2,9.05,,
113,Fallout: A Post Nuclear Role Playing Game,black isle studios,1997-10-10,9.2,9.05,,
112,An Elder Scrolls Legend: Battlespire,bethesda softworks,1997-11-30,5.7,6.0,7.0,6.83
62,Resident Evil 2,capcom planning room 2,1998-01-21,7.6,8.8,8.9,8.6
114,Fallout 2,black isle studios,1998-10-29,8.75,9.1,9.2,9.05
111,The Elder Scrolls Adventures: Redguard,bethesda softworks,1998-10-31,6.5,6.5,6.35,6.415


In [None]:
# Collapse back to one row per game by averaging the prior-average features
# across that game's studios (NaNs are skipped by mean()).
studio_agg = df_studio.groupby("Name", as_index = False)[
    ["studio_avg_critic", "studio_avg_user"]
].mean()

In [None]:
# Columns the merge is about to add. If the cell is re-run they already exist
# in df, and merging again would produce _x/_y suffixed duplicates.
common_cols = ["studio_avg_critic", "studio_avg_user"]
already_present = [name for name in common_cols if name in df.columns]

# Remove any stale copies, then attach the per-game studio averages by Name.
df = df.drop(columns = already_present).merge(studio_agg, on = "Name", how = "left")
df.head(25)



Unnamed: 0,Name,Studio,Genre,Modes,Modding Support,Spinoff/Sequel,Open World,Player Persepective,Platforms,Platform Count,...,is_first_person,is_third_person,is_isometric,is_vr,is_side_view,franchise_avg_critic,franchise_avg_user,is_first_franchise_game,studio_avg_critic,studio_avg_user
0,The Elder Scrolls: Arena,bethesda softworks,"rpg,action,fantasy,sandbox",single player,No,No,Yes,First Person,PC,1,...,1,0,0,0,0,7.699408,7.367456,1,,
1,Resident Evil,capcom planning room 2,"adventure,action,horror,survival",single player,No,No,No,Third Person,"PC, Playstation",2,...,0,1,0,0,0,7.699408,7.367456,1,,
2,The Elder Scrolls II: Daggerfall,bethesda game studios,"rpg,adventure,action,fantasy,historical,sandbox",single player,Yes,Yes,Yes,"First Person, Third Person",DOS,1,...,1,1,0,0,0,7.0,6.83,0,,
3,Postal,running with scissors,"shooter,indie,action","single player,multiplayer",No,No,No,Bird view/Isometric,"Linux, Mac, PC",3,...,0,0,1,0,0,7.699408,7.367456,1,,
4,Fallout: A Post Nuclear Role Playing Game,"interplay entertainment,black isle studios","rpg,science fiction,survival,stealth",single player,Limited,No,No,Bird view/Isometric,"DOS, Linux, Mac, OnLive Game System, PC",5,...,0,0,1,0,0,7.699408,7.367456,1,,
5,An Elder Scrolls Legend: Battlespire,bethesda softworks,"rpg,action,fantasy","single player,multiplayer",No,Yes,No,First Person,DOS,1,...,1,0,0,0,0,7.625,7.49,0,7.0,6.83
6,Resident Evil 2,capcom planning room 2,"shooter,adventure,action,horror,survival",single player,No,Yes,No,Third Person,"Dreamcast, Nintendo GameCube, PC, Playstation,...",6,...,0,1,0,0,0,8.9,8.6,0,8.9,8.6
7,Fallout 2,black isle studios,"puzzle,rpg,science fiction,survival,stealth,co...",single player,Limited,Yes,No,Bird view/Isometric,"Mac, PC",2,...,0,0,1,0,0,9.2,9.05,0,9.2,9.05
8,The Elder Scrolls Adventures: Redguard,bethesda softworks,"rpg,adventure,action,fantasy,historical",single player,No,Yes,No,Third Person,"DOS, PC",2,...,0,1,0,0,0,6.983333,6.993333,0,6.35,6.415
9,Silent Hill,team silent,"puzzle,adventure,action,horror,survival",single player,Limited,No,No,Third Person,"Playstation, Playstation 3/Portable",3,...,0,1,0,0,0,7.699408,7.367456,1,,


In [None]:
# Dataset-wide means, used as the fallback for games with no studio history.
global_critic_mean = df["critics_rating"].mean()
global_user_mean = df["user_rating"].mean()

# Flag games without studio history BEFORE imputing: once the NaNs are filled
# isna() is False everywhere and the flag would be 0 for every row.
# (Re-running this cell without re-running the merge above would therefore
# reset the flag to 0.)
df["has_new_studio"] = df["studio_avg_critic"].isna().astype(int)

df["studio_avg_critic"] = df["studio_avg_critic"].fillna(global_critic_mean)
df["studio_avg_user"] = df["studio_avg_user"].fillna(global_user_mean)

df.to_csv("games_engineered.csv", index = False)

In [None]:
from google.colab import files
#files.download("games_engineered.csv")

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import ensemble
from sklearn.metrics import mean_squared_error


# Reload the engineered table written earlier and encode the categorical
# yes/no style columns as integers. Values missing from a mapping become NaN.
fdf = pd.read_csv("games_engineered.csv")

yes_no_codes = {"Yes": 1, "No": 0}
for _binary_col in ("Open World", "Spinoff/Sequel"):
    fdf[_binary_col] = fdf[_binary_col].map(yes_no_codes)

# Ordinal encoding: none < limited < full modding support.
modding_codes = {"No": 0, "Limited": 1, "Yes": 2}
fdf["Modding Support"] = fdf["Modding Support"].map(modding_codes)

In [None]:
# Features shared by the critic-rating and user-rating models.
base_features = [
    "is_first_franchise_game",
    "genre_count",
    "modes_count",
    "content_density",
    "Month",
    "price_per_completionist_hour",
    "price_per_story_hour",
    "Launch Price",
    "has_map_size",
    "Map Size(sqkm)",
    "Story Hours",
    "Extra Hours",
    "Completionist",
    "Open World",
    "Spinoff/Sequel",
    "unique_factor",
    "Modding Support",
    "is_first_person",
    "is_third_person",
    "is_isometric",
    "is_vr",
    "is_side_view",
    "has_Action",
    "has_Adventure",
    "has_Shooter",
    "has_Rpg",
    "has_multiplayer",
    "has_coop",
]

# Each target additionally gets the studio / franchise history for that target.
critic_features = [*base_features, "studio_avg_critic", "franchise_avg_critic"]
user_features = [*base_features, "studio_avg_user", "franchise_avg_user"]

In [None]:
# Split whole rows once so both targets and both feature sets share the same
# train/test partition.
X_all = fdf

X_train, X_test = train_test_split(
    X_all,
    test_size = 0.33,
    random_state = 42
)

X_train_critic = X_train[critic_features]
X_test_critic  = X_test[critic_features]

X_train_user = X_train[user_features]
X_test_user  = X_test[user_features]

# Take the targets from the split rows themselves. Looking them up in `df` by
# index label only works while `df` happens to have the same row order and
# default index as the CSV that `fdf` was read from, and it makes this cell
# depend on a second frame for no benefit.
y_train_critic = X_train["critics_rating"]
y_test_critic  = X_test["critics_rating"]

y_train_user = X_train["user_rating"]
y_test_user  = X_test["user_rating"]


In [None]:
import xgboost as xgb

# Columns that may arrive as text such as "$59.99" or a spreadsheet "#DIV/0!".
price_cols = ["price_per_completionist_hour", "price_per_story_hour", "Launch Price"]


def preprocess_price_columns(df_features):
    """Convert price columns to float and median-fill numeric NaNs.

    For every name in `price_cols` that is present, the column is cast to
    text, "$" is removed, "#DIV/0!" and empty strings become NaN, and the
    result is cast to float. Afterwards every float64/int64 column that still
    contains NaN is filled with that column's own median.

    The frame is modified in place and also returned, so pass a copy when the
    original must be kept.
    """
    present_price_cols = [name for name in price_cols if name in df_features.columns]
    for name in present_price_cols:
        as_text = df_features[name].astype(str).str.replace('$', '', regex=False)
        as_text = as_text.replace('#DIV/0!', np.nan).replace('', np.nan)
        df_features[name] = as_text.astype(float)

    # Median-impute whatever numeric gaps remain (including the ones just
    # created from unparseable price cells).
    for name in df_features.columns:
        column = df_features[name]
        if column.dtype not in ['float64', 'int64']:
            continue
        if not column.isnull().any():
            continue
        df_features[name] = column.fillna(column.median())

    return df_features

# Clean every feature matrix on its own copy so the split frames stay intact.
X_train_critic, X_test_critic, X_train_user, X_test_user = (
    preprocess_price_columns(features.copy())
    for features in (X_train_critic, X_test_critic, X_train_user, X_test_user)
)

# Replace missing targets with the corresponding dataset-wide mean.
y_train_critic, y_test_critic = (
    target.fillna(global_critic_mean) for target in (y_train_critic, y_test_critic)
)
y_train_user, y_test_user = (
    target.fillna(global_user_mean) for target in (y_train_user, y_test_user)
)

# --- Gradient boosting: one model per target, identical hyper-parameters ---
params = dict(
    n_estimators=500,
    max_depth=4,
    min_samples_split=5,
    learning_rate=0.01,
    loss='squared_error',
    random_state=42,
)
gb_critic = ensemble.GradientBoostingRegressor(**params).fit(X_train_critic, y_train_critic)
gb_user = ensemble.GradientBoostingRegressor(**params).fit(X_train_user, y_train_user)

# --- Random forest: one model per target, identical hyper-parameters ---
rf_settings = dict(n_estimators=200, max_depth=10, min_samples_leaf=5, random_state=42)
rf_critic = RandomForestRegressor(**rf_settings).fit(X_train_critic, y_train_critic)
rf_user = RandomForestRegressor(**rf_settings).fit(X_train_user, y_train_user)

# --- XGBoost with library defaults ---
xgb_critic = xgb.XGBRegressor().fit(X_train_critic, y_train_critic)
xgb_user = xgb.XGBRegressor().fit(X_train_user, y_train_user)

# Test-set predictions for every model / target pair.
y_pred_critic_gb, y_pred_user_gb = gb_critic.predict(X_test_critic), gb_user.predict(X_test_user)
y_pred_critic_rf, y_pred_user_rf = rf_critic.predict(X_test_critic), rf_user.predict(X_test_user)
y_pred_critic_xgb, y_pred_user_xgb = xgb_critic.predict(X_test_critic), xgb_user.predict(X_test_user)

# Actual vs. predicted tables for GB and then RF (critic first, then user).
for _tag, _critic_pred, _user_pred in (
    ("GB", y_pred_critic_gb, y_pred_user_gb),
    ("RF", y_pred_critic_rf, y_pred_user_rf),
):
    comparison_df_critic = pd.DataFrame({
        f'Actual Critic Rating {_tag}': y_test_critic,
        f'Predicted Critic Rating {_tag}': _critic_pred,
    })
    display(comparison_df_critic)

    comparison_df_user = pd.DataFrame({
        f'Actual User Rating {_tag}': y_test_user,
        f'Predicted User Rating {_tag}': _user_pred,
    })
    display(comparison_df_user)

Unnamed: 0,Actual Critic Rating GB,Predicted Critic Rating GB
138,8.65,8.39053
30,9.55,9.091853
119,6.25,5.675446
29,5.5,6.733689
143,7.3,7.481331
162,8.9,7.748339
165,7.45,7.588771
51,8.8,7.445984
105,7.75,8.650767
60,7.2,7.419304


Unnamed: 0,Actual User Rating GB,Predicted User Rating GB
138,9.08,8.194979
30,9.13,7.636648
119,3.1,6.552317
29,5.6,5.889682
143,7.3,3.716503
162,9.0,8.392536
165,8.03,7.641008
51,7.85,6.865501
105,7.55,7.453269
60,8.48,7.273604


Unnamed: 0,Actual Critic Rating RF,Predicted Critic Rating RF
138,8.65,8.152796
30,9.55,8.814826
119,6.25,6.919977
29,5.5,6.366781
143,7.3,7.501574
162,8.9,7.729865
165,7.45,7.551222
51,8.8,7.299981
105,7.75,8.272893
60,7.2,7.595658


Unnamed: 0,Actual User Rating RF,Predicted User Rating RF
138,9.08,8.347201
30,9.13,7.470024
119,3.1,7.152667
29,5.6,6.073155
143,7.3,5.456293
162,9.0,8.161192
165,8.03,7.16464
51,7.85,6.981118
105,7.55,7.81341
60,8.48,7.105519


In [None]:
# Impurity-based importances of the gradient-boosting models, largest first.
_gb_critic_scores = pd.Series(gb_critic.feature_importances_, index=X_train_critic.columns)
feat_imp_gb_critic = _gb_critic_scores.sort_values(ascending=False)

_gb_user_scores = pd.Series(gb_user.feature_importances_, index=X_train_user.columns)
feat_imp_gb_user = _gb_user_scores.sort_values(ascending=False)

print("Feature Importances for Critic Ratings (Gradient Boosting):")
display(feat_imp_gb_critic)

print("\nFeature Importances for User Ratings (Gradient Boosting):")
display(feat_imp_gb_user)

Feature Importances for Critic Ratings (Gradient Boosting):


Unnamed: 0,0
Modding Support,0.220532
Launch Price,0.133534
studio_avg_critic,0.113847
genre_count,0.095992
franchise_avg_critic,0.084339
price_per_completionist_hour,0.066619
Story Hours,0.055655
Extra Hours,0.038352
price_per_story_hour,0.037642
Month,0.025977



Feature Importances for User Ratings (Gradient Boosting):


Unnamed: 0,0
Story Hours,0.220225
studio_avg_user,0.179196
Modding Support,0.114365
Month,0.081497
Launch Price,0.068322
price_per_story_hour,0.050312
unique_factor,0.043901
price_per_completionist_hour,0.04138
Extra Hours,0.038317
genre_count,0.037647


In [None]:
# Impurity-based importances of the random-forest models, largest first.
_rf_critic_scores = pd.Series(rf_critic.feature_importances_, index=X_train_critic.columns)
feat_imp_rf_critic = _rf_critic_scores.sort_values(ascending=False)

_rf_user_scores = pd.Series(rf_user.feature_importances_, index=X_train_user.columns)
feat_imp_rf_user = _rf_user_scores.sort_values(ascending=False)

print("Feature Importances for Critic Ratings (Random Forest):")
display(feat_imp_rf_critic)

print("\nFeature Importances for User Ratings (Random Forest):")
display(feat_imp_rf_user)

Feature Importances for Critic Ratings (Random Forest):


Unnamed: 0,0
Modding Support,0.230514
studio_avg_critic,0.107003
franchise_avg_critic,0.103854
Launch Price,0.091708
genre_count,0.085944
Story Hours,0.06787
price_per_completionist_hour,0.050282
Completionist,0.046818
is_third_person,0.038271
price_per_story_hour,0.031848



Feature Importances for User Ratings (Random Forest):


Unnamed: 0,0
studio_avg_user,0.207908
Story Hours,0.176122
Modding Support,0.105797
genre_count,0.065893
Month,0.054792
price_per_completionist_hour,0.054036
unique_factor,0.053776
franchise_avg_user,0.048789
Launch Price,0.042865
price_per_story_hour,0.033057


In [None]:
# Feature importances reported by the XGBoost models, largest first.
_xgb_critic_scores = pd.Series(xgb_critic.feature_importances_, index=X_train_critic.columns)
feat_imp_xgb_critic = _xgb_critic_scores.sort_values(ascending=False)

_xgb_user_scores = pd.Series(xgb_user.feature_importances_, index=X_train_user.columns)
feat_imp_xgb_user = _xgb_user_scores.sort_values(ascending=False)

print("Feature Importances for Critic Ratings (XGBoost):")
display(feat_imp_xgb_critic)

print("\nFeature Importances for User Ratings (XGBoost):")
display(feat_imp_xgb_user)

Feature Importances for Critic Ratings (XGBoost):


Unnamed: 0,0
Modding Support,0.3412191
Launch Price,0.176544
unique_factor,0.1011905
studio_avg_critic,0.05834715
franchise_avg_critic,0.05532679
is_third_person,0.05328961
has_Adventure,0.05246177
modes_count,0.02794412
price_per_story_hour,0.02018239
Extra Hours,0.02011232



Feature Importances for User Ratings (XGBoost):


Unnamed: 0,0
Modding Support,0.240182
has_map_size,0.171702
Story Hours,0.124135
unique_factor,0.118183
studio_avg_user,0.102715
Launch Price,0.035431
Month,0.028391
Open World,0.020665
modes_count,0.020328
price_per_story_hour,0.0193


In [None]:
# Report MAE / RMSE / R2 on the test set for every model and target.
# Tuples are (heading, actual, predicted); bare strings are separator lines.
_metric_report = [
    ("Critic Ratings Metrics GB:", y_test_critic, y_pred_critic_gb),
    ("\nUser Ratings Metrics GB:", y_test_user, y_pred_user_gb),
    "-----------------------------------",
    ("Critic Ratings Metrics RF:", y_test_critic, y_pred_critic_rf),
    ("\nUser Ratings Metrics RF:", y_test_user, y_pred_user_rf),
    "------------------------------------",
    ("Critics Ratings Metrics XGB:", y_test_critic, y_pred_critic_xgb),
    ("\nUser Ratings Metrics:", y_test_user, y_pred_user_xgb),
]

for _entry in _metric_report:
    if isinstance(_entry, str):
        print(_entry)
        continue
    _heading, _actual, _predicted = _entry
    print(_heading)
    print("MAE:", mean_absolute_error(_actual, _predicted))
    print("RMSE:", np.sqrt(mean_squared_error(_actual, _predicted)))
    print("R2:", r2_score(_actual, _predicted))



Critic Ratings Metrics GB:
MAE: 0.9291601860378507
RMSE: 1.3092254492702595
R2: 0.23223207250072275

User Ratings Metrics GB:
MAE: 0.9516569393008423
RMSE: 1.2229614139898166
R2: 0.16280789431396747
-----------------------------------
Critic Ratings Metrics RF:
MAE: 0.9161812886964794
RMSE: 1.1800860902476629
R2: 0.37622427259715363

User Ratings Metrics RF:
MAE: 0.8703104063940971
RMSE: 1.1499354088629747
R2: 0.25980438307479026
------------------------------------
Critics Ratings Metrics XGB:
MAE: 1.0547612735203333
RMSE: 1.5172193650898729
R2: -0.031093010846143798

User Ratings Metrics:
MAE: 1.0394216787815094
RMSE: 1.3754672811937414
R2: -0.059010200531876444


### Predict Ratings for a New Game

Edit the `user_input` dictionary below with the details of your new game. The script will then process these features and use the trained models to predict critic and user ratings.

In [None]:
# =========================================================================
# Define Lookup Dictionaries and Fallback Global Means
# =========================================================================

# During training a game's studio/franchise feature is the mean rating of the
# games that came before it. For a game that is not in the dataset yet, that is
# the mean rating of ALL games currently recorded for the studio/franchise, so
# the lookups are built from the ratings themselves (not from the per-row
# feature values, which would mix in the global-mean filler of first games).
# dropna() removes groups without any rating so they fall back to the global
# mean below instead of yielding NaN.

# Studio average dictionaries (keys are lowercased studio names)
studio_avg_critic_dict = df_studio.groupby('Studio')['critics_rating'].mean().dropna().to_dict()
studio_avg_user_dict = df_studio.groupby('Studio')['user_rating'].mean().dropna().to_dict()
# A blank studio name is "unknown", never a real studio to look up.
studio_avg_critic_dict.pop("", None)
studio_avg_user_dict.pop("", None)

# Franchise average dictionaries (keys are case-sensitive franchise names as in df)
franchise_avg_critic_dict = df.groupby('Franchise')['critics_rating'].mean().dropna().to_dict()
franchise_avg_user_dict = df.groupby('Franchise')['user_rating'].mean().dropna().to_dict()

# Feature lists, repeated here so the cell documents exactly which columns the
# trained models expect (must stay identical to the lists used for training).
base_features = [
    "is_first_franchise_game", "genre_count", "modes_count", "content_density", "Month",
    "price_per_completionist_hour", "price_per_story_hour", "Launch Price", "has_map_size",
    "Map Size(sqkm)", "Story Hours", "Extra Hours", "Completionist", "Open World",
    "Spinoff/Sequel", "unique_factor", "Modding Support", "is_first_person", "is_third_person",
    "is_isometric", "is_vr", "is_side_view", "has_Action", "has_Adventure",
    "has_Shooter", "has_Rpg", "has_multiplayer", "has_coop"
]
critic_features = base_features + ["studio_avg_critic", "franchise_avg_critic"]
user_features = base_features + ["studio_avg_user", "franchise_avg_user"]

# =========================
# USER INPUT (EDIT THIS DICTIONARY)
# =========================

user_input = {
    "game_name": "Epic Masterpiece Title",
    "studio": "Rockstar Games",
    "franchise": "Grand Theft Auto",
    "launch_price": 69.99,
    "release_year": 2025,
    "release_month": 10, # October is a prime release month
    "genres": ["Action", "Adventure", "RPG", "Open World", "Crime", "Shooter"], # Popular and diverse genres
    "modes": ["Single Player", "Multiplayer", "Co-operative"], # Comprehensive modes
    "platforms": ["PC", "Playstation 5", "Xbox Series X/S"], # Broad platform availability
    "open_world": 1,
    "sequel": 1, # Leveraging an established successful franchise
    "story_hours": 60,
    "extra_hours": 150,
    "completionist_hours": 250,
    "map_size": 250,  # Large open world map
    "modding_support": "yes",  # Full modding support
    "player_perspective": ["Third Person", "First Person"], # Flexible perspective options
    "unique_factor": 1 # Assuming a positive 'unique_factor' for innovative or high-quality games
}

# =========================
# DERIVED FEATURES
# =========================

# Counts
genre_count = len(user_input["genres"])
modes_count = len(user_input["modes"])

# Value metrics (0 when the hour count is not positive, to avoid dividing by zero)
price_per_story = user_input["launch_price"] / user_input["story_hours"] if user_input["story_hours"] > 0 else 0
price_per_completionist = user_input["launch_price"] / user_input["completionist_hours"] if user_input["completionist_hours"] > 0 else 0

# Content density: completionist hours per unit of map size (0 without a map size)
has_map_size = int(user_input["map_size"] > 0)
content_density = (
    user_input["completionist_hours"] / user_input["map_size"]
    if has_map_size else 0
)

# Modding encoding (unrecognised values are treated as "no")
modding_map = {"no": 0, "limited": 1, "yes": 2}
modding_encoded = modding_map.get(user_input["modding_support"].lower(), 0)

# Studio / Franchise averages, falling back to the global means when unknown.
# Studio keys are lower-cased and trimmed, like the normalized Studio column.
studio_key = user_input["studio"].strip().lower()
predicted_studio_avg_critic = studio_avg_critic_dict.get(studio_key, global_critic_mean)
predicted_studio_avg_user = studio_avg_user_dict.get(studio_key, global_user_mean)

predicted_franchise_avg_critic = franchise_avg_critic_dict.get(
    user_input["franchise"], global_critic_mean
)
predicted_franchise_avg_user = franchise_avg_user_dict.get(
    user_input["franchise"], global_user_mean
)

# The training flags were built with case-insensitive substring matching on the
# comma-joined label strings, so the same rule is applied here. Exact matching
# would, for example, miss "Bird view/Isometric" for is_isometric.
genre_text = ",".join(g.strip().lower() for g in user_input["genres"])
modes_text = ",".join(m.strip().lower() for m in user_input["modes"])
perspective_text = ",".join(p.strip().lower() for p in user_input["player_perspective"])

# Genre flags: only the four genre indicators that are model features.
genre_flags = {
    f"has_{g.title()}": int(g in genre_text)
    for g in ("action", "adventure", "shooter", "rpg")
}

# Mode flags
mode_flags = {
    "has_multiplayer": int("multiplayer" in modes_text),
    "has_coop": int("co-operative" in modes_text),
}

# Player perspective flags (same keywords as the training columns)
perspective_flags = {
    "is_first_person": int("first" in perspective_text),
    "is_third_person": int("third" in perspective_text),
    "is_isometric": int("isometric" in perspective_text),
    "is_vr": int("virtual reality" in perspective_text),
    "is_side_view": int("side" in perspective_text),
}

# Is first franchise game: no rated game of this franchise is in the dataset
is_first_franchise_game = int(user_input["franchise"] not in franchise_avg_critic_dict)

# Month
month = user_input["release_month"]

# `unique_factor` cannot be derived from the other inputs; it is taken directly
# from the value supplied in user_input.
unique_factor = user_input["unique_factor"]

# =========================
# FINAL MODEL INPUT ROWS
# =========================

# Common data for both critic and user model inputs
common_model_data = {
    "is_first_franchise_game": is_first_franchise_game,
    "genre_count": genre_count,
    "modes_count": modes_count,
    "content_density": content_density,
    "Month": month,
    "price_per_completionist_hour": price_per_completionist,
    "price_per_story_hour": price_per_story,
    "Launch Price": user_input["launch_price"],
    "has_map_size": has_map_size,
    "Map Size(sqkm)": user_input["map_size"],
    "Story Hours": user_input["story_hours"],
    "Extra Hours": user_input["extra_hours"],
    "Completionist": user_input["completionist_hours"],
    "Open World": user_input["open_world"],
    "Spinoff/Sequel": user_input["sequel"],
    "unique_factor": unique_factor,
    "Modding Support": modding_encoded,
    **perspective_flags,
    **genre_flags,
    **mode_flags,
}

# Model input for Critic Ratings. Selecting by the feature list fixes the
# column order and raises KeyError if a required feature is missing.
model_input_critic_data = {
    **common_model_data,
    "studio_avg_critic": predicted_studio_avg_critic,
    "franchise_avg_critic": predicted_franchise_avg_critic,
}
model_input_critic = pd.DataFrame([model_input_critic_data])[critic_features]

# Model input for User Ratings
model_input_user_data = {
    **common_model_data,
    "studio_avg_user": predicted_studio_avg_user,
    "franchise_avg_user": predicted_franchise_avg_user,
}
model_input_user = pd.DataFrame([model_input_user_data])[user_features]

# =========================
# PREDICTION
# =========================

# The random-forest models are used for the final prediction.
predicted_critic_rating = rf_critic.predict(model_input_critic)[0]
predicted_user_rating = rf_user.predict(model_input_user)[0]

print("🎮 Game:", user_input["game_name"])
print(f"⭐ Predicted Critic Rating: {predicted_critic_rating:.2f} / 10")
print(f"👥 Predicted User Rating: {predicted_user_rating:.2f} / 10")

🎮 Game: Epic Masterpiece Title
⭐ Predicted Critic Rating: 8.58 / 10
👥 Predicted User Rating: 8.21 / 10


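Based on the predictions from our models and the input features, here's an interpretation of what seems to make a game low-rated versus high-rated. Keep in mind that the models explain only part of the variation in ratings on the test set (the best R² is about 0.38 for critic ratings and 0.26 for user ratings, both from the Random Forest, and XGBoost scores below zero), so the points below are tentative patterns rather than firm conclusions: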

What makes a game High-Rated (e.g., 'Cyberpunk 2077: Phantom Liberty'):

Strong Developer/Franchise History: Games from established studios (CD Projekt Red) and well-known franchises (Cyberpunk) tend to score higher. The models rank studio_avg_critic, studio_avg_user, and franchise_avg_critic among their more important features (franchise_avg_user ranks lower, appearing only in the Random Forest list above), indicating that past performance is a useful predictor.
Content-Rich: A good amount of Story Hours, Extra Hours, and high Completionist hours suggest a game with substantial content. This correlates with lower price_per_completionist_hour and price_per_story_hour, which are important factors for user ratings.
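Modding Support: Modding Support is the top-ranked feature for critic ratings in all three models and ranks in the top three for user ratings. Feature importance does not show the direction of an effect, so the reading that the presence of Modding Support (especially 'yes') is a positive factor, reflecting community engagement and longevity, is an interpretation that should be confirmed, for example with a partial dependence plot.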
Popular Genres: Including popular genres like 'RPG', 'Action', 'Adventure', and 'Open World' can contribute positively.
Open World Design: Games with Open World elements and a measurable Map Size often offer more exploration and content.
Sequel/Spinoff: Being a Spinoff/Sequel implies an existing player base and often refined mechanics, which can lead to higher ratings.
What makes a game Low-Rated (e.g., 'Generic Bad Game 2024'):

Unknown Developer/Franchise: If the Studio or Franchise is new or unknown, the models fall back to global average ratings, which tend to be lower than those of successful, established entities.
Poor Value for Money: A high Launch Price coupled with very low Story Hours, Extra Hours, and Completionist hours results in a very high price_per_story_hour and price_per_completionist_hour. This is a strong negative indicator, particularly for user ratings.
Limited Content/Features: Low genre_count, few modes (e.g., only single player), no Modding Support, and no Open World features can contribute to lower scores.
Niche/Unpopular Genres: Less mainstream or experimental genres can appeal to a smaller audience and potentially receive lower overall ratings.
Release Month: While less impactful than other features, the Month of release can have some influence, with January often being a quieter month for major releases.
In essence, both critics and users value content, value for money, and the reputation of the developer/franchise. User ratings tend to be more sensitive to gameplay length and value (price per hour), while critic ratings place a slightly higher emphasis on modding support and established studios.

Based on our model's analysis, here are key strategies a developer should consider to maximize their game's chances of being highly-rated:

Build on Strong Foundations (or establish them early):

Leverage Franchise and Studio Reputation: If possible, develop within an established, successful franchise. The models give substantial weight to studio_avg_critic, studio_avg_user, and franchise_avg_critic, and some weight to franchise_avg_user. A strong track record builds trust and expectations.
If New, Focus on Quality: For new IPs or studios, exceptional quality in the first release is crucial to start building a positive reputation that future titles can benefit from.
Prioritize Content and Value for Money:

Extensive Gameplay Hours: Offer substantial Story Hours, Extra Hours (side quests, collectibles), and Completionist content. Gamers and critics appreciate games that offer many hours of engagement.
Strategic Pricing: Ensure the Launch Price aligns with the amount of content provided. A good price_per_story_hour and price_per_completionist_hour is vital, especially for user ratings. High price for low content is a strong negative indicator.
Content Density: In open-world games, ensure the Map Size is densely packed with meaningful activities to maintain a good content_density ratio.
Embrace Community and Features:

Modding Support: Offering Modding Support significantly boosts critic ratings and community engagement. It extends the game's lifespan and allows players to customize their experience.
Diverse Modes: Include Single Player, Multiplayer, and Co-operative modes where appropriate. This broadens appeal and caters to different player preferences.
Target Popular and Diverse Genres:

Mainstream Appeal: Incorporate elements from popular genres like Action, Adventure, RPG, and Open World. While niche genres have their place, broader appeal can lead to higher overall scores.
Genre Blending: Thoughtfully combine genres to create a rich and varied gameplay experience, as indicated by a higher genre_count for highly-rated games.
Consider Player Perspective and Open World Design:

Flexible Perspectives: Offering multiple Player Perspectives (e.g., First Person, Third Person) can enhance player immersion and choice.
Meaningful Open Worlds: If designing an Open World game, ensure the world is engaging, reactive, and offers genuine exploration opportunities, rather than just being large for the sake of it.
Strategic Release Timing:

Avoid Saturated Periods (if new): While not the strongest factor, Month of release can play a minor role. Avoiding historically quiet months (like January) or extremely crowded ones (like Holiday season) without a strong IP might be beneficial for newer titles.
In summary, developers should focus on creating a content-rich, value-driven experience, backed by a strong or emerging brand, with features that encourage engagement and longevity, all while strategically considering genre appeal and market positioning.