In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import os
os.chdir(r"C:\Users\Capta\Documents\GitHub\Game-Sales-Project")

In [6]:
path = "data/pixel_game_sales_final.csv"  
games = pd.read_csv(path)

print(games.shape)
print(games.columns)
games.head(3)

(334, 15)
Index(['steamId', 'name', 'firstReleaseDate', 'copiesSold', 'price', 'revenue',
       'avgPlaytime', 'publisherClass', 'publishers', 'developers', 'steamUrl',
       'review_count', 'review_score', 'followers', 'tags'],
      dtype='object')


Unnamed: 0,steamId,name,firstReleaseDate,copiesSold,price,revenue,avgPlaytime,publisherClass,publishers,developers,steamUrl,review_count,review_score,followers,tags
0,619820,Heroes of Hammerwatch II,2025-01-14,376913,17.99,5654229.0,21,AA,['Team17'],['Crackshell'],https://store.steampowered.com/app/619820,4566,85,32093,"['2D', 'Action', 'Action RPG', 'Action Rogueli..."
1,638510,dotAGE,2023-10-04,97326,17.59,1384600.0,17,Indie,['Michele Pirovano'],['Michele Pirovano'],https://store.steampowered.com/app/638510,2433,95,14123,"['2D', 'Board Game', 'Building', 'Choices Matt..."
2,813230,ANIMAL WELL,2024-05-09,650925,24.99,12568206.0,7,Indie,['Bigmode'],['Billy Basso'],https://store.steampowered.com/app/813230,21706,96,54687,"['2D', 'Action', 'Adventure', 'Atmospheric', '..."


In [None]:
def parse_list(x):
    if pd.isna(x):
        return []
    if isinstance(x, list):
        return x
    s = str(x).strip()
    try:
        val = ast.literal_eval(s)
        return val if isinstance(val, list) else []
    except Exception:
        return []

for col in ["tags", "publishers", "developers"]:
    if col in games.columns:
        games[col] = games[col].apply(parse_list)

print(type(games.loc[0, "tags"]), games.loc[0, "tags"][:8])
print(type(games.loc[0, "publishers"]), games.loc[0, "publishers"])

<class 'list'> ['2D', 'Action', 'Action RPG', 'Action Roguelike', 'Adventure', 'Character Customization', 'Class-Based', 'Combat']
<class 'list'> ['Team17']


In [8]:
pairs = games.copy()

pairs = pairs.explode("publishers", ignore_index=True)
pairs["publisher"] = pairs["publishers"].fillna("").astype(str).str.strip()
pairs = pairs[pairs["publisher"] != ""].drop(columns=["publishers"])

print(pairs.shape)
pairs[["steamId", "name", "publisher"]].head(10)

(386, 15)


Unnamed: 0,steamId,name,publisher
0,619820,Heroes of Hammerwatch II,Team17
1,638510,dotAGE,Michele Pirovano
2,813230,ANIMAL WELL,Bigmode
3,866020,Growtopia,Ubisoft
4,1012570,Knuckle Sandwich,Dinko
5,1012570,Knuckle Sandwich,SUPERHOT PRESENTS
6,1098770,The Crimson Diamond,Julia Minamata
7,1147860,UFO 50,Mossmouth
8,1148350,The Powder Toy,The Powder Toy Team
9,1157740,Iron Meat,Retroware


In [9]:
bad = {"na", "n/a", "none", "unknown"}
pairs = pairs[~pairs["publisher"].str.lower().isin(bad)]

In [10]:
print("Unique publishers:", pairs["publisher"].nunique())
print("Games:", pairs["steamId"].nunique())
pairs["publisher"].value_counts().head(15)

Unique publishers: 308
Games: 334


publisher
Gamersky Games          9
Gamirror Games          6
PLAYISM                 5
INSTINCT3               5
Alawar                  4
Team17                  4
Playstack               4
Raw Fury                4
Devolver Digital        4
Yogscast Games          3
Critical Bliss          3
Future Friends Games    3
HypeTrain Digital       3
KONAMI                  3
indie.io                3
Name: count, dtype: int64

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix, hstack, vstack

pairs = pairs.copy()

pairs["tag_text"] = pairs["tags"].apply(lambda x: " ".join(x))
pairs[["publisher", "tag_text"]].head()

Unnamed: 0,publisher,tag_text
0,Team17,2D Action Action RPG Action Roguelike Adventur...
1,Michele Pirovano,2D Board Game Building Choices Matter City Bui...
2,Bigmode,2D Action Adventure Atmospheric Controller Cut...
3,Ubisoft,2D Adventure Building Casual Character Customi...
4,Dinko,2D Colorful Comedy Cute Dark Exploration Fanta...


In [None]:
tfidf = TfidfVectorizer(
    min_df=3,       
    max_df=0.8,    
    ngram_range=(1,1)
)

X_tags = tfidf.fit_transform(pairs["tag_text"])

print(X_tags.shape)

(386, 299)


In [18]:
publishers = pairs["publisher"].values
unique_publishers = np.unique(publishers)

publisher_vectors = {}
publisher_counts = {}

for pub in unique_publishers:
    idx = np.where(publishers == pub)[0]

    vec = X_tags[idx].mean(axis=0)
    vec = np.asarray(vec).ravel()   

    publisher_vectors[pub] = vec
    publisher_counts[pub] = len(idx)

print("Publisher profiles:", len(publisher_vectors))

Publisher profiles: 308


In [20]:
def recommend_publishers(game_tags, top_k=5):
    tag_text = " ".join(game_tags)

    game_vec = tfidf.transform([tag_text])
    game_vec = np.asarray(game_vec.todense()).ravel()  

    scores = []
    for pub, vec in publisher_vectors.items():
        sim = cosine_similarity(
            game_vec.reshape(1, -1),
            vec.reshape(1, -1)
        )[0, 0]
        scores.append((pub, sim))

    return sorted(scores, key=lambda x: x[1], reverse=True)[:top_k]

In [21]:
test_tags = [
    "Roguelike",
    "Deckbuilding",
    "Card Game",
    "Pixel Graphics",
    "Strategy"
]

recommend_publishers(test_tags, top_k=10)

[('Yogscast Games', np.float64(0.6273831007522628)),
 ('Gamera Games', np.float64(0.6158626574917645)),
 ('BOBO GAMES', np.float64(0.5763222867281235)),
 ('iFAction Studio', np.float64(0.5763222867281235)),
 ('Hooded Horse', np.float64(0.5576259971595474)),
 ('Morax Games', np.float64(0.5536405646231961)),
 ('XO Cat', np.float64(0.5403143364416743)),
 ('The Arcade Crew', np.float64(0.5037205828037915)),
 ('Pixel Purrfect', np.float64(0.46562058337307055)),
 ('Ravenage Games', np.float64(0.4594138914469104))]

In [22]:
def recommend_publishers_df(game_tags, top_k=10):
    out = recommend_publishers(game_tags, top_k=top_k)
    recs = pd.DataFrame(out, columns=["publisher", "similarity"])
    recs["games_in_profile"] = recs["publisher"].map(publisher_counts).fillna(0).astype(int)
    return recs

recommend_publishers_df(test_tags, top_k=10)

Unnamed: 0,publisher,similarity,games_in_profile
0,Yogscast Games,0.627383,3
1,Gamera Games,0.615863,1
2,BOBO GAMES,0.576322,1
3,iFAction Studio,0.576322,1
4,Hooded Horse,0.557626,2
5,Morax Games,0.553641,1
6,XO Cat,0.540314,1
7,The Arcade Crew,0.503721,1
8,Pixel Purrfect,0.465621,1
9,Ravenage Games,0.459414,2


In [None]:
def explain_recommendations(game_tags, top_k=10, examples_per_pub=3):
    tag_text = " ".join(game_tags)
    game_vec = tfidf.transform([tag_text])  

    recs = recommend_publishers_df(game_tags, top_k=top_k)

    rows = []
    for pub in recs["publisher"]:
        pub_idx = pairs.index[pairs["publisher"] == pub].to_numpy()
        if len(pub_idx) == 0:
            rows.append((pub, []))
            continue

        sims = cosine_similarity(game_vec, X_tags[pub_idx]).ravel()

        # take top examples
        top_local = pub_idx[np.argsort(sims)[::-1][:examples_per_pub]]
        examples = pairs.loc[top_local, ["name", "steamId", "steamUrl"]].drop_duplicates().head(examples_per_pub)

        rows.append((pub, list(examples["name"].values)))

    recs["example_games"] = [ex for _, ex in rows]
    return recs

explain_recommendations(test_tags, top_k=10, examples_per_pub=3)

Unnamed: 0,publisher,similarity,games_in_profile,example_games
0,Yogscast Games,0.627383,3,"[Dungeons & Degenerate Gamblers, Stray Path, B..."
1,Gamera Games,0.615863,1,[RUNGORE]
2,BOBO GAMES,0.576322,1,[Big Winner]
3,iFAction Studio,0.576322,1,[Big Winner]
4,Hooded Horse,0.557626,2,"[He is Coming, 9 Kings]"
5,Morax Games,0.553641,1,[Villages & Dungeons]
6,XO Cat,0.540314,1,[Aotenjo: Infinite Hands]
7,The Arcade Crew,0.503721,1,[Cross Blitz]
8,Pixel Purrfect,0.465621,1,[Terracards]
9,Ravenage Games,0.459414,2,"[Heretic's Fork, Megaloot]"


In [None]:
pairs = pairs.copy()
pairs["tag_text"] = pairs["tags"].apply(lambda x: " ".join(x) if isinstance(x, list) else "")
tfidf = TfidfVectorizer(min_df=3, max_df=0.8)
X_tags = tfidf.fit_transform(pairs["tag_text"])   
print("X_tags:", X_tags.shape)

X_tags: (358, 296)


Unnamed: 0,steamId,name,firstReleaseDate,copiesSold,price,revenue,avgPlaytime,publisherClass,developers,steamUrl,review_count,review_score,followers,tags,publisher,tag_text,log_review_count,log_followers
0,619820,Heroes of Hammerwatch II,2025-01-14,376913,17.99,5654229.0,21,AA,[Crackshell],https://store.steampowered.com/app/619820,4566,85,32093,"[2D, Action, Action RPG, Action Roguelike, Adv...",Team17,2D Action Action RPG Action Roguelike Adventur...,8.426612,10.376424
1,638510,dotAGE,2023-10-04,97326,17.59,1384600.0,17,Indie,[Michele Pirovano],https://store.steampowered.com/app/638510,2433,95,14123,"[2D, Board Game, Building, Choices Matter, Cit...",Michele Pirovano,2D Board Game Building Choices Matter City Bui...,7.797291,9.555631
2,813230,ANIMAL WELL,2024-05-09,650925,24.99,12568206.0,7,Indie,[Billy Basso],https://store.steampowered.com/app/813230,21706,96,54687,"[2D, Action, Adventure, Atmospheric, Controlle...",Bigmode,2D Action Adventure Atmospheric Controller Cut...,9.985390,10.909400
4,1012570,Knuckle Sandwich,2023-11-22,9469,19.99,161206.0,8,Indie,[Andy Brophy],https://store.steampowered.com/app/1012570,658,86,3835,"[2D, Colorful, Comedy, Cute, Dark, Exploration...",Dinko,2D Colorful Comedy Cute Dark Exploration Fanta...,6.490724,8.252185
5,1012570,Knuckle Sandwich,2023-11-22,9469,19.99,161206.0,8,Indie,[Andy Brophy],https://store.steampowered.com/app/1012570,658,86,3835,"[2D, Colorful, Comedy, Cute, Dark, Exploration...",SUPERHOT PRESENTS,2D Colorful Comedy Cute Dark Exploration Fanta...,6.490724,8.252185
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,3533730,"Warhammer 40,000: Boltgun - Words of Vengeance",2025-05-22,167428,0.00,0.0,0,Indie,[Auroch Digital],https://store.steampowered.com/app/3533730,1637,87,2639,"[1990's, Action, Arcade, Arena Shooter, Atmosp...",Auroch Digital,1990's Action Arcade Arena Shooter Atmospheric...,7.401231,7.878534
381,3553210,Auto Rogue,2025-05-23,29376,6.99,167139.0,7,Indie,[定期的な宝物],https://store.steampowered.com/app/3553210,526,93,1928,"[2D, Auto Battler, Automation, Choices Matter,...",定期的な宝物,2D Auto Battler Automation Choices Matter Comb...,6.267201,7.564757
383,3585630,this game will end in 205 clicks.,2025-08-05,10342,0.00,157.0,0,Indie,[insertdisc5],https://store.steampowered.com/app/3585630,528,98,240,"[Anime, Casual, Colorful, Free to Play, Indie,...",insertdisc5,Anime Casual Colorful Free to Play Indie Inter...,6.270988,5.484797
384,3689520,Bioprototype,2025-05-01,69676,4.99,292747.0,4,Indie,[Emprom Game],https://store.steampowered.com/app/3689520,928,91,2040,"[2D, Action, Auto Battler, Automation, Deckbui...",Emprom Game,2D Action Auto Battler Automation Deckbuilding...,6.834109,7.621195


In [None]:

pairs["price"] = pd.to_numeric(pairs["price"], errors="coerce")
pairs["price"] = pairs["price"].fillna(pairs["price"].median())

scaler_price = StandardScaler(with_mean=False)
X_price = scaler_price.fit_transform(pairs[["price"]].to_numpy())  
X_price = csr_matrix(X_price)

PRICE_WEIGHT = 0.35  
X_all = hstack([X_tags, PRICE_WEIGHT * X_price]).tocsr()

print("X_all (tags+price):", X_all.shape)


X_all (tags+price): (358, 297)


In [None]:
publishers = pairs["publisher"].astype(str).values
unique_publishers, counts = np.unique(publishers, return_counts=True)
publisher_counts = dict(zip(unique_publishers, counts))

MIN_GAMES = 3
kept_publishers = [p for p in unique_publishers if publisher_counts[p] >= MIN_GAMES]

profiles = []
for pub in kept_publishers:
    idx = np.where(publishers == pub)[0]
    vec = X_all[idx].sum(axis=0) / len(idx)
    profiles.append(csr_matrix(vec))

P = vstack(profiles).tocsr()  

print("Publishers kept:", len(kept_publishers))
print("P:", P.shape)


Publishers kept: 16
P: (16, 297)


In [None]:
def recommend_publishers_tags_price(game_tags, price=None, top_k=10):
    tag_text = " ".join(game_tags)
    v_tags = tfidf.transform([tag_text])

    price_val = pairs["price"].median() if price is None else float(price)
    v_price = scaler_price.transform([[price_val]])
    v_price = csr_matrix(v_price)

    v_all = hstack([v_tags, PRICE_WEIGHT * v_price]).tocsr()

    sims = cosine_similarity(v_all, P).ravel()
    top_idx = np.argsort(sims)[::-1][:top_k]

    return [(kept_publishers[i], float(sims[i]), int(publisher_counts[kept_publishers[i]])) for i in top_idx]

In [56]:
def confidence_label(n):
    if n >= 6:
        return "High"
    elif n >= 3:
        return "Medium"
    else:
        return "Exploratory"

out = recommend_publishers_tags_price(test_tags, price=14.99, top_k=10)
df_out = pd.DataFrame(out, columns=["publisher", "similarity", "games_in_profile"])
df_out["confidence"] = df_out["games_in_profile"].apply(confidence_label)
df_out

Unnamed: 0,publisher,similarity,games_in_profile,confidence
0,Yogscast Games,0.732454,3,Medium
1,Gamersky Games,0.608013,9,High
2,INSTINCT3,0.594314,5,Medium
3,Raw Fury,0.584635,4,Medium
4,Playstack,0.553122,4,Medium
5,Gamirror Games,0.51842,6,High
6,HypeTrain Digital,0.514308,3,Medium
7,Alawar,0.462459,4,Medium
8,indie.io,0.457984,3,Medium
9,Team17,0.431219,4,Medium


In [None]:
def explain_publishers_tags_price(game_tags, price, top_k=10, examples_per_pub=3):
    recs = recommend_publishers_tags_price(game_tags, price=price, top_k=top_k)
    recs_df = pd.DataFrame(recs, columns=["publisher", "similarity", "games_in_profile"])

    tag_text = " ".join(game_tags)
    v_tags = tfidf.transform([tag_text])

    v_price = scaler_price.transform([[float(price)]])
    v_price = csr_matrix(v_price)

    v_all = hstack([v_tags, PRICE_WEIGHT * v_price]).tocsr()

    example_lists = []
    for pub in recs_df["publisher"]:
        idx = pairs.index[pairs["publisher"] == pub].to_numpy()
        if len(idx) == 0:
            example_lists.append([])
            continue

        sims = cosine_similarity(v_all, X_all[idx]).ravel()
        top_local = idx[np.argsort(sims)[::-1]]

        ex = (
            pairs.loc[top_local, ["steamId", "name", "steamUrl", "price"]]
                 .drop_duplicates(subset=["steamId"])
                 .head(examples_per_pub)
        )
        
        example_lists.append(ex.to_dict(orient="records"))

    recs_df["examples"] = example_lists
    return recs_df


In [None]:
test_tags = ["Roguelike", "Deckbuilding", "Card Game", "Pixel Graphics", "Strategy"]
def pretty_print_explanations(df):
    for _, row in df.iterrows():
        print(f"\n{row['publisher']}  |  sim={row['similarity']:.3f}  |  games={row['games_in_profile']}")
        for ex in row["examples"]:
            print(f"  - {ex['name']} (${ex['price']})")
            print(f"    {ex['steamUrl']}")

df_exp = explain_publishers_tags_price(test_tags, price=14.99, top_k=10, examples_per_pub=3)
pretty_print_explanations(df_exp)


Yogscast Games  |  sim=0.732  |  games=3
  - Border Pioneer ($14.99)
    https://store.steampowered.com/app/2346410
  - Stray Path ($14.99)
    https://store.steampowered.com/app/2531940
  - Dungeons & Degenerate Gamblers ($14.99)
    https://store.steampowered.com/app/2400510

Gamersky Games  |  sim=0.608  |  games=9
  - Dwarves: Glory, Death and Loot ($14.99)
    https://store.steampowered.com/app/2205850
  - I Am Legion: Stand Survivors ($8.99)
    https://store.steampowered.com/app/3109580
  - Seer's Gambit ($14.99)
    https://store.steampowered.com/app/2219450

INSTINCT3  |  sim=0.594  |  games=5
  - Megaloot ($9.99)
    https://store.steampowered.com/app/2440380
  - Asgard's Fall — Viking Survivors ($6.99)
    https://store.steampowered.com/app/2780710
  - Die in the Dungeon ($12.99)
    https://store.steampowered.com/app/2026820

Raw Fury  |  sim=0.585  |  games=4
  - Moonstone Island ($19.99)
    https://store.steampowered.com/app/1658150
  - Pizza Possum ($6.99)
    https://

In [None]:
pairs.to_csv("pixel_games_sales_pairs.csv", index=False)