In [1]:
import random
from pathlib import Path
from typing import Generator
import pickle

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from difflib import get_close_matches
import matplotlib.pyplot as plt
import matplotlib
import matplotlib as mpl

In [2]:
data = Path('data')
dataset_path = data / 'vgsales.csv'
drop_sales = True

In [3]:
df = pd.read_csv(dataset_path)
df = df[df['Year'].notna()]
df

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


In [4]:
df_processed = pd.get_dummies(df, columns=['Platform', 'Genre', 'Publisher'])
df_processed = df_processed.drop(columns=['Name', 'Rank'])
if drop_sales:
    df_processed = df_processed.drop(columns=['Other_Sales', 'JP_Sales', 'EU_Sales', 'NA_Sales'])
df_processed

Unnamed: 0,Year,Global_Sales,Platform_2600,Platform_3DO,Platform_3DS,Platform_DC,Platform_DS,Platform_GB,Platform_GBA,Platform_GC,...,Publisher_Zushi Games,Publisher_bitComposer Games,Publisher_dramatic create,Publisher_fonfun,Publisher_iWin,Publisher_id Software,Publisher_imageepoch Inc.,Publisher_inXile Entertainment,"Publisher_mixi, Inc",Publisher_responDESIGN
0,2006.0,82.74,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1985.0,40.24,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2008.0,35.82,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,2009.0,33.00,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1996.0,31.37,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16593,2002.0,0.01,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
16594,2003.0,0.01,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
16595,2008.0,0.01,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
16596,2010.0,0.01,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [5]:
knn = NearestNeighbors(p=1)
knn.fit(df_processed)
    
cos_sim = cosine_similarity(df_processed)
cos_sim

array([[1.        , 0.99977998, 0.99972631, ..., 0.99914992, 0.99914992,
        0.99914992],
       [0.99977998, 1.        , 0.99999654, ..., 0.99979393, 0.99979393,
        0.99979418],
       [0.99972631, 0.99999654, 1.        , ..., 0.99984052, 0.99984027,
        0.99984027],
       ...,
       [0.99914992, 0.99979393, 0.99984052, ..., 1.        , 0.99999926,
        0.99999925],
       [0.99914992, 0.99979393, 0.99984027, ..., 0.99999926, 1.        ,
        0.99999925],
       [0.99914992, 0.99979418, 0.99984027, ..., 0.99999925, 0.99999925,
        1.        ]])

In [6]:
# dump
with open(data / 'cos_sim.pkl', 'wb') as f:
    pickle.dump(cos_sim, f)

# load 
with open(data / 'cos_sim.pkl', 'rb') as f:
    cos_sim = pickle.load(f)

In [7]:
def description(array: np.ndarray) -> np.ndarray:
    print(f"shape: {array.shape}, mean: {array.mean()}, std: {array.std()}, min: {array.min()}, max: {array.max()}")
description(cos_sim)

shape: (16327, 16327), mean: 0.9999986979877649, std: 1.0565496435416577e-05, min: 0.9991499141445698, max: 1.0000000000000004


In [8]:
def normalize(matrix: np.ndarray) -> np.ndarray:
    new_matrix = matrix - matrix.min()

    new_matrix *= 1 / new_matrix.max()
    return new_matrix

cos_sim = normalize(cos_sim)
description(cos_sim)

shape: (16327, 16327), mean: 0.9984683755977214, std: 0.012428740424181975, min: 0.0, max: 1.0


In [9]:
_names = list(df['Name'].unique())
def find_names(name: str, count: int) -> list[str]:
    return get_close_matches(name, _names, n=count)

def find_name(name: str) -> str:
    found_names = find_names(name, count=1)
    if len(found_names) == 0:
        raise KeyError(f"Name not found: {name}")
    return found_names[0]

find_names("Call of Duty: Modern Warfare", count=3), find_name("Need for Speed")

(['Call of Duty: Modern Warfare 3',
  'Call of Duty: Modern Warfare 2',
  'Call of Duty 4: Modern Warfare'],
 'Need for Speed: Shift')

In [10]:
def recomend_cos(target_name: str, top: int = 10):
    idx = df[df['Name'] == target_name].index[0]
    score_series = pd.Series(cos_sim[idx]).sort_values(ascending = False)

    top = top + 1
    top_indexes = list(score_series.iloc[0:top].index)
    
    for i in top_indexes:
        yield df['Name'].iloc[i], score_series[i]
        

def recomend_knn(target_name: str, top: int = 10):
    target_idx = df[df['Name'] == target_name].index[0]
    row = df_processed.iloc[target_idx]

    distances, idxs = knn.kneighbors([row], top+1, return_distance=True)
    distances, idxs = distances[0], idxs[0]
    
    for i, idx in enumerate(idxs):
        yield df['Name'].iloc[idx], distances[i]

In [11]:
def closest_to_best(n: int, best: int) -> None:
    best_sellers = df.sort_values(by='Global_Sales', ascending=False)[:best]
    for i in range(n):
        target_name = best_sellers['Name'].iloc[np.random.randint(0, len(best_sellers))]
        print(target_name)
        for predict_name, conf in recomend_cos(target_name, top=4): # recomend_knn
            if predict_name == target_name:
                continue
            print(f'{round(conf, 5)}: {predict_name}')
        print()

closest_to_best(n=4, best=100)

Super Mario 3D Land
0.99986: New Super Mario Bros. 2
0.9997: Super Mario All-Stars
0.99969: Super Mario 64
0.99967: Pokemon Omega Ruby/Pokemon Alpha Sapphire

Call of Duty: Modern Warfare 2
0.99999: Call of Duty: Black Ops II
0.99982: Call of Duty: Black Ops
0.99978: Call of Duty: Modern Warfare 3
0.99971: Call of Duty: Modern Warfare 3

Grand Theft Auto III
0.9995: Grand Theft Auto V
0.99919: Pokemon HeartGold/Pokemon SoulSilver
0.99912: Super Smash Bros. Brawl
0.99911: Call of Duty: Modern Warfare 3

Pokemon Ruby/Pokemon Sapphire
0.99966: Pokemon Black/Pokemon White
0.9995: Pokémon Yellow: Special Pikachu Edition
0.99937: Brain Age 2: More Training in Minutes a Day
0.99934: Pokemon X/Pokemon Y



In [22]:
def closest_to(name: str, n: int) -> None:
    target_name = find_name(name)
    print(target_name)

    for predict_name, conf in recomend_cos(target_name, top=n): # recomend_knn
        if predict_name == target_name:
            continue
        print(f'{round(conf, 5)}: {predict_name}')

closest_to(name="Super Mario", n=5)

Super Mario 64
0.99968: Super Mario Galaxy
0.9995: Super Mario 3D Land
0.99945: Super Mario All-Stars
0.99941: Pokemon HeartGold/Pokemon SoulSilver
0.99941: Mario Kart 7


In [21]:
closest_to(name="Call of Duty: Black Ops", n=5)

Call of Duty: Black Ops
1.0: Call of Duty: Modern Warfare 3
0.99988: Call of Duty: Black Ops II
0.99982: Call of Duty: Modern Warfare 2
0.99968: Call of Duty: Black Ops 3
0.99965: Call of Duty: Black Ops II


In [14]:
def best_from(count: int, field: str, name: str) -> list[str]:
    return list(df[df[field] == name].sort_values(by='Global_Sales', ascending=False).Name[:count])

best_from(count=10, field='Publisher', name='Activision')

['Call of Duty: Modern Warfare 3',
 'Call of Duty: Black Ops',
 'Call of Duty: Black Ops 3',
 'Call of Duty: Black Ops II',
 'Call of Duty: Black Ops II',
 'Call of Duty: Modern Warfare 2',
 'Call of Duty: Modern Warfare 3',
 'Call of Duty: Black Ops',
 'Call of Duty: Modern Warfare 2',
 'Call of Duty: Ghosts']

In [15]:
best_from(count=10, field='Genre', name='Strategy')

['Pokemon Stadium',
 'Warzone 2100',
 'StarCraft II: Wings of Liberty',
 'Warcraft II: Tides of Darkness',
 'Pokémon Trading Card Game',
 'Command & Conquer: Red Alert',
 'Pokémon Stadium 2',
 'Halo Wars',
 'Theme Hospital',
 'Warcraft: Orcs & Humans']

In [16]:
best_from(count=10, field='Platform', name='PC')

['The Sims 3',
 'World of Warcraft',
 'Diablo III',
 'Microsoft Flight Simulator',
 'StarCraft II: Wings of Liberty',
 'Warcraft II: Tides of Darkness',
 'Half-Life',
 'World of Warcraft: The Burning Crusade',
 'The Elder Scrolls V: Skyrim',
 'The Sims: Unleashed']