In [17]:
import random
from pathlib import Path
from typing import Generator
import pickle

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from difflib import get_close_matches
import matplotlib.pyplot as plt
import matplotlib
from sklearn import preprocessing
import matplotlib as mpl

In [2]:
data = Path('data')
dataset_path = data / 'vgsales.csv'
drop_sales = True

In [3]:
df = pd.read_csv(dataset_path)
df = df[df['Year'].notna()]
df

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


In [19]:
df_processed = pd.get_dummies(df, columns=['Platform', 'Genre', 'Publisher'])
df_processed = df_processed.drop(columns=['Name', 'Rank'])
if drop_sales:
    df_processed = df_processed.drop(columns=['Other_Sales', 'JP_Sales', 'EU_Sales', 'NA_Sales'])
    
x = df_processed.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_processed = pd.DataFrame(x_scaled)

df_processed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,611,612,613,614,615,616,617,618,619,620
0,0.650,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.125,0.486281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.700,0.432854,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.725,0.398767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.400,0.379064,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16322,0.550,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16323,0.575,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16324,0.700,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16325,0.750,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
knn = NearestNeighbors(p=1)
knn.fit(df_processed)
    
cos_sim = cosine_similarity(df_processed)
cos_sim

array([[1.        , 0.41333348, 0.71609901, ..., 0.11581499, 0.12281835,
        0.09738328],
       [0.41333348, 1.        , 0.37533674, ..., 0.0259725 , 0.02754307,
        0.32568648],
       [0.71609901, 0.37533674, 1.        , ..., 0.41591572, 0.14504864,
        0.11500979],
       ...,
       [0.11581499, 0.0259725 , 0.41591572, ..., 1.        , 0.14889124,
        0.11805661],
       [0.12281835, 0.02754307, 0.14504864, ..., 0.14889124, 1.        ,
        0.12519552],
       [0.09738328, 0.32568648, 0.11500979, ..., 0.11805661, 0.12519552,
        1.        ]])

In [35]:
# dump
with open(data / 'knn.pkl', 'wb') as f:
    pickle.dump(knn, f)

# load 
with open(data / 'knn.pkl', 'rb') as f:
    knn = pickle.load(f)

# dump
with open(data / 'cos_sim.pkl', 'wb') as f:
    pickle.dump(cos_sim, f)

# load 
with open(data / 'cos_sim.pkl', 'rb') as f:
    cos_sim = pickle.load(f)

In [36]:
def description(array: np.ndarray) -> np.ndarray:
    print(f"shape: {array.shape}, mean: {array.mean()}, std: {array.std()}, min: {array.min()}, max: {array.max()}")
description(cos_sim)

shape: (16327, 16327), mean: 0.18682284917544034, std: 0.13690587890534453, min: 0.0, max: 1.0000000000000004


In [37]:
def normalize(matrix: np.ndarray) -> np.ndarray:
    new_matrix = matrix - matrix.min()

    new_matrix *= 1 / new_matrix.max()
    return new_matrix

cos_sim = normalize(cos_sim)
description(cos_sim)

shape: (16327, 16327), mean: 0.1868228491754403, std: 0.13690587890534447, min: 0.0, max: 1.0


In [38]:
_names = list(df['Name'].unique())
def find_names(name: str, count: int) -> list[str]:
    return get_close_matches(name, _names, n=count)

def find_name(name: str) -> str:
    found_names = find_names(name, count=1)
    if len(found_names) == 0:
        raise KeyError(f"Name not found: {name}")
    return found_names[0]

find_names("Call of Duty: Modern Warfare", count=3), find_name("Need for Speed")

(['Call of Duty: Modern Warfare 3',
  'Call of Duty: Modern Warfare 2',
  'Call of Duty 4: Modern Warfare'],
 'Need for Speed: Shift')

In [39]:
def recomend_cos(target_name: str, top: int = 10):
    idx = df[df['Name'] == target_name].index[0]
    score_series = pd.Series(cos_sim[idx]).sort_values(ascending = False)

    top = top + 1
    top_indexes = list(score_series.iloc[0:top].index)
    
    for i in top_indexes:
        yield df['Name'].iloc[i], score_series[i]
        

def recomend_knn(target_name: str, top: int = 10):
    target_idx = df[df['Name'] == target_name].index[0]
    row = df_processed.iloc[target_idx]

    distances, idxs = knn.kneighbors([row], top+1, return_distance=True)
    distances, idxs = distances[0], idxs[0]
    
    for i, idx in enumerate(idxs):
        yield df['Name'].iloc[idx], distances[i]

In [42]:
def closest_to_best(n: int, best: int) -> None:
    best_sellers = df.sort_values(by='Global_Sales', ascending=False)[:best]
    for i in range(n):
        target_name = best_sellers['Name'].iloc[np.random.randint(0, len(best_sellers))]
        print(target_name)
        for predict_name, conf in recomend_knn(target_name, top=4): # recomend_knn
            if predict_name == target_name:
                continue
            print(f'{round(conf, 5)}: {predict_name}')
        print()

closest_to_best(n=4, best=100)

Tetris
0.30122: Dr. Mario
0.34232: Alleyway
0.37687: Qix
0.40694: Yoshi

Wii Fit Plus
0.0587: Wii Fit
0.13296: Wii Sports Resort
0.24804: New Play Control! Mario Power Tennis
0.26542: Mario Sports Mix

The Legend of Zelda: Ocarina of Time
0.0991: Mario Golf
0.10125: The Legend of Zelda: Majora's Mask
0.10828: Blast Corps
0.10983: Star Wars: Shadows of the Empire

Halo 4
0.05145: Halo: Reach
0.06755: Gears of War 3
0.11348: Halo: Combat Evolved Anniversary
0.1161: Halo 3: ODST



In [48]:
def closest_to(name: str, n: int) -> None:
    target_name = find_name(name)
    print(target_name)

    for predict_name, conf in recomend_cos(target_name, top=n): # recomend_knn
        if predict_name == target_name:
            continue
        print(f'{round(conf, 5)}: {predict_name}')

closest_to(name="Super Mario", n=5)

Super Mario 64
0.99815: Donkey Kong 64
0.99806: Banjo-Kazooie
0.99803: Yoshi's Story
0.99689: Mischief Makers
0.99663: Bomberman Hero


In [49]:
closest_to(name="Call of Duty: Black Ops", n=5)

Call of Duty: Black Ops
0.99993: Call of Duty: Modern Warfare 3
0.9999: Call of Duty: Modern Warfare 2
0.99969: Call of Duty: Black Ops II
0.99893: Call of Duty: Ghosts
0.99878: Call of Duty 4: Modern Warfare


In [14]:
def best_from(count: int, field: str, name: str) -> list[str]:
    return list(df[df[field] == name].sort_values(by='Global_Sales', ascending=False).Name[:count])

best_from(count=10, field='Publisher', name='Activision')

['Call of Duty: Modern Warfare 3',
 'Call of Duty: Black Ops',
 'Call of Duty: Black Ops 3',
 'Call of Duty: Black Ops II',
 'Call of Duty: Black Ops II',
 'Call of Duty: Modern Warfare 2',
 'Call of Duty: Modern Warfare 3',
 'Call of Duty: Black Ops',
 'Call of Duty: Modern Warfare 2',
 'Call of Duty: Ghosts']

In [15]:
best_from(count=10, field='Genre', name='Strategy')

['Pokemon Stadium',
 'Warzone 2100',
 'StarCraft II: Wings of Liberty',
 'Warcraft II: Tides of Darkness',
 'Pokémon Trading Card Game',
 'Command & Conquer: Red Alert',
 'Pokémon Stadium 2',
 'Halo Wars',
 'Theme Hospital',
 'Warcraft: Orcs & Humans']

In [16]:
best_from(count=10, field='Platform', name='PC')

['The Sims 3',
 'World of Warcraft',
 'Diablo III',
 'Microsoft Flight Simulator',
 'StarCraft II: Wings of Liberty',
 'Warcraft II: Tides of Darkness',
 'Half-Life',
 'World of Warcraft: The Burning Crusade',
 'The Elder Scrolls V: Skyrim',
 'The Sims: Unleashed']