In [1]:
import random
from pathlib import Path

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import matplotlib as mpl

In [2]:
dataset_path = './data/vgsales.csv'
drop_sales = True

In [31]:
df = pd.read_csv(dataset_path)
df = df[df['Year'].notna()]
df

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


In [39]:
df_processed = pd.get_dummies(df, columns=['Platform', 'Genre', 'Publisher'])
df_processed = df_processed.drop(columns=['Name', 'Rank'])
if drop_sales:
    df_processed = df_processed.drop(columns=['Global_Sales', 'Other_Sales', 'JP_Sales', 'EU_Sales', 'NA_Sales'])
df_processed

Unnamed: 0,Year,Platform_2600,Platform_3DO,Platform_3DS,Platform_DC,Platform_DS,Platform_GB,Platform_GBA,Platform_GC,Platform_GEN,...,Publisher_Zushi Games,Publisher_bitComposer Games,Publisher_dramatic create,Publisher_fonfun,Publisher_iWin,Publisher_id Software,Publisher_imageepoch Inc.,Publisher_inXile Entertainment,"Publisher_mixi, Inc",Publisher_responDESIGN
0,2006.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1985.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2008.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,2009.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1996.0,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16593,2002.0,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
16594,2003.0,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
16595,2008.0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
16596,2010.0,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [40]:
knn = NearestNeighbors(n_neighbors=5, radius=10)
knn.fit(df_processed)
    
# cos_sim = cosine_similarity(df_processed)
# cos_sim

In [41]:
def description(array: np.ndarray) -> np.ndarray:
    print(f"shape: {array.shape}, mean: {array.mean()}, std: {array.std()}, min: {array.min()}, max: {array.max()}")
# description(cos_sim)

In [42]:
def heatmap(array: np.ndarray) -> np.ndarray:
    fig, ax = plt.subplots()
    # im = ax.imshow()
    
    ax.set_xticks(np.arange(len(df)), labels=df['Name'])
    ax.set_yticks(np.arange(len(df)), labels=df['Name'])
    
    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
    
    # Loop over data dimensions and create text annotations.
    for i, vec in enumerate(cos_sim):
        for j, x in enumerate(vec):
            text = ax.text(j, i, x, ha="center", va="center", color="w")
    
    ax.set_title("Harvest of local farmers (in tons/year)")
    fig.tight_layout()
    plt.show()
# heatmap(cos_sim)

In [43]:
def normalize(matrix: np.ndarray) -> np.ndarray:
    new_matrix = matrix - matrix.min()

    new_matrix *= 1 / new_matrix.max()
    return new_matrix

# cos_sim = normalize(cos_sim)
# description(cos_sim)

In [44]:
def recomend(name: str, top: int = 10):
    # print(df[df['Name'] == name])
    idx = df[df['Name'] == name].index[0]
    score_series = pd.Series(cos_sim[idx]).sort_values(ascending = False)

    top = top + 1
    top_indexes = list(score_series.iloc[0:top].index)
    
    for i in top_indexes:
        yield df['Name'].iloc[i], score_series[i]
        

def recomend_knn(target_name: str, top: int = 10):
    target_idx = df[df['Name'] == target_name].index[0]
    row = df_processed.iloc[target_idx]

    distances, idxs = knn.kneighbors([row], top+1, return_distance=True)
    distances, idxs = distances[0], idxs[0]
    
    for i, idx in enumerate(idxs):
        yield df['Name'].iloc[idx], distances[i]

    return []
    
best_sellers = df.sort_values(by='Global_Sales', ascending=False)[:200]
for i in range(10):
    target_name = best_sellers['Name'].iloc[np.random.randint(0, len(best_sellers))]
    print(target_name)
    for predict_name, conf in recomend_knn(target_name, top=4):
        if predict_name == target_name:
            continue
        print(f'{round(conf, 5)}: {predict_name}')
    print()

Gran Turismo
0.0: Ray Tracers
0.0: Turbo Prop Racing
0.0: Rally Cross
0.0: Jet Moto 2

Call of Duty 4: Modern Warfare
0.0: The History Channel: Battle for the Pacific
0.0: Soldier of Fortune: Payback
1.0: The History Channel: Civil War - A Nation Divided
1.0: Call of Duty: World at War

Gears of War 3
0.0: Halo: Combat Evolved Anniversary
0.0: The Gunstringer
1.0: Halo: Reach
1.0: Crackdown 2

GoldenEye 007
0.0: Star Fox 64
1.41421: Blast Corps
1.41421: Yoshi's Story
1.41421: Doom 64

Pokemon Stadium
0.0: Command & Conquer
1.0: Pokémon Stadium 2
1.0: Starcraft 64
1.0: Pocket Monsters Stadium

Call of Duty: Modern Warfare 3
0.0: Goldeneye 007: Reloaded
1.0: Call of Duty: Black Ops
1.0: Call of Duty: Black Ops II
1.0: James Bond 007: Blood Stone

New Super Mario Bros.
0.0: Densetsu no Stafi 4
0.0: Yoshi's Island DS
0.0: Kirby Squeak Squad
1.0: Donkey Kong Jungle Climber

Super Mario Bros. 3
0.0: Donkey Kong Classics
0.0: Super Mario Bros. 2
1.41421: Tetris
1.41421: Ice Hockey

Grand Thef

