In [20]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
pd.set_option('display.max_columns', 500)

In [26]:
data = pd.read_csv('./vgsales_ml.csv')
data.head()

Unnamed: 0,Name,Rank,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,Wii Sports,1,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,Super Mario Bros.,2,NES,1985.0,Pltform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,Mario Kart Wii,3,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,Wii Sports Resort,4,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,Pokemon Red/Pokemon Blue,5,GB,1996.0,Role_Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [27]:
len(data.Genre.unique())

12

In [28]:
data.Genre.value_counts()

Action          3251
Sports          2304
Misc            1686
Role_Playing    1470
Shooter         1282
Adventure       1274
Racing          1225
Pltform          875
Simulation       848
Fighting         836
Strategy         670
Puzzle           570
Name: Genre, dtype: int64

In [29]:
data = data[['Rank','Year','Genre']]
data.head()

Unnamed: 0,Rank,Year,Genre
0,1,2006.0,Sports
1,2,1985.0,Pltform
2,3,2008.0,Racing
3,4,2009.0,Sports
4,5,1996.0,Role_Playing


In [6]:
kmeans = KMeans(n_clusters=4)

In [7]:
datadumm = pd.get_dummies(data=data, columns=['Genre'])
datadumm.head()

Unnamed: 0,Rank,Year,Genre_Action,Genre_Adventure,Genre_Fighting,Genre_Misc,Genre_Pltform,Genre_Puzzle,Genre_Racing,Genre_Role_Playing,Genre_Shooter,Genre_Simulation,Genre_Sports,Genre_Strategy
0,1,2006.0,0,0,0,0,0,0,0,0,0,0,1,0
1,2,1985.0,0,0,0,0,1,0,0,0,0,0,0,0
2,3,2008.0,0,0,0,0,0,0,1,0,0,0,0,0
3,4,2009.0,0,0,0,0,0,0,0,0,0,0,1,0
4,5,1996.0,0,0,0,0,0,0,0,1,0,0,0,0


In [8]:
data_group = datadumm.groupby(['Rank','Year']).sum()
data_group.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Genre_Action,Genre_Adventure,Genre_Fighting,Genre_Misc,Genre_Pltform,Genre_Puzzle,Genre_Racing,Genre_Role_Playing,Genre_Shooter,Genre_Simulation,Genre_Sports,Genre_Strategy
Rank,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,2006.0,0,0,0,0,0,0,0,0,0,0,1,0
2,1985.0,0,0,0,0,1,0,0,0,0,0,0,0
3,2008.0,0,0,0,0,0,0,1,0,0,0,0,0
4,2009.0,0,0,0,0,0,0,0,0,0,0,1,0
5,1996.0,0,0,0,0,0,0,0,1,0,0,0,0


In [9]:
data_group.reset_index(level=['Year'], inplace=True)

In [10]:
data_group.Year.value_counts().head()

2009.0    1431
2008.0    1428
2010.0    1257
2007.0    1201
2011.0    1136
Name: Year, dtype: int64

In [11]:
pca = PCA(n_components=4)

principalComponents = pca.fit_transform(data_group)
principalDf = pd.DataFrame(data = principalComponents
             ,columns = ['pc1', 'pc2', 'pc3', 'pc4'])
principalDf.head()

Unnamed: 0,pc1,pc2,pc3,pc4
0,0.41059,-0.414082,0.798237,0.084421
1,21.409143,0.07349,-0.184807,-0.036396
2,-1.589647,-0.161388,-0.189724,-0.158821
3,-2.589171,-0.441173,0.804753,0.078322
4,10.404203,-0.096881,-0.281697,-0.462592


In [12]:
kmeans = KMeans(n_clusters=5)
data_clusters = kmeans.fit(principalDf)
data_clusters.cluster_centers_

array([[-6.57634996e+00,  5.91335049e-02,  3.77017641e-03,
        -2.72350805e-02],
       [ 3.18767213e+00, -2.89644877e-04,  1.82525241e-02,
        -6.88659880e-03],
       [ 2.20325625e+01,  3.26215737e-01, -1.54082143e-02,
         1.69866650e-02],
       [-1.70953064e+00, -4.03412239e-02, -1.26557711e-02,
         2.74495624e-02],
       [ 9.24610873e+00, -2.14582457e-02,  4.61407240e-05,
        -2.18112258e-02]])

In [13]:
data_group['labels'] = data_clusters.fit_predict(principalDf)
data_group.reset_index('Rank', inplace=True)
data_merged = pd.merge(data, data_group[['Rank', 'labels']], on='Rank', how='outer')
data_merged.head()

Unnamed: 0,Rank,Year,Genre,labels
0,1,2006.0,Sports,2
1,2,1985.0,Pltform,4
2,3,2008.0,Racing,2
3,4,2009.0,Sports,2
4,5,1996.0,Role_Playing,1


In [16]:
data_merged.labels.value_counts()

2    6325
3    4115
0    3877
1    1712
4     262
Name: labels, dtype: int64

In [17]:
pd.crosstab(data_group.Year,data_group.labels)

labels,0,1,2,3,4
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1980.0,0,0,0,0,9
1981.0,0,0,0,0,46
1982.0,0,0,0,0,36
1983.0,0,0,0,0,17
1984.0,0,0,0,0,14
1985.0,0,0,0,0,14
1986.0,0,0,0,0,21
1987.0,0,0,0,0,16
1988.0,0,0,0,0,15
1989.0,0,0,0,0,17


In [18]:
a = data_merged.groupby(['labels']).Genre.value_counts()
b = a.to_frame("counts").reset_index()
b.set_index("Genre", inplace=True)
b.groupby('labels').counts.nlargest(5)

labels  Genre       
0       Action          1212
        Role_Playing     454
        Adventure        388
        Sports           384
        Misc             362
1       Sports           294
        Fighting         192
        Racing           180
        Role_Playing     165
        Action           154
2       Action          1114
        Misc             879
        Sports           875
        Adventure        616
        Role_Playing     531
3       Sports           718
        Action           697
        Racing           474
        Shooter          392
        Pltform          380
4       Action            74
        Pltform           42
        Shooter           33
        Sports            33
        Puzzle            25
Name: counts, dtype: int64