In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, Normalizer, KBinsDiscretizer
from pycaret.arules import *
from tensorflow.keras.utils import to_categorical

In [2]:
# Folder that holds the dataset. It defaults to the original author's location
# so the notebook behaves as before; set the VGSALES_DATA_DIR environment
# variable to run it on another machine without editing the code.
DATA_DIR = Path(os.environ.get('VGSALES_DATA_DIR', 'C:/Users/tnort/Documents/Datasets'))

# Load the video-game sales table and preview the first rows.
sales = pd.read_csv(DATA_DIR / 'vgsales.csv')
sales.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [3]:
# Count missing values per column to see which fields need cleaning.
sales.isnull().sum()

Rank              0
Name              0
Platform          0
Year            271
Genre             0
Publisher        58
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64

In [4]:
# Summary statistics (count, mean, std, quartiles, extremes) of the numeric columns.
sales.describe()

Unnamed: 0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,16598.0,16327.0,16598.0,16598.0,16598.0,16598.0,16598.0
mean,8300.605254,2006.406443,0.264667,0.146652,0.077782,0.048063,0.537441
std,4791.853933,5.828981,0.816683,0.505351,0.309291,0.188588,1.555028
min,1.0,1980.0,0.0,0.0,0.0,0.0,0.01
25%,4151.25,2003.0,0.0,0.0,0.0,0.0,0.06
50%,8300.5,2007.0,0.08,0.02,0.0,0.01,0.17
75%,12449.75,2010.0,0.24,0.11,0.04,0.04,0.47
max,16600.0,2020.0,41.49,29.02,10.22,10.57,82.74


In [5]:
# Column dtypes and non-null counts for the whole frame.
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16598 non-null  int64  
 1   Name          16598 non-null  object 
 2   Platform      16598 non-null  object 
 3   Year          16327 non-null  float64
 4   Genre         16598 non-null  object 
 5   Publisher     16540 non-null  object 
 6   NA_Sales      16598 non-null  float64
 7   EU_Sales      16598 non-null  float64
 8   JP_Sales      16598 non-null  float64
 9   Other_Sales   16598 non-null  float64
 10  Global_Sales  16598 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB


In [6]:
# (rows, columns) of the frame at this point in the notebook.
sales.shape

(16598, 11)

In [7]:
# Discard, in place, every row that lacks a release year or a publisher.
required_columns = ['Year', 'Publisher']
sales.dropna(subset=required_columns, inplace=True)

In [8]:
# (rows, columns) again, to see how many rows are left at this point.
sales.shape

(16291, 11)

In [9]:
# Re-count missing values per column to check what is still missing.
sales.isnull().sum()

Rank            0
Name            0
Platform        0
Year            0
Genre           0
Publisher       0
NA_Sales        0
EU_Sales        0
JP_Sales        0
Other_Sales     0
Global_Sales    0
dtype: int64

In [10]:
# Single-column frames for the five sales figures (discretised in a later cell).
NA_Sales, EU_Sales, JP_Sales, Other_Sales, Global_Sales = (
    sales[[sales_column]]
    for sales_column in ('NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales')
)

# One-hot encode each categorical column into a dense 0/1 matrix.
# The same encoder is re-fitted for every column, in this order, so after the
# cell runs `enc` only remembers the categories of 'Publisher'.
enc = OneHotEncoder()
Platforms, Genres, Publishers = (
    enc.fit_transform(sales[[categorical_column]]).toarray()
    for categorical_column in ('Platform', 'Genre', 'Publisher')
)


In [11]:
# Print each one-hot matrix under its banner; a blank-line separator goes
# between consecutive sections (but not after the last one).
sections = (
    ('-----------------Platforms transformed---------------------------\n\n', Platforms),
    ('-----------------Genres transformed---------------------------\n\n', Genres),
    ('-----------------Publishers transformed---------------------------\n\n', Publishers),
)

pieces = []
for banner, matrix in sections:
    pieces += [banner, matrix, '\n\n']

print(*pieces[:-1])

-----------------Platforms transformed---------------------------

 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] 

 -----------------Genres transformed---------------------------

 [[0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]] 

 -----------------Publishers transformed---------------------------

 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [12]:
# Discretise every sales figure into (up to) five bins, one-hot encoded.
kbin = KBinsDiscretizer(n_bins=5, encode='onehot')


def _onehot_bins(column_frame):
    """Re-fit the shared `kbin` on one single-column frame and return dense bin indicators."""
    return kbin.fit_transform(column_frame).toarray()


na_bin = _onehot_bins(NA_Sales)
eu_bin = _onehot_bins(EU_Sales)
jp_bin = _onehot_bins(JP_Sales)
other_bin = _onehot_bins(Other_Sales)
global_bin = _onehot_bins(Global_Sales)

In [13]:
# Inspect the one-hot bin indicators produced for Global_Sales.
global_bin

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.]])

In [14]:
# Every encoded matrix has one column per category/bin, so it cannot be stored
# in a single DataFrame column (depending on the pandas version the original
# `sales[col] = matrix` either raises or keeps only part of the matrix).
# Expand each matrix into its own prefixed columns instead, e.g. 'Platform_0',
# 'Platform_1', ...
encoded_matrices = {
    'Platform': Platforms,
    'Genre': Genres,
    'Publisher': Publishers,
    'NA_Sales': na_bin,
    'EU_Sales': eu_bin,
    'JP_Sales': jp_bin,
    'Other_Sales': other_bin,
    'Global_Sales': global_bin,
}

# The matrices are positional, while `sales` kept its original row labels after
# the incomplete rows were dropped; reuse that index so the concat lines rows up.
encoded_frames = [
    pd.DataFrame(matrix, index=sales.index).add_prefix(f'{column}_')
    for column, matrix in encoded_matrices.items()
]

# Build a separate encoded frame and leave `sales` untouched, so this cell can
# be re-run safely and the original columns stay available downstream.
sales_encoded = pd.concat(
    [sales.drop(columns=list(encoded_matrices)), *encoded_frames],
    axis=1,
)

In [15]:
# Register the data with pycaret's association-rules module.
# The result is bound to its own name: the original cell rebound `sales` to
# whatever setup() returns, which discarded the DataFrame and meant that
# re-running the cell passed that return value back in as `data`.
#
# NOTE(review): setup() reports as many transactions as the frame has rows, so
# each 'Rank' transaction appears to contain exactly one 'Name' item. With
# single-item transactions there is nothing to associate - confirm, and choose
# a transaction_id that groups several items before mining rules.
arules_setup = setup(data = sales, transaction_id = 'Rank', item_id = 'Name')

Description,Value
session_id,5588.0
# Transactions,16291.0
# Items,11325.0
Ignore Items,


In [16]:
# Build the association-rules model from the data registered by setup().
# NOTE(review): the recorded run of this cell ended with
# "ValueError: cannot call `vectorize` on size 0 inputs unless `otypes` is set".
# Presumably no itemset was frequent enough because every transaction holds a
# single item - confirm against the transaction_id / item_id passed to setup().
arules = create_model()

ValueError: cannot call `vectorize` on size 0 inputs unless `otypes` is set

In [None]:
# Plot the model with pycaret's '3d' plot type; needs `arules` from the
# create_model() cell, which has to succeed first.
plot_model(arules, plot = '3d')