In [66]:
import pandas as pd
import pandas_profiling
import seaborn as sb
import matplotlib.pyplot as plt 
import numpy as np
import plotly.express as pltx

In [67]:
# Load the sales table from the CSV file in the working directory.
rawdata = pd.read_csv(filepath_or_buffer='vgsales.csv')

In [68]:
# Remove incomplete rows, then remove rows whose Global_Sales value lies
# outside the 1.5 * IQR fences around the first and third quartiles.
complete_rows = rawdata.dropna()

# One-column frame, so the quantiles below are Series keyed by column name.
target = complete_rows[['Global_Sales']].copy()

Q1 = target.quantile(0.25)
Q3 = target.quantile(0.75)

# Compute the inter-quartile range and both fences once.
iqr = Q3 - Q1
lower_fence = Q1 - 1.5 * iqr
upper_fence = Q3 + 1.5 * iqr

# True wherever a value falls below the lower or above the upper fence.
rule = (target < lower_fence) | (target > upper_fence)

# Collapse to one boolean per row (only one column is checked here).
outliers = rule.any(axis=1)

# Index labels of the flagged rows, kept for inspection.
outlierindices = outliers.index[outliers]

# Select the non-outlier rows into an independent copy instead of dropping
# in place on a frame derived from `rawdata`; this avoids mutating a possible
# view and keeps the cell safe to re-run.
newdata = complete_rows.loc[~outliers].copy()

In [69]:
# Discard the rank and the per-region sales columns; the cells visible below
# only use the descriptive columns and Global_Sales.
data = newdata.drop(
    labels=['Rank', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales'],
    axis='columns',
)

In [70]:
# Preview the first five rows of the reduced frame.
data.head(n=5)

Unnamed: 0,Name,Platform,Year,Genre,Publisher,Global_Sales
1847,Guitar Hero 5,PS3,2009.0,Misc,Activision,1.1
1848,Megamania,Atari2600,1981.0,Shooter,Activision,1.1
1849,Medal of Honor: Airborne,X360,2007.0,Shooter,Electronic Arts,1.1
1850,The Legend of Zelda: Tri Force Heroes,3DS,2015.0,Action,Nintendo,1.1
1851,Jungle Hunt,Atari2600,1982.0,Platform,Atari,1.1


In [71]:
from sklearn import preprocessing

# Use one encoder per column. A LabelEncoder only remembers the classes of its
# most recent fit, so sharing a single instance across the three columns would
# leave the Platform and Genre codes with no way to be mapped back to labels.
platform_encoder = preprocessing.LabelEncoder()
genre_encoder = preprocessing.LabelEncoder()
publisher_encoder = preprocessing.LabelEncoder()

# Integer codes for each categorical column, in row order of `data`.
platnum = platform_encoder.fit_transform(data['Platform'])
gennum = genre_encoder.fit_transform(data['Genre'])
pubnum = publisher_encoder.fit_transform(data['Publisher'])

# `le` was previously the single shared encoder and ended up fitted on
# Publisher; keep the name bound to that state for any code still using it.
le = publisher_encoder

In [72]:
# Attach the integer codes from the label encoders and scale Global_Sales by
# 1000 (presumably converting the unit of the CSV column — confirm against the
# data source).
# The scaled column is computed from `newdata`, not from `data` itself, so
# running this cell more than once does not multiply the sales repeatedly.
# The Series is aligned to `data` by index label; both frames share the index
# because `data` was built from `newdata` by dropping columns only.
data = data.assign(
    encoded_Platform=platnum,
    encoded_Genre=gennum,
    encoded_Publisher=pubnum,
    Global_Sales=newdata['Global_Sales'] * 1000,
)

# Bare expression so the notebook renders the (truncated) frame richly.
data


                                                   Name   Platform    Year  \
1847                                      Guitar Hero 5        PS3  2009.0   
1848                                          Megamania  Atari2600  1981.0   
1849                           Medal of Honor: Airborne       X360  2007.0   
1850              The Legend of Zelda: Tri Force Heroes        3DS  2015.0   
1851                                        Jungle Hunt  Atari2600  1982.0   
...                                                 ...        ...     ...   
16591                Woody Woodpecker in Crazy Castle 5        GBA  2002.0   
16592                     Men in Black II: Alien Escape         GC  2003.0   
16593  SCORE International Baja 1000: The Official Game        PS2  2008.0   
16594                                        Know How 2         DS  2010.0   
16595                                  Spirits & Spells        GBA  2003.0   

          Genre        Publisher  Global_Sales  encoded_Platfor

In [73]:
# Number of rows per integer platform code, most frequent first.
data['encoded_Platform'].value_counts(ascending=False)

4     2009
16    1835
26    1156
19    1150
17    1088
28    1029
15    1014
13     876
29     756
6      723
7      500
1      454
20     403
10     270
18     268
23     194
21     171
30     171
27     127
2       98
3       47
5       46
11      32
8       16
12      12
25       6
22       5
0        3
24       2
9        1
14       1
Name: encoded_Platform, dtype: int64

In [74]:
# Number of rows per integer genre code, most frequent first.
data['encoded_Genre'].value_counts(ascending=False)

0     2881
10    2042
3     1536
7     1281
1     1238
6     1069
8     1055
9      769
2      732
4      699
11     642
5      519
Name: encoded_Genre, dtype: int64

In [75]:
# Number of rows per integer publisher code, most frequent first.
data['encoded_Publisher'].value_counts(ascending=False)

136    1043
344     885
21      822
518     820
272     768
       ... 
371       1
332       1
402       1
426       1
517       1
Name: encoded_Publisher, Length: 568, dtype: int64

In [76]:
import plotly.express as pltx

## Platform vs Sales

In [91]:
# One point per game: platform on the x axis, global sales on the y axis.
pltx.scatter(data_frame=data, x='Platform', y='Global_Sales')

In [78]:
# Number of rows per platform label, most frequent first.
data['Platform'].value_counts(ascending=False)

DS           2009
PS2          1835
Wii          1156
PSP          1150
PS3          1088
X360         1029
PS           1014
PC            876
XB            756
GBA           723
GC            500
3DS           454
PSV           403
N64           270
PS4           268
SNES          194
SAT           171
XOne          171
WiiU          127
Atari2600      98
DC             47
GB             46
NES            32
GEN            16
NG             12
WS              6
SCD             5
3DO             3
TG16            2
GG              1
PCFX            1
Name: Platform, dtype: int64

## Year vs Sales

In [92]:
# Global sales against release year, with an ordinary-least-squares trendline
# overlaid by Plotly Express.
pltx.scatter(data_frame=data, x='Year', y='Global_Sales', trendline='ols')

In [80]:
# Number of rows per release year, most frequent first.
data['Year'].value_counts(ascending=False)

2009.0    1331
2008.0    1285
2010.0    1146
2007.0    1077
2011.0    1035
2006.0     927
2005.0     857
2002.0     740
2003.0     688
2004.0     645
2012.0     584
2015.0     556
2014.0     504
2013.0     464
2001.0     401
2016.0     328
1998.0     312
2000.0     302
1999.0     285
1997.0     240
1996.0     225
1995.0     201
1994.0     101
1993.0      46
1981.0      37
1982.0      31
1991.0      31
1992.0      24
1983.0      11
1987.0       9
1985.0       9
1989.0       6
1980.0       6
1986.0       5
1984.0       4
1988.0       4
2017.0       3
1990.0       2
2020.0       1
Name: Year, dtype: int64

## Genre vs Sales

In [93]:
# One point per game: genre on the x axis, global sales on the y axis.
pltx.scatter(data_frame=data, x='Genre', y='Global_Sales')

In [82]:
# Number of rows per genre label, most frequent first.
data['Genre'].value_counts(ascending=False)

Action          2881
Sports          2042
Misc            1536
Role-Playing    1281
Adventure       1238
Racing          1069
Shooter         1055
Simulation       769
Fighting         732
Platform         699
Strategy         642
Puzzle           519
Name: Genre, dtype: int64

## Publisher vs Sales

In [94]:
# One point per game: publisher on the x axis, global sales on the y axis.
pltx.scatter(data_frame=data, x='Publisher', y='Global_Sales')

In [84]:
# Number of rows per publisher label, most frequent first.
data['Publisher'].value_counts(ascending=False)

Electronic Arts                 1043
Namco Bandai Games               885
Activision                       822
Ubisoft                          820
Konami Digital Entertainment     768
                                ... 
On Demand                          1
Monte Christo Multimedia           1
Pony Canyon                        1
Riverhillsoft                      1
UIG Entertainment                  1
Name: Publisher, Length: 568, dtype: int64

## Year vs Platform

In [95]:
# Which platforms have games in which release years (800 x 800 px figure).
pltx.scatter(
    data_frame=data,
    x='Year',
    y='Platform',
    width=800,
    height=800,
)

## Year vs Publisher

In [96]:
# Which publishers have games in which release years (800 x 800 px figure).
pltx.scatter(
    data_frame=data,
    x='Year',
    y='Publisher',
    width=800,
    height=800,
)

## Year vs Genre

In [97]:
# Which genres have games in which release years (800 x 800 px figure).
pltx.scatter(
    data_frame=data,
    x='Year',
    y='Genre',
    width=800,
    height=800,
)

## Platform vs Publisher

In [98]:
# Platform / publisher combinations present in the data (1000 x 1000 px figure).
pltx.scatter(
    data_frame=data,
    x='Platform',
    y='Publisher',
    width=1000,
    height=1000,
)

## Platform vs Genre

In [99]:
# Platform / genre combinations present in the data (1000 x 1000 px figure).
pltx.scatter(
    data_frame=data,
    x='Platform',
    y='Genre',
    width=1000,
    height=1000,
)

## Publisher vs Genre

In [100]:
# Publisher / genre combinations present in the data (1000 x 1000 px figure).
pltx.scatter(
    data_frame=data,
    x='Publisher',
    y='Genre',
    width=1000,
    height=1000,
)