# Data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
df = pd.read_csv('/kaggle/input/videogamesales/vgsales.csv')

In [None]:
# EDA

In [None]:
def eda(dfA, all=False, desc='Exploratory Data Analysis'):
    """Print a quick exploratory summary of a DataFrame.

    The report covers: shape, numeric and object column names, null counts
    and null ratios per column, the number of duplicated rows and a small
    sample of the rows that take part in a duplicate group.

    Parameters
    ----------
    dfA : pandas.DataFrame
        Frame to summarise. It is only read, never modified.
    all : bool, default False
        When True, also print ``describe()`` tables for the numeric and the
        object columns. The name shadows the builtin ``all`` inside this
        function; it is kept so existing keyword callers keep working.
    desc : str, default 'Exploratory Data Analysis'
        Heading printed before the report.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    numericCols = dfA.select_dtypes(include=np.number).columns.tolist()
    objectCols = dfA.select_dtypes(include='object').columns.tolist()

    print(desc)
    print(f'\nShape:\n{dfA.shape}')
    print('\nDTypes - Numerics')
    print(numericCols)
    print('\nDTypes - Not Numerics')
    print(objectCols)
    print(f'\nIs Null: {dfA.isnull().sum().sum()}')
    print(f'{dfA.isnull().mean().sort_values(ascending=False)}')
    dup = dfA.duplicated()
    print(f'\nDuplicated: \n{dfA[dup].shape}\n')

    # Show at most four rows that belong to a duplicate group. Capping the
    # sample size at the number of available rows avoids the ValueError that
    # DataFrame.sample raises when asked for more rows than exist, so no
    # blanket exception handler is needed.
    dupRows = dfA[dfA.duplicated(keep=False)]
    if len(dupRows):
        print(dupRows.sample(min(4, len(dupRows))))

    if all:  # put your preferred, more detailed analysis of the dataset here
        print('\nDTypes - Numerics')
        # describe(include=...) raises when no column matches, so only call
        # it for dtype groups that are actually present.
        if numericCols:
            print(dfA.describe(include=[np.number]))
        print('\nDTypes - Categoricals')
        if objectCols:
            print(dfA.describe(include=['object']))


In [None]:
eda(df)

More about NA values

In [None]:
df.sort_values(by=['Year'])['Year'].unique()

In [None]:
df[df.Year.isna()]

In [None]:
df.Publisher.unique()

In [None]:
# function Fill NaN values
def fillNaN(dfA):
    """Fill missing values column by column.

    Object/string columns receive the placeholder ``'Unknown'``; every other
    column receives ``0``.

    Parameters
    ----------
    dfA : pandas.DataFrame
        Frame to fill. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, so pass a copy if the original
        data must stay untouched.
    """
    for col in dfA:
        series = dfA[col]
        # Inspect the column's dtype. type(series) is always the Series class
        # and never equals the string 'object', which would send every column
        # down the numeric branch.
        isText = (pd.api.types.is_object_dtype(series)
                  or pd.api.types.is_string_dtype(series))
        dfA[col] = series.fillna('Unknown' if isText else 0)
    return dfA

In [None]:
# fillNaN assigns the filled columns back onto the frame it receives, so hand
# it a copy: `df` keeps the raw data and this cell can be re-run safely.
dfNew = fillNaN(df.copy())
eda(dfNew)

# More about all variables

**Encoding categorical variables as numeric IDs**

In [None]:
dfNew.describe()

In [None]:
dfNew.info()

Changing Object Variables to Numeric Variables

In [None]:
dfNew.select_dtypes(include='object').columns.tolist()

In [None]:
# changing Categoricals to number
def catToNumeric(dfA):
    """Turn every object column into a categorical and add its integer codes.

    For each object-dtype column ``c`` the column itself is replaced by a
    ``pd.Categorical`` version and a new column ``id_c`` holding the category
    codes is appended. The frame is modified in place and returned.
    """
    objectCols = list(dfA.select_dtypes(include='object').columns)
    for colName in objectCols:
        dfA[colName] = pd.Categorical(dfA[colName])
        idName = 'id_' + colName
        dfA[idName] = dfA[colName].cat.codes
    return dfA

In [None]:
dfNew = catToNumeric(dfNew)

In [None]:
dfNew.select_dtypes(include=np.number).columns.tolist()

**Data Statistics - Variables**

In [None]:
import plotly.graph_objects as go

In [None]:
# Global sales summed per year; Year 0 (the fill value written by fillNaN) and
# years from 2016 onwards are left out.
salesYear = (
    dfNew.query('0 < Year < 2016')[['Global_Sales', 'Year']]
    .groupby('Year')
    .sum()
)


In [None]:
# One bar per year, height = summed Global_Sales for that year.
fig = go.Figure(
    data=[
        go.Bar(
            x=salesYear.index.tolist(),
            y=salesYear['Global_Sales'].tolist(),
        )
    ]
)
fig.update_layout(title="Games - Sales History (US$ mi)", template='plotly_white')
fig.show()

In [None]:
# Ten platforms with the highest summed Global_Sales for years 1980-2015.
salesPlatform = (
    dfNew.query('2016 > Year >=1980')[['Platform', 'Global_Sales']]
    .groupby(['Platform'])
    .sum()
    .sort_values(by='Global_Sales', ascending=False)
    .head(10)
)

fig = go.Figure(
    data=[
        go.Bar(
            x=salesPlatform.index.tolist(),
            y=salesPlatform['Global_Sales'].tolist(),
        )
    ]
)
fig.update_layout(title="Games - Top 10 Platforms (US$ mi)", template='plotly_white')
fig.show()

In [None]:
# Ten genres with the highest summed Global_Sales for years 1980-2015.
salesGenre = (
    dfNew.query('2016 > Year >=1980')[['Genre', 'Global_Sales']]
    .groupby(['Genre'])
    .sum()
    .sort_values(by='Global_Sales', ascending=False)
    .head(10)
)

fig = go.Figure(
    data=[
        go.Bar(
            x=salesGenre.index.tolist(),
            y=salesGenre['Global_Sales'].tolist(),
        )
    ]
)
fig.update_layout(title="Games - Top 10 Genres (US$ mi)", template='plotly_white')
fig.show()

In [None]:
# Twenty titles with the highest summed Global_Sales for years 1980-2015.
salesNames = (
    dfNew.query('2016 > Year >=1980')[['Name', 'Global_Sales']]
    .groupby(['Name'])
    .sum()
    .sort_values(by='Global_Sales', ascending=False)
    .head(20)
)

# x holds the titles, y their totals; y also drives the bar colours.
x = salesNames.index.tolist()
y = salesNames['Global_Sales'].tolist()
fig = go.Figure(
    data=[go.Bar(x=x, y=y, marker=dict(color=y, colorscale='sunsetdark'))]
)
fig.update_layout(title="Games - Top 20 (US$ mi)", template='plotly_white')
fig.show()

In [None]:
# Column totals of the four regional sales columns, shown as pie slices.
salesW = dfNew.loc[:, ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']].sum()
fig = go.Figure(
    data=[
        go.Pie(
            labels=salesW.index,
            values=salesW.values,
        )
    ]
)
fig.update_layout(title="Games - World (US$ mi)", template='plotly_white')
fig.show()

# NLP - Spacy

In [None]:
# sentiment analyzer
!pip install vaderSentiment
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# nlp
import spacy
!pip install -U spacy
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS as stopWordsEn

# dataviz
from IPython.display import SVG, display

In [None]:
nlp = spacy.load("en_core_web_sm")

**Testing NLP - Dataviz and Sentiment Analyzer**

In [None]:
def showSVG(s):
    """Wrap ``s`` in an IPython ``SVG`` object and display it."""
    svgObject = SVG(s)
    display(svgObject)

In [None]:
ex1 = "I love playing FreeFire."
doc = nlp(ex1)
# jupyter=False makes displacy return the SVG markup as a string. When it
# auto-detects a notebook it displays the graph itself and returns None,
# which would leave showSVG with nothing to draw.
graph01 = displacy.render(
    doc,
    style='dep',
    jupyter=False,
    options={'compact': True, "bg": "#09a3d5", "color": "white"},
)
showSVG(graph01)


In [None]:
readSentiment = SentimentIntensityAnalyzer()
# Position in this list == sentiment code returned by getSentiment.
sentiments = ['Negative', 'Positive', 'Neutral']


def getSentiment(phrase):
    """Classify ``phrase`` from its VADER compound score.

    Returns a tuple ``(code, scores)`` where ``scores`` is the dict produced
    by ``polarity_scores`` and ``code`` is 0 when the compound score is
    <= -0.05, 1 when it is >= 0.05 and 2 otherwise.
    """
    scores = readSentiment.polarity_scores(phrase)
    compound = scores['compound']
    if compound >= 0.05:
        return 1, scores
    if compound <= -0.05:
        return 0, scores
    return 2, scores

In [None]:
getSentiment(ex1)

In [None]:
f'{sentiments[getSentiment(ex1)[0]]}: {ex1}'

**Using games data**

In [None]:
games = dfNew.Name.unique().tolist()

In [None]:
# testing first records
[f'{sentiments[getSentiment(txt)[0]]}: {txt}'  for txt in games[:5]]    

Get the sentiment of every game title

In [None]:
# Sentiment code (first element of getSentiment's result) for every title.
gameSent = [getSentiment(title)[0] for title in games]

counting sentiment

In [None]:
values = [gameSent.count(v) for v in range(0,3) ]

dataviz

In [None]:
# Colour each bar by its own count. The notebook-global `y` belongs to other
# chart cells and does not line up with these three bars.
fig = go.Figure([go.Bar(x=sentiments, y=values, marker=dict(color=values, colorscale='oryel'))])
fig.update_layout(template='plotly_white', title="Games - Title's Sentiment")
fig.show()

**Word analytics**

In [None]:
import string
def tokenizeStr(original):
    """Split text into upper-cased lemmas, dropping stop words and noise.

    Parameters
    ----------
    original : str
        Text to run through the notebook-level spaCy pipeline ``nlp``.

    Returns
    -------
    list of str
        Upper-cased lemma of every token that is not a stop word, not empty
        and not made up solely of punctuation/whitespace characters.
    """
    # Lemmas are upper-cased below, so the stop-word set is upper-cased as
    # well; comparing against the lower-case originals would never match.
    stopwords = {word.upper() for word in stopWordsEn}
    noise = set(string.punctuation) | set(string.whitespace)

    # Build the word list: one upper-cased lemma per non-stop-word token.
    lemmas = [str(token.lemma_).upper() for token in nlp(original) if not token.is_stop]

    # Test characters individually: `word in string.punctuation` is a
    # substring test and lets multi-character tokens such as '...' through.
    return [
        word for word in lemmas
        if word
        and word not in stopwords
        and not all(ch in noise for ch in word)
    ]

In [None]:
import re
def cleaningText(original, show=False):
    """Normalise raw text into upper-case ASCII letters and spaces.

    Steps, in order: upper-case the text; drop URLs; drop ``[...]`` and
    ``<...>`` spans together with their contents; turn ``/`` into a space;
    drop every remaining character that is not an ASCII letter or a space
    (digits, ``@``, punctuation, line breaks); strip surrounding whitespace.

    The span-based removals must run before the character filter, because
    they rely on the delimiters that the filter deletes.

    Parameters
    ----------
    original : str
        Text to clean.
    show : bool, default False
        When True, print the original and the cleaned text.

    Returns
    -------
    str
        The cleaned text.
    """
    txt = original.upper()  # uppercase
    # The text is already upper-case, so the URL pattern is upper-case too.
    txt = re.sub(r'HTTPS?://\S+|WWW\.\S+', '', txt)  # remove URLs
    # Non-greedy so that text between two separate bracket pairs survives.
    txt = re.sub(r'\[.*?\]', '', txt)  # remove contents between brackets
    txt = re.sub(r'<.*?>+', '', txt)  # remove contents between less and more signs
    txt = re.sub('/', ' ', txt)  # a slash separates words
    txt = re.sub(r'[^a-zA-Z ]+', '', txt)  # remove digits, '@', punctuation and line breaks
    txt = txt.strip()
    if show:
        print('ORIGINAL: ', original)
        print('   TEXT CLEANNED: ', txt)
    return txt


In [None]:
# One token list per game title: clean the title first, then tokenize it.
wordsT = [tokenizeStr(cleaningText(title)) for title in games]

In [None]:
from itertools import chain
# Flatten the per-title token lists into one list of tokens.
wordsT2 = list(chain.from_iterable(wordsT))
# Series.value_counts() is used because the top-level pd.value_counts() is
# deprecated; the result is still sorted by descending frequency.
wordlist = pd.Series(wordsT2, dtype=object).value_counts()
topW = pd.DataFrame(data={'tag': wordlist.index, 'count': wordlist.values})

In [None]:
# First ten rows of topW as horizontal bars: counts along the x axis, tags
# along the y axis, bar colour driven by the count.
x = topW['tag'].head(10).str.upper()
y = topW['count'].head(10)
fig = go.Figure(
    data=[
        go.Bar(
            x=y,
            y=x,
            orientation='h',
            marker=dict(color=y, colorscale='oryel'),
        )
    ]
)
fig.update_layout(title="Games - Top 10 tags name", template='plotly_white')
fig.show()

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS as stc
stopwordsCloud = set(stc)

In [None]:
# Configure the cloud, then feed it all tokens joined by single spaces.
wordcloud = WordCloud(
    background_color='white',
    stopwords=stopwordsCloud,
    width=800,
    height=600,
    min_font_size=10,
)
wordcloud = wordcloud.generate(' '.join(wordsT2))

# Draw the generated image with the axes hidden.
plt.figure(facecolor=None, figsize=(10, 8))
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.title('Map of Words - Games')

plt.show()