## Import Everything we need

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df=pd.read_csv("vgsales.csv")

In [3]:
df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [16]:
df['Genre'].unique()

array(['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Misc',
       'Shooter', 'Simulation', 'Action', 'Fighting', 'Adventure',
       'Strategy'], dtype=object)

In [8]:
df.index

RangeIndex(start=0, stop=16598, step=1)

In [9]:
len(df)

16598

In [10]:
df.columns

Index(['Rank', 'Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales'],
      dtype='object')

## feature Engineering is a process of converting raw string based(type) data into numerical type data

## for example cateorgical data like marks, diabetes tye etc

# LabelEncoding

In [11]:
from sklearn.preprocessing import LabelEncoder
leo=LabelEncoder()

In [12]:
genre_label=leo.fit_transform(df['Genre'])
genre_label

array([10,  4,  6, ...,  6,  5,  4])

In [13]:
en=leo.classes_
en

array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
       'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',
       'Strategy'], dtype=object)

#### Dictionary Comprehension

In [14]:
genre_map={index:label for index ,label in enumerate(en)}
genre_map

{0: 'Action',
 1: 'Adventure',
 2: 'Fighting',
 3: 'Misc',
 4: 'Platform',
 5: 'Puzzle',
 6: 'Racing',
 7: 'Role-Playing',
 8: 'Shooter',
 9: 'Simulation',
 10: 'Sports',
 11: 'Strategy'}

## Putting new genre_label into dataframe(df)

In [15]:
df['GenreLabel']=genre_label
df[['Name','Year','Genre','GenreLabel']].head()

Unnamed: 0,Name,Year,Genre,GenreLabel
0,Wii Sports,2006.0,Sports,10
1,Super Mario Bros.,1985.0,Platform,4
2,Mario Kart Wii,2008.0,Racing,6
3,Wii Sports Resort,2009.0,Sports,10
4,Pokemon Red/Pokemon Blue,1996.0,Role-Playing,7


In [17]:
df.drop('Genre',inplace=True,axis=1)

In [18]:
df.head()

Unnamed: 0,Rank,Name,Platform,Year,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,GenreLabel
0,1,Wii Sports,Wii,2006.0,Nintendo,41.49,29.02,3.77,8.46,82.74,10
1,2,Super Mario Bros.,NES,1985.0,Nintendo,29.08,3.58,6.81,0.77,40.24,4
2,3,Mario Kart Wii,Wii,2008.0,Nintendo,15.85,12.88,3.79,3.31,35.82,6
3,4,Wii Sports Resort,Wii,2009.0,Nintendo,15.75,11.01,3.28,2.96,33.0,10
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Nintendo,11.27,8.89,10.22,1.0,31.37,7


## Feature engineering Ordinal Dimensions in datasets

Ordinal dimensions of data are those dimension whoes order of apperance is significant for their existence in terms of their values

In [20]:
poke=pd.read_csv('Pokemon.csv')

In [21]:
poke=poke.sample(random_state=1,frac=1).reset_index(drop=True)
poke.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,6,CharizardMega Charizard Y,Fire,Flying,634,78,104,78,159,115,100,Gen 1,False
1,460,Abomasnow,Grass,Ice,494,90,92,75,92,85,60,Gen 4,False
2,161,Sentret,Normal,,215,35,46,34,35,45,20,Gen 2,False
3,667,Litleo,Fire,Normal,369,62,50,58,73,54,72,Gen 6,False
4,224,Octillery,Water,,480,75,105,75,105,75,45,Gen 2,False


In [22]:
poke['Generation'].unique()

array(['Gen 1', 'Gen 4', 'Gen 2', 'Gen 6', 'Gen 3', 'Gen 5'], dtype=object)

In [24]:
gen_ordinal_map={
    'Gen 1':1, 'Gen 4':4, 'Gen 2':2, 'Gen 6':6, 'Gen 3':3, 'Gen 5':5,
}
poke['GenerationLabel']=poke['Generation'].map(gen_ordinal_map)
poke[['Name','Type 1','GenerationLabel','Generation','Total']].head()

Unnamed: 0,Name,Type 1,GenerationLabel,Generation,Total
0,CharizardMega Charizard Y,Fire,1,Gen 1,634
1,Abomasnow,Grass,4,Gen 4,494
2,Sentret,Normal,2,Gen 2,215
3,Litleo,Fire,6,Gen 6,369
4,Octillery,Water,2,Gen 2,480


## One Hot Level Encoder

In [34]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

In [35]:
poke[['Name','Generation','Legendary']].head()

Unnamed: 0,Name,Generation,Legendary
0,CharizardMega Charizard Y,Gen 1,False
1,Abomasnow,Gen 4,False
2,Sentret,Gen 2,False
3,Litleo,Gen 6,False
4,Octillery,Gen 2,False


In [36]:
#transform and map pokemon generation
gen_le=LabelEncoder()
gen_labels=gen_le.fit_transform(poke['Generation'])
poke['Gen_Label']=gen_labels

In [40]:
#Tranform and map pokemon legendary status
en_le=LabelEncoder()
gen_labels=en_le.fit_transform(poke['Legendary'])
poke['Lgnd_Label']=gen_labels

#encode genreation labels using one hot encoding schemes
gen_ohe = OneHotEncoder()
gen_feature_arr = gen_ohe.fit_transform(poke[['Gen_Label']]).toarray()
gen_feature_labels = list(gen_le.classes_)
gen_feature_labels



In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


['Gen 1', 'Gen 2', 'Gen 3', 'Gen 4', 'Gen 5', 'Gen 6']

In [41]:
gen_features = pd.DataFrame(gen_feature_arr, columns=gen_feature_labels)

In [42]:
gen_features

Unnamed: 0,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5,Gen 6
0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,1.0
6,0.0,0.0,0.0,1.0,0.0,0.0
7,0.0,0.0,1.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,1.0,0.0
