In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns

In [22]:
df = pd.read_csv('/content/vgsales.csv')
df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,259,Asteroids,2600,1980.0,Shooter,Atari,4.0,0.26,0.0,0.05,4.31
1,545,Missile Command,2600,1980.0,Shooter,Atari,2.56,0.17,0.0,0.03,2.76
2,1768,Kaboom!,2600,1980.0,Misc,Activision,1.07,0.07,0.0,0.01,1.15
3,1971,Defender,2600,1980.0,Misc,Atari,0.99,0.05,0.0,0.01,1.05
4,2671,Boxing,2600,1980.0,Fighting,Activision,0.72,0.04,0.0,0.01,0.77


In [23]:
columns = list(df.columns)
columns

['Rank',
 'Name',
 'Platform',
 'Year',
 'Genre',
 'Publisher',
 'NA_Sales',
 'EU_Sales',
 'JP_Sales',
 'Other_Sales',
 'Global_Sales']

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16324 entries, 0 to 16323
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16324 non-null  int64  
 1   Name          16324 non-null  object 
 2   Platform      16324 non-null  object 
 3   Year          15968 non-null  float64
 4   Genre         16220 non-null  object 
 5   Publisher     16288 non-null  object 
 6   NA_Sales      16324 non-null  float64
 7   EU_Sales      16324 non-null  float64
 8   JP_Sales      16324 non-null  float64
 9   Other_Sales   16324 non-null  float64
 10  Global_Sales  16324 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB


In [25]:
df.shape

(16324, 11)

Checking for Duplicates

In [26]:
print(df.duplicated().sum())

0


Checking for null values

In [27]:
print(df.isnull().sum())

Rank              0
Name              0
Platform          0
Year            356
Genre           104
Publisher        36
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64


In [28]:
platform_counts = df['Platform'].value_counts()
print(platform_counts)


DS      2133
PS2     2127
PS3     1304
Wii     1290
X360    1235
PSP     1197
PS      1189
PC       943
GBA      811
XB       803
GC       542
3DS      500
PSV      410
PS4      335
N64      316
SNES     239
XOne     213
SAT      173
WiiU     143
2600     116
NES       98
GB        97
DC        52
GEN       27
NG        12
WS         6
SCD        6
3DO        3
TG16       2
GG         1
PCFX       1
Name: Platform, dtype: int64


In [32]:
# Replacing missing value with most frequently used name
most_frequent_platform = 'DS'
df['Platform'].fillna(value=most_frequent_platform, inplace=True)

In [33]:
# Replacing missing value in NA_Sales, EU_Sales, JP_Sales, Other_Sales, Global_Sales by calculating mean
sales_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
df[sales_columns] = df[sales_columns].fillna(df[sales_columns].mean())

In [35]:
# Filling missing values in the 'Year' column with the mean
df['Year'].fillna(value=df['Year'].mean(), inplace=True)

# Filling missing values in the 'Genre' and 'Publisher' columns with the most frequent values
most_frequent_genre = df['Genre'].mode()[0]
most_frequent_publisher = df['Publisher'].mode()[0]

df['Genre'].fillna(value=most_frequent_genre, inplace=True)
df['Publisher'].fillna(value=most_frequent_publisher, inplace=True)

# Verify the missing values have been filled
print(df.isnull().sum())


Rank            0
Name            0
Platform        0
Year            0
Genre           0
Publisher       0
NA_Sales        0
EU_Sales        0
JP_Sales        0
Other_Sales     0
Global_Sales    0
dtype: int64


In [40]:
categorical_cols = ['Platform', 'Genre']

# Initialize a dictionary to store the mappings
encoding_mappings = {}

# Apply Label Encoding to each categorical column and store the mappings
for col in categorical_cols:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    encoding_mappings[col] = {label: encoded for label, encoded in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))}

# Concatenate the label encoded columns with the original DataFrame
df = pd.concat([df.drop(columns=categorical_cols), df[categorical_cols]], axis=1)

# Print the encoding mappings
print("Encoding Mappings:")
for col, mapping in encoding_mappings.items():
    print(f"{col}: {mapping}")
    print()

Encoding Mappings:
Platform: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30}

Genre: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11}

