ARTI308 - Machine Learning


## Defining the Machine Learning Problem

This is an exploratory sales analysis problem.
The dataset describes video games by platform, genre, publisher, and year, along with their regional and global sales.

The goal is to identify sales patterns and the factors associated with high global sales.


In [None]:
# Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme to every plot drawn after this cell.
# set_theme() is the preferred interface; sns.set() is only an alias for it.
sns.set_theme()


In [None]:
# Read the video game sales CSV into a DataFrame
DATA_FILE = 'vgsales.csv'
df = pd.read_csv(DATA_FILE)

# Preview the first five rows
df.head(5)


### Check Missing Values


In [None]:
# Count the missing (NaN) entries in each column

missing_counts = df.isna().sum()
missing_counts


In [None]:
# Plain-text printout of the per-column missing-value counts
missing_per_column = df.isna().sum()
print(missing_per_column)


### Check duplicate rows


In [None]:
# Count rows that are exact duplicates of an earlier row

duplicate_flags = df.duplicated()
duplicate_flags.sum()


### Number of rows and columns


In [None]:
# Dataset dimensions as (rows, columns)

n_rows, n_cols = df.shape
(n_rows, n_cols)


### Data type of columns


In [None]:
# Data type pandas assigned to each column

column_types = df.dtypes
column_types


In [None]:
# Clean the key columns:
#   - Year: force to numeric; values that cannot be parsed become NaN
#   - Publisher: replace missing names with the placeholder 'Unknown'
df = df.assign(
    Year=pd.to_numeric(df['Year'], errors='coerce'),
    Publisher=df['Publisher'].fillna('Unknown'),
)

# Show the column types after cleaning
print(df.dtypes)


### Statistical summary


In [None]:
# Descriptive statistics for the numeric columns

summary_stats = df.describe()
summary_stats


### Distribution of Global Sales


In [None]:
# The x-axis is capped at the 99th percentile so the largest values do not
# stretch the view.
global_sales_cap = df['Global_Sales'].quantile(0.99)

plt.figure(figsize=(8,5))
# binrange keeps all 30 bins inside the visible range. Without it the bins span
# the full data range and only those that fall below the cap would be shown.
sns.histplot(
    data=df,
    x='Global_Sales',
    bins=30,
    binrange=(0, global_sales_cap),
    kde=True,
    color='steelblue',
)
plt.title('Distribution of Global Sales (X-axis adjusted)')
plt.xlabel('Global Sales (millions)')
plt.ylabel('Frequency')
plt.xlim(0, global_sales_cap)
plt.show()


### Distribution of NA Sales


In [None]:
# The x-axis is capped at the 99th percentile so the largest values do not
# stretch the view.
na_sales_cap = df['NA_Sales'].quantile(0.99)

plt.figure(figsize=(8,5))
# binrange keeps all 30 bins inside the visible range. Without it the bins span
# the full data range and only those that fall below the cap would be shown.
sns.histplot(
    data=df,
    x='NA_Sales',
    bins=30,
    binrange=(0, na_sales_cap),
    kde=True,
    color='darkorange',
)
plt.title('Distribution of NA Sales (X-axis adjusted)')
plt.xlabel('NA Sales (millions)')
plt.ylabel('Frequency')
plt.xlim(0, na_sales_cap)
plt.show()


### Global Sales by Platform


In [None]:
# Total global sales per platform, largest first
platform_sales = (
    df.groupby('Platform')['Global_Sales']
    .sum()
    .sort_values(ascending=False)
)

# Show the ten best-selling platforms
platform_sales.head(10)


### Global Sales by Genre


In [None]:
# Total global sales per genre, largest first
genre_sales = (
    df.groupby('Genre')['Global_Sales']
    .sum()
    .sort_values(ascending=False)
)

# Show every genre
genre_sales


### Global Sales by Publisher


In [None]:
# Total global sales per publisher, largest first
publisher_sales = (
    df.groupby('Publisher')['Global_Sales']
    .sum()
    .sort_values(ascending=False)
)

# Show the ten best-selling publishers
publisher_sales.head(10)


### NA Sales vs Global Sales


In [None]:
# Scatter plot of each game's NA sales against its global sales
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x='NA_Sales', y='Global_Sales', ax=ax)
ax.set_title('NA Sales vs Global Sales')
ax.set_xlabel('NA Sales')
ax.set_ylabel('Global Sales')
plt.show()


### Correlation Heatmap


In [None]:
# Columns holding the regional and global sales figures
sales_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']

# Pairwise correlations between the sales columns
sales_corr = df[sales_columns].corr()

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(sales_corr, annot=True, ax=ax)  # annot=True writes each coefficient in its cell
ax.set_title('Correlation Heatmap')
plt.show()


### Yearly Global Sales Trend


In [None]:
# Whole-number year column; the nullable 'Int64' dtype keeps missing years as <NA>
df['Year_Int'] = df['Year'].round().astype('Int64')

# Total global sales per year, ignoring rows without a year
rows_with_year = df.dropna(subset=['Year_Int'])
yearly_sales = rows_with_year.groupby('Year_Int')['Global_Sales'].sum()

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(yearly_sales.index.astype(int), yearly_sales.values, marker='o')
ax.set_title('Yearly Global Sales Trend')
ax.set_xlabel('Year')
ax.set_ylabel('Global Sales (millions)')
plt.show()


### Key Findings


In [None]:
# Leading category in each grouping, taken from the totals computed earlier
top_platform = platform_sales.idxmax()
top_genre = genre_sales.idxmax()
top_publisher = publisher_sales.idxmax()

# Sum of global sales over the whole dataset, rounded to two decimals
total_global_sales = round(df['Global_Sales'].sum(), 2)

print('Top Platform by Global Sales:', top_platform)
print('Top Genre by Global Sales:', top_genre)
print('Top Publisher by Global Sales:', top_publisher)
print('Total Global Sales:', total_global_sales, 'million units')
