In [None]:
# 1. Install and fetch the dataset from Kaggle
!pip install opendatasets --quiet

import opendatasets as od
od.download('https://www.kaggle.com/datasets/gregorut/videogamesales')

import pandas as pd
from sklearn.model_selection import train_test_split

# 2. Load the dataset
data = pd.read_csv(r'data/vgsales.csv')
print("Dataset shape:", data.shape)
print(data.head())

# 3. Basic EDA
# Report each column's dtype, then the number of missing values per column.
column_types = data.dtypes
print("\nColumns and Data Types:", column_types, sep="\n")

missing_counts = data.isna().sum()
print("\nMissing values per column:", missing_counts, sep="\n")

# 4. Data Cleaning
# Convert Year to numeric (unparseable entries become NaN via errors='coerce')
# and drop rows with missing year
data['Year'] = pd.to_numeric(data['Year'], errors='coerce')
data = data.dropna(subset=['Year'])

# 5. Feature Engineering
# Compute share of global sales by region.
# Zero totals are masked to NaN first: dividing by 0 would produce inf, which
# NaN-filling does not remove, whereas a NaN share is handled by fillna().
global_sales = data['Global_Sales'].where(data['Global_Sales'] != 0)
for region in ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']:
    data[f'{region}_pct'] = data[region] / global_sales

# 6. Prepare data for modeling example:
# Let's say we want to predict Global_Sales based on numeric features
# NOTE(review): the *_pct features are computed by dividing by Global_Sales,
# i.e. they are derived from the target itself -- confirm this is intended.
features = ['Year', 'NA_Sales_pct', 'EU_Sales_pct', 'JP_Sales_pct', 'Other_Sales_pct']
X = data[features].fillna(0)  # fill any remaining NaNs
y = data['Global_Sales']

# 7. Split the dataset
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("\nTraining features preview:")
print(X_train.head())

# Printed explicitly (like the features preview) so the output does not rely
# on the notebook auto-displaying the cell's last expression.
print("\nTraining targets preview:")
print(y_train.head())


Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
