In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

# Notebook-wide settings.
# NOTE: the 'ignore' filter silences every warning category for the rest of
# the session, including deprecation notices from pandas/seaborn.
warnings.simplefilter('ignore')
# Show up to 100 columns when a wide DataFrame is displayed.
pd.options.display.max_columns = 100

In [None]:
# Load the raw Steam games table into `df`, the frame every later cell works on.
# Use your local path or adjust if loading from data/ folder
# low_memory=False: parse the file in one pass instead of in chunks, which
# avoids per-chunk (mixed) dtype inference on wide CSVs.
df = pd.read_csv('../data/steam_games.csv', low_memory=False)

# Check shape and columns
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
# Last expression of the cell: shown as the cell output (first five rows).
df.head()

In [None]:
# Column dtypes and non-null counts, followed by the 20 columns with the
# highest number of missing values.
df.info()
null_counts = df.isna().sum()
null_counts.sort_values(ascending=False).head(20)

In [None]:
# Describe numerical columns
# Called without arguments, describe() on a mixed-dtype frame summarises only
# the numeric columns (count, mean, std, min, quartiles, max).
df.describe()

In [None]:
# For every object-dtype column: print its number of distinct values and
# its five most frequent values.
object_columns = df.select_dtypes(include='object').columns
for col in object_columns:
    print(f"\n{col} — Unique: {df[col].nunique()}")
    most_frequent = df[col].value_counts().head(5)
    print(most_frequent)

In [None]:
# Histogram of game prices (50 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['price'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Game Prices')
ax.set_xlabel('Price ($)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Percentage of missing values per column; display only the columns that
# actually have gaps, worst first.
missing = df.isna().mean().mul(100)
missing.loc[missing.gt(0)].sort_values(ascending=False)

In [None]:
# Scatter of price against recommendation count.
#
# A plain 'log' axis cannot place zero, so free games (price == 0) -- and any
# game with zero recommendations -- would silently drop out of the plot.
# 'symlog' is linear inside [-linthresh, linthresh] and logarithmic beyond it,
# which keeps those points visible while still compressing the long tail.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='price', y='recommendations', alpha=0.5, ax=ax)
ax.set_xscale('symlog', linthresh=1)
ax.set_yscale('symlog', linthresh=1)
ax.set_title('Price vs. Recommendations')
ax.set_xlabel('Price ($)')
ax.set_ylabel('Number of Recommendations')
ax.grid(True)
plt.show()

In [9]:
# Parse release_date into datetimes and derive the release year.
# errors='coerce' turns any value that cannot be parsed into NaT, so those
# rows end up with a missing release_year.
release_dates = pd.to_datetime(df['release_date'], errors='coerce')
df['release_date'] = release_dates
df['release_year'] = release_dates.dt.year

In [None]:
# Number of titles per release year, ordered chronologically
# (rows with a missing release_year are excluded by value_counts).
games_per_year = df['release_year'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(x=games_per_year.index, y=games_per_year.values, ax=ax)
ax.set_title('Number of Games Released per Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Games')
ax.grid(True)
plt.show()

In [None]:
# Median price of the games released in each year.
median_price_by_year = df.groupby('release_year')['price'].median()

fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(
    x=median_price_by_year.index,
    y=median_price_by_year.values,
    ax=ax,
)
ax.set_title('Median Game Price by Release Year')
ax.set_xlabel('Year')
ax.set_ylabel('Median Price ($)')
ax.grid(True)
plt.show()

In [None]:
from collections import Counter

# Tally how many games carry each genre. A game can list several genres in
# one cell; the cell is split on ';' (assumes that is the separator used in
# the CSV -- TODO confirm) and every piece is counted once per game row.
genre_counts = Counter(
    genre
    for genre_field in df['genres'].dropna()
    for genre in genre_field.split(';')
)

# Top 10 genres
top_genres = pd.DataFrame(genre_counts.most_common(10), columns=['Genre', 'Count'])

# Horizontal bar chart, most common genre first.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=top_genres, x='Count', y='Genre', palette='viridis', ax=ax)
ax.set_title('Top 10 Genres')
ax.set_xlabel('Number of Games')
ax.set_ylabel('Genre')
ax.grid(True)
plt.show()

In [None]:
# Columns removed before the cleaned dataset is saved.
columns_to_drop = [
    'detailed_description', 'about_the_game', 'short_description',
    'header_image', 'website', 'support_url', 'support_email',
    'notes', 'full_audio_languages', 'movies', 'screenshots',
    'metacritic_url', 'packages', 'tags'
]

# Drop only the columns that are still present and rebind `df` instead of
# mutating it in place: re-running this cell is then a no-op rather than a
# KeyError on the already-removed columns.
columns_present = [name for name in columns_to_drop if name in df.columns]
columns_absent = [name for name in columns_to_drop if name not in df.columns]
if columns_absent:
    # Surface skipped names so a typo in the list does not go unnoticed.
    print(f"Not found (skipped): {columns_absent}")

df = df.drop(columns=columns_present)
print(f"Remaining columns: {df.shape[1]}")


In [6]:
def price_bucket(price):
    """Map a price to a coarse price-range label.

    Parameters
    ----------
    price : number or missing value
        The price to classify.

    Returns
    -------
    str or None
        'Free' when price == 0, '$0 - $5' when price <= 5, '$5 - $15' when
        price <= 15, '$15 - $30' when price <= 30, otherwise '$30+'.
        None when the price is missing (NaN / None / pd.NA).

    Notes
    -----
    Every comparison against NaN is False, so without the explicit missing
    check a NaN price would fall through to the final branch and be
    mislabelled as '$30+'.
    """
    if pd.isna(price):
        return None
    if price == 0:
        return 'Free'

    # Upper bounds are inclusive and checked in ascending order.
    upper_bounds = (
        (5, '$0 - $5'),
        (15, '$5 - $15'),
        (30, '$15 - $30'),
    )
    for upper_bound, label in upper_bounds:
        if price <= upper_bound:
            return label
    return '$30+'

# Label every row with its price range (element-wise over the price column).
df['price_bucket'] = df['price'].map(price_bucket)


In [7]:
def get_platform(row):
    """Return the platforms flagged on a row as one comma-separated string.

    Reads row['windows'], row['mac'] and row['linux']; each truthy flag
    contributes its display name, always in the order Windows, Mac, Linux.
    Returns an empty string when no flag is truthy.
    """
    flag_labels = (
        ('windows', 'Windows'),
        ('mac', 'Mac'),
        ('linux', 'Linux'),
    )
    return ', '.join(label for flag, label in flag_labels if row[flag])

# Build the combined platform string for every row (row-wise apply).
df['platforms'] = df.apply(get_platform, axis='columns')


In [None]:
# Fraction of missing values per column after the cleanup, ten worst first.
null_share = df.isna().mean()
null_share.sort_values(ascending=False).head(10)

In [9]:
# Write the cleaned frame next to the raw data; index=False leaves the
# row index out of the file.
cleaned_csv_path = '../data/steam_games_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)