<a href="https://colab.research.google.com/github/Wasian98/MATH-120-Final-Project/blob/main/MATH_120_Final_Project_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MATH 120 Final Project: Steam Game Stats for March 2025

## Environment Setup

Run the cell below to set up the environment for either Google Colab or local execution:

In [86]:
import os
import subprocess
import sys

# Where the project is fetched from and where it lives on a Colab VM.
REPO_URL = 'https://github.com/Wasian98/MATH-120-Final-Project.git'
REPO_DIR = '/content/MATH-120-Final-Project'

# Check if running in Google Colab. Only the import itself is guarded, so an
# ImportError raised by anything else is never mistaken for "running locally".
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    print("Running in Google Colab")

    # Clone repository if in Colab. The target directory is passed explicitly
    # so the clone always lands where the existence check and chdir expect it,
    # whatever the current working directory happens to be.
    if not os.path.exists(REPO_DIR):
        clone = subprocess.run(
            ['git', 'clone', REPO_URL, REPO_DIR],
            capture_output=True,
            text=True,
        )
        # Echo git's own messages into the cell output, then fail loudly if
        # the clone did not succeed instead of failing later in os.chdir.
        print(clone.stdout + clone.stderr, end='')
        clone.check_returncode()

    # Change to project directory
    os.chdir(REPO_DIR)
else:
    print("Running locally")

# Add src directory to Python path. An absolute path keeps the project modules
# importable even if a later cell changes the working directory.
SRC_DIR = os.path.abspath('src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

print(f"Current working directory: {os.getcwd()}")

Running in Google Colab
Current working directory: /content/MATH-120-Final-Project


## Import Libraries

In [87]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from data_processing import load_raw_data, clean_data, save_cleaned_data
from analysis import meta_genre_check, genre_meta_check


ModuleNotFoundError: No module named 'analysis'

## Data Loading and Cleaning

In [None]:
# Load raw data
# The path is relative to the project root: on Colab the setup cell has already
# chdir'ed there, and a local run is assumed to start in the repository root
# (the same assumption the relative 'src' and 'data/...' paths in this notebook
# rely on), so this cell works in both environments.
steam_raw = load_raw_data('data_raw/games_march2025_full.csv')
print("\nFirst few rows of Steam data:")
# End on the expression instead of print(): the notebook then renders the frame
# as a table rather than plain text wrapped across several column blocks.
steam_raw.head()


First few rows of Steam data:
    appid                             name release_date  required_age  price  \
0     730                 Counter-Strike 2   2012-08-21             0   0.00   
1  578080              PUBG: BATTLEGROUNDS   2017-12-21             0   0.00   
2     570                           Dota 2   2013-07-09             0   0.00   
3  271590        Grand Theft Auto V Legacy   2015-04-13            17   0.00   
4  488824  Tom Clancy's Rainbow Six® Siege   2015-12-01            17  19.99   

   dlc_count                               detailed_description  \
0          1  For over two decades, Counter-Strike has offer...   
1          0  LAND, LOOT, SURVIVE! Play PUBG: BATTLEGROUNDS ...   
2          2  The most-played game on Steam. Every day, mill...   
3          0  When a young street hustler, a retired bank ro...   
4          9  Edition Comparison Ultimate Edition The Tom Cl...   

                                      about_the_game  \
0  For over two decades, Coun

In [79]:
# Clean data
# `steam_raw` is left untouched; the cleaned result gets its own name so this
# cell can be re-run safely.
steam_clean = clean_data(steam_raw)

print("Cleaned and data shape:", steam_clean.shape)
print("\nData info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (doing so appended a stray "None" line).
steam_clean.info()

Cleaned and data shape: (94948, 10)

Data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94948 entries, 0 to 94947
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      94946 non-null  object 
 1   release_date              94948 non-null  object 
 2   price                     94948 non-null  float64
 3   metacritic_score          94948 non-null  int64  
 4   genres                    94948 non-null  object 
 5   user_score                94948 non-null  int64  
 6   estimated_owners          94948 non-null  object 
 7   average_playtime_forever  94948 non-null  int64  
 8   average_playtime_2weeks   94948 non-null  int64  
 9   num_reviews_total         94948 non-null  int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 7.2+ MB
None


In [80]:
# Save cleaned data
# The destination is stated once so the confirmation message always reports
# the path that was actually passed to save_cleaned_data.
processed_csv = 'data/processed_steam_data.csv'
save_cleaned_data(steam_clean, processed_csv)
print(f"Cleaned data saved to {processed_csv}")

Cleaned data saved to data/processed_steam_data.csv


## Data Analysis

In [81]:
# Number of rows for each distinct value of the `genres` column.
# NOTE(review): the keys look like whole genre lists stored as text, so this
# counts genre *combinations* rather than individual genres — confirm that is
# the intended unit of analysis.
genre_groups = steam_clean.groupby("genres")
genre_groups.size()


Unnamed: 0_level_0,0
genres,Unnamed: 1_level_1
"['Accounting', 'Animation & Modeling', 'Audio Production', 'Design & Illustration', 'Education', 'Photo Editing', 'Software Training', 'Utilities', 'Video Production', 'Web Publishing', 'Game Development']",2
"['Accounting', 'Animation & Modeling', 'Audio Production', 'Design & Illustration', 'Education', 'Photo Editing', 'Software Training', 'Utilities', 'Video Production', 'Web Publishing']",2
"['Accounting', 'Design & Illustration', 'Education', 'Utilities', 'Web Publishing', 'Game Development']",1
"['Accounting', 'Education', 'Software Training', 'Utilities', 'Early Access']",1
"['Accounting', 'Utilities']",1
...,...
"['Violent', 'Simulation', 'Early Access']",1
"['Violent', 'Sports']",1
['Violent'],1
['Web Publishing'],2


In [84]:
# Summary per distinct `genres` value: how many rows share it (`genre_count`)
# and the mean of `metacritic_score` (`meta_score`). Only values with more
# than 10 rows and a positive mean are kept, most common first.
# NOTE(review): the resulting means are very small, so a score of 0 presumably
# marks "no Metacritic score" and is dragging the averages down — confirm, and
# consider averaging over scored games only.
op = (
    steam_clean
    .groupby("genres")
    .agg(
        genre_count=("genres", "size"),
        meta_score=("metacritic_score", "mean"),
    )
    .query("genre_count > 10 and meta_score > 0")
    .sort_values(by="genre_count", ascending=False)
)

# End on the expression instead of print() so the notebook renders the frame
# as a table rather than as truncated plain text.
op

                                                    genre_count  meta_score
genres                                                                     
[]                                                         5441    0.092446
['Casual', 'Indie']                                        5100    0.401373
['Action', 'Indie']                                        4695    3.943131
['Action', 'Adventure', 'Indie']                           4107    5.012174
['Adventure', 'Indie']                                     3436    7.213329
...                                                         ...         ...
['Free To Play', 'Indie', 'Massively Multiplaye...           12    5.250000
['Action', 'Adventure', 'Indie', 'Massively Mul...           11   15.272727
['Action', 'RPG', 'Simulation', 'Strategy']                  11   13.272727
['Massively Multiplayer', 'Strategy', 'Free To ...           11    6.818182
['Racing', 'Simulation', 'Sports', 'Strategy']               11   13.909091

[192 rows x

In [83]:
# Same per-`genres` summary as the `op` cell (row count and mean
# `metacritic_score`, restricted to values with more than 10 rows and a
# positive mean), but ordered by mean score, highest first.
# NOTE(review): the resulting means are very small, so a score of 0 presumably
# marks "no Metacritic score" and is dragging the averages down — confirm, and
# consider averaging over scored games only.
po = (
    steam_clean
    .groupby("genres")
    .agg(
        genre_count=("genres", "size"),
        meta_score=("metacritic_score", "mean"),
    )
    .query("genre_count > 10 and meta_score > 0")
    .sort_values(by="meta_score", ascending=False)
)

# End on the expression instead of print() so the notebook renders the frame
# as a table rather than as truncated plain text.
po

                                                    genre_count  meta_score
genres                                                                     
['Action', 'RPG']                                           251   20.549801
['Action', 'RPG', 'Strategy']                                41   18.390244
['Action', 'Indie', 'RPG', 'Simulation', 'Strat...           21   17.238095
['RPG', 'Simulation', 'Strategy']                            64   16.312500
['Action', 'RPG', 'Simulation']                              18   16.277778
...                                                         ...         ...
['Adventure', 'Casual', 'Simulation']                       221    0.339367
['Casual', 'Indie', 'RPG', 'Simulation']                    247    0.315789
['Casual']                                                 2591    0.298726
['Casual', 'Indie', 'RPG']                                  299    0.264214
[]                                                         5441    0.092446

[192 rows x

## Data Visualization

## Conclusions