In [11]:
""" Import Statements """
import pandas as pd
import numpy as np
import json
import pymongo

In [13]:
def parse_relevant_data(data):
    """
        Extracts the fields of interest from every game in the raw JSON data.

        Parameters:
        data (dict): The JSON data loaded into a dictionary. Each value is the
                     record (dict) of a single game.

        Returns:
        list: One dictionary per game, in the iteration order of `data`,
              containing only the selected fields.

        Raises:
        KeyError: If a game record lacks one of the selected fields.
    """
    # Keys of every output record, in the order they are emitted.
    output_keys = (
        'name',
        'release_date',
        'required_age',
        'price',
        'dlc_count',
        'detailed_description',
        'about_the_game',
        'short_description',
        'windows',
        'mac',
        'linux',
        'metacritic_score',
        'achievements',
        'recommendations',
        'supported_languages',
        'full_audio_languages',
        'developers',
        'publishers',
        'categories',
        'genres',
        'tags',
        'user_score',
        'score_rank',
        'positive_reviews',
        'negative_reviews',
        'estimated_owners',
        'average_playtime_forever',
        'average_playtime_2weeks',
        'median_playtime_forever',
        'median_playtime_2weeks',
        'peak_ccu',
    )
    # Output keys that are stored under a different key in the source record.
    source_key_overrides = {
        'positive_reviews': 'positive',
        'negative_reviews': 'negative',
    }

    records = []
    for game_id in data:
        raw_record = data[game_id]
        records.append({
            key: raw_record[source_key_overrides.get(key, key)]
            for key in output_keys
        })

    return records

In [None]:
def load_dataset_into_mongodb(json_file, client_url, db_name, collection_name):
    """
        Populates a MongoDB collection from the JSON dataset, but only when the
        collection does not hold any documents yet.

        The JSON file is read and parsed only if the collection is empty, so
        re-running this on an already populated collection is cheap.

        Parameters:
        json_file (str): Path to the JSON file.
        client_url (str): MongoDB client URL.
        db_name (str): Name of the database.
        collection_name (str): Name of the collection.

        Returns:
        None. Progress is reported through printed messages.
    """
    client = pymongo.MongoClient(client_url)
    try:
        db = client[db_name]
        collection = db[collection_name]

        print("Database Names: ", client.list_database_names())
        print(f"{db_name} collection Names: ", db.list_collection_names())

        # Only an empty collection is populated; existing documents are never
        # touched, which keeps repeated runs from inserting duplicates.
        if collection.count_documents({}) > 0:
            print("The collection already has data.")
            return

        # Open the JSON file and load the data into a dictionary
        with open(json_file, encoding="utf8") as data_file:
            data_dict = json.load(data_file)

        # Parse the relevant data from the dictionary
        relevant_data = parse_relevant_data(data_dict)

        # insert_many rejects an empty document list, so skip the call then.
        if not relevant_data:
            print("The dataset contains no games; nothing was inserted.")
            return

        collection.insert_many(relevant_data)
        print("The collection was populated with data.")
    finally:
        # Release the connection even when reading, parsing or inserting fails.
        client.close()

In [None]:
def pull_dataframe_from_mongodb(client_url, db_name, collection_name):
    """
    Pulls the collection from the MongoDB database and returns it as a DataFrame.

    Args:
        client_url (string): The URL of the MongoDB client
        db_name (string): The name of the database
        collection_name (string): The name of the collection

    Returns:
        pandas.DataFrame: One row per document of the collection, without the
            '_id' column. An empty DataFrame if the collection does not exist.
    """
    # Access the local database
    client = pymongo.MongoClient(client_url)
    try:
        db = client[db_name]

        # A missing collection yields an empty frame instead of failing later
        # while trying to iterate over a non-existent result.
        if collection_name not in db.list_collection_names():
            print("The collection does not exist.")
            return pd.DataFrame()

        print("The collection exists for retrieving data.")
        # Materialize the cursor while the connection is still open.
        documents = list(db[collection_name].find())
    finally:
        # Release the connection even if the query raises.
        client.close()

    # Convert the collection data to a DataFrame and drop the '_id' column
    # if it exists.
    df = pd.DataFrame(documents)
    return df.drop(columns=['_id'], errors='ignore')

In [16]:
# Driver cell: make sure the dataset is stored in MongoDB, then read the
# collection back as a DataFrame.

# Dataset location and MongoDB target
DATA_FILE = "games.json"
CLIENT_URL = "mongodb://localhost:27017/"
DB_NAME = "games_database"
COLLECTION_NAME = "games"

# Load the dataset into MongoDB
load_dataset_into_mongodb(
    json_file=DATA_FILE,
    client_url=CLIENT_URL,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
)

# Pull the stored collection back out as a DataFrame
data = pull_dataframe_from_mongodb(
    client_url=CLIENT_URL,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
)

# Process Data Here
preview = data.head()
print("\n", preview)

Database Names:  ['admin', 'config', 'games_database', 'local']
games_database collection Names:  ['games']
Documents in collection:  97410
The collection already has data.
The collection exists.

                     name  release_date  required_age  price  dlc_count  \
0       Galactic Bowling  Oct 21, 2008             0  19.99          0   
1           Train Bandit  Oct 12, 2017             0   0.99          0   
2           Jolt Project  Nov 17, 2021             0   4.99          0   
3               Henosis™  Jul 23, 2020             0   5.99          0   
4  Two Weeks in Painland   Feb 3, 2020             0   0.00          0   

                                detailed_description  \
0  Galactic Bowling is an exaggerated and stylize...   
1  THE LAW!! Looks to be a showdown atop a train....   
2  Jolt Project: The army now has a new robotics ...   
3  HENOSIS™ is a mysterious 2D Platform Puzzler w...   
4  ABOUT THE GAME Play as a hacker who has arrang...   

                    

In [None]:
# Examine how each numeric variable in the dataset relates to the user score.
#
# For each numeric column, draw one scatter plot with that column on the
# x-axis and the user score on the y-axis.

import matplotlib.pyplot as plt

TARGET_COLUMN = 'user_score'

# Restrict the plots to numeric columns: text columns such as 'name' or
# 'detailed_description' have no meaningful position on a scatter axis.
numeric_columns = data.select_dtypes(include='number').columns

for column in numeric_columns:
    # Plotting the target against itself carries no information.
    if column == TARGET_COLUMN:
        continue

    # Keep only the rows where both values are present.
    pair_df = data[[column, TARGET_COLUMN]].dropna()

    fig, ax = plt.subplots()
    ax.scatter(pair_df[column], pair_df[TARGET_COLUMN])
    ax.set_xlabel(column)
    ax.set_ylabel('User Score')
    ax.set_title(f'Scatter plot of {column} vs User Score')
    plt.show()
    # Release the figure so memory does not grow with the number of columns.
    plt.close(fig)




KeyError: 'game_sales'