In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the two raw CSVs that need no special parsing options.
main_df, platform_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/initial_games_data.csv",
        "../data/raw/platform_game_data.csv",
    )
)

# index_col=False tells pandas not to treat the first column as the index.
categories_tags_and_genre_df = pd.read_csv(
    "../data/raw/games.csv",
    index_col=False,
)

In [None]:
# Report how many rows each of the three loaded datasets contains.
row_count_summary = f"Main Dataset: {main_df.shape[0]} Second dataset: {platform_df.shape[0]} Third dataset: {categories_tags_and_genre_df.shape[0]}"
print(row_count_summary)

In [None]:
# Trim dataset 3 down to the attributes we keep, then rename its key column
# from "AppID" to "app_id" so it matches the key used by the other frames.
categories_tags_and_genre_df = (
    categories_tags_and_genre_df
    .loc[:, ["AppID", "Categories", "Genres", "Tags", "Price"]]
    .rename(columns={"AppID": "app_id"})
)

In [None]:
# Discard the leftover "Unnamed: 0" index column, then keep only the first
# four of the remaining columns (selected by position).
main_df = (
    main_df
    .drop(columns=["Unnamed: 0"])
    .iloc[:, :4]
)

In [None]:
# A row is a repeat when its link already appeared in an earlier row;
# keep only the first occurrence of every link.
is_repeat_link = main_df.duplicated(subset='link', keep='first')
main_df = main_df[~is_repeat_link]

In [None]:
# Extract the numeric app id from each link (the digits in ".../app/<digits>/...").
# expand=False makes str.extract return a Series instead of a one-column DataFrame.
app_id_text = main_df["link"].str.extract(r'/app/(\d+)/', expand=False)

# Links without a "/app/<digits>/" segment (or missing links) produce NaN, and
# astype(int) cannot convert NaN. Report and drop those rows so the cast below
# is always safe; when every link matches, nothing is dropped.
has_app_id = app_id_text.notna()
if not has_app_id.all():
    print(f"Dropping {(~has_app_id).sum()} row(s) whose link has no /app/<id>/ segment")
main_df = main_df.loc[has_app_id].copy()

# Store the id as an integer so it can be used as a merge key.
main_df["app_id"] = app_id_text.loc[has_app_id].astype(int)

In [None]:
# The link column is no longer needed now that app_id has been derived from it.
main_df = main_df.drop('link', axis=1)

In [None]:
# Purely cosmetic: move 'app_id' to the front and keep every other column
# in its existing relative order.
cols = ['app_id']
cols.extend(name for name in main_df.columns if name != 'app_id')
main_df = main_df.loc[:, cols]

In [None]:
# Columns of the platform dataset that are not carried into the merged data.
unused_platform_columns = [
    "rating",
    "positive_ratio",
    "price_original",
    "price_final",
    "discount",
    "steam_deck",
]
platform_df = platform_df.drop(columns=unused_platform_columns)

In [None]:
# Inner-join the main and platform datasets on their shared app_id key;
# only games present in both frames are kept.
merged_df = main_df.merge(platform_df, on='app_id', how='inner')

In [None]:
# Count how many values in the "game" and "title" columns repeat an earlier
# value in the same column.
game_dupes, title_dupes = (
    merged_df[column].duplicated().sum()
    for column in ("game", "title")
)

In [None]:
# Remove "game" and "date_release" (per the original note, title and
# release_year already cover them), then inner-join the third dataset to
# bring in categories, genres, tags and price for each app_id.
merged_df = (
    merged_df
    .drop(columns=["game", "date_release"])
    .merge(categories_tags_and_genre_df, on="app_id", how="inner")
)

# Preview the first ten rows of the merged result.
print(merged_df.head(10))


In [None]:
# Persist the merged frame as a pickle file in the interim data folder.
interim_pickle_path = "../data/interim/initial_merged_data.pkl"
merged_df.to_pickle(interim_pickle_path)