In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Load the merged dataframe produced by the earlier pipeline stage.
# NOTE: pickle files can execute code on load; only read files you created yourself.
MERGED_DATA_PATH = "../data/interim/merged_data.pkl"
merged_df = pd.read_pickle(MERGED_DATA_PATH)

In [None]:
# Display the freshly loaded dataframe to inspect its columns and rows
merged_df

In [None]:
# Parse the release dates, then keep only the calendar year in a new column
release_dates = pd.to_datetime(merged_df["release"])
merged_df["release_year"] = release_dates.dt.year


In [None]:
# Estimates copies sold using user reviews as a proxy, with a different
# reviews-to-sales multiplier depending on the release year
def estimate_copies_sold(row):
    """Estimate copies sold for one game from its review count.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``row["release_year"]`` and ``row["user_reviews"]``.

    Returns
    -------
    number
        ``user_reviews`` multiplied by the year-dependent multiplier:
        60 (before 2014), 50 (2014-2016), 40 (2017), 35 (2018-2019),
        30 (2020 and later). NaN when the release year is missing.
    """
    year = row["release_year"]
    reviews = row["user_reviews"]

    # A missing year fails every comparison below and would otherwise be
    # treated as a 2020+ release; report the estimate as unknown instead.
    if pd.isna(year):
        return np.nan

    if year < 2014:
        multiple = 60
    elif 2014 <= year <= 2016:
        multiple = 50
    elif year == 2017:
        multiple = 40
    elif 2018 <= year <= 2019:
        multiple = 35
    else:  # 2020 and later
        multiple = 30

    return reviews * multiple

# Run the estimator once per row (axis=1) and store the result as a new column
copies_sold_estimates = merged_df.apply(estimate_copies_sold, axis=1)
merged_df["copies_sold_reviews_proxy"] = copies_sold_estimates


In [None]:
# Estimated revenue = price x estimated copies sold (element-wise per game)
merged_df["estimated_revenue"] = merged_df["Price"].mul(
    merged_df["copies_sold_reviews_proxy"]
)

In [None]:
# Display the dataframe to check the newly added revenue-related columns
merged_df

In [None]:
# Create the log-transformed revenue with log1p so that zero revenues map to 0
# (a plain log(0) is undefined)
merged_df["log_estimated_revenue"] = np.log1p(merged_df["estimated_revenue"])

# The left panel uses a log x-axis, so its bins must be log-spaced too:
# 50 equal-width linear bins would lump almost every game into the first bar.
# Zero revenue has no position on a log axis, so only positive values are binned there.
revenue = merged_df["estimated_revenue"]
positive_revenue = revenue[revenue > 0]
# 51 edges -> 50 bins; geomspace hits both end points exactly
log_spaced_bins = np.geomspace(positive_revenue.min(), positive_revenue.max(), 51)

# Set up the figure with two side-by-side panels
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(12, 5))

# Plot 1 — Raw Revenue
# Raw estimated revenue on a log x-axis to show the wide range
ax_raw.hist(positive_revenue, bins=log_spaced_bins, edgecolor='black')
ax_raw.set_title("Estimated Revenue (Raw Scale)")
ax_raw.set_xlabel("Revenue ($)")
ax_raw.set_ylabel("Count")
ax_raw.set_xscale("log")

# Plot 2 — Log-Normalized Revenue
# log1p-transformed revenue on a linear axis to show the distribution shape
ax_log.hist(merged_df["log_estimated_revenue"], bins=50, edgecolor='black', color='orange')
ax_log.set_title("Estimated Revenue (Log-Normalized)")
ax_log.set_xlabel("log(1 + Revenue)")
ax_log.set_ylabel("Count")

fig.tight_layout()
plt.show()

# The two panels sit side by side so the raw and log-transformed distributions can be compared

In [None]:
# Flag free-to-play games (price exactly 0) so they can be separated from paid
# games in the revenue analysis
merged_df["f2p_flag"] = merged_df["Price"].eq(0)

In [None]:
# Display the dataframe to check the new f2p_flag column
merged_df

In [None]:
# Keep only games with a positive price; .copy() makes paid_df an independent
# frame so later column assignments do not touch merged_df
is_paid = merged_df["Price"].gt(0)
paid_df = merged_df[is_paid].copy()

In [None]:

# Same two histograms as before, restricted to paid games.
# The left panel uses a log x-axis, so its bins are log-spaced: equal-width
# linear bins would lump almost every game into the first bar.
# A paid game can still have zero estimated revenue (no reviews); zero has no
# position on a log axis, so only positive values are binned in that panel.
paid_revenue = paid_df["estimated_revenue"]
paid_positive_revenue = paid_revenue[paid_revenue > 0]
# 51 edges -> 50 bins; geomspace hits both end points exactly
paid_log_spaced_bins = np.geomspace(
    paid_positive_revenue.min(), paid_positive_revenue.max(), 51
)

# Set up the figure with two side-by-side panels
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(12, 5))

# Plot 1 — Raw Revenue
ax_raw.hist(paid_positive_revenue, bins=paid_log_spaced_bins, edgecolor='black')
ax_raw.set_title("Estimated Revenue (Raw Scale)")
ax_raw.set_xlabel("Revenue ($)")
ax_raw.set_ylabel("Count")
ax_raw.set_xscale("log")

# Plot 2 — Log-Normalized Revenue
ax_log.hist(paid_df["log_estimated_revenue"], bins=50, edgecolor='black', color='orange')
ax_log.set_title("Estimated Revenue (Log-Normalized)")
ax_log.set_xlabel("log(1 + Revenue)")
ax_log.set_ylabel("Count")

fig.tight_layout()
plt.show()

# Excluding free-to-play games removes the spike of zero-revenue titles from the distribution

In [None]:
# Scatter of paid games only: user reviews against estimated revenue, with both
# axes on a log scale because each spans several orders of magnitude
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    paid_df["user_reviews"],
    paid_df["estimated_revenue"],
    alpha=0.6,
    color='orange',
    edgecolor='k',
)
ax.set_title("Game Estimated Revenue vs User Reviews (Paid Games)")
ax.set_xlabel("User Reviews")
ax.set_ylabel("Estimated Revenue ($)")
ax.set_xscale("log")
ax.set_yscale("log")
ax.grid(True, linestyle='--', alpha=0.6)
plt.show()



In [None]:
# Create review bins with logarithmic spacing to handle the wide range of review counts.
# np.geomspace(start, stop, num) returns `num` edges evenly spaced on a log scale
# and hits both end points exactly, so the game with the most reviews is never
# lost to floating-point round-off. 30 edges define 29 bins.
review_counts = paid_df["user_reviews"]
# A log scale cannot start at 0, so clamp the lowest edge to 1. Unlike adding 1
# to the minimum, this keeps the games with the fewest reviews inside the first bin.
lowest_edge = max(review_counts.min(), 1)
bins = np.geomspace(lowest_edge, review_counts.max(), 30)

# review_bin categorizes each game's review count into one of these bins.
# include_lowest=True closes the first interval on the left so a count equal to
# the lowest edge is binned as well. Counts below the lowest edge (i.e. 0) get NaN.
paid_df["review_bin"] = pd.cut(review_counts, bins=bins, include_lowest=True)


In [None]:
# Display the bin assigned to each paid game to sanity-check the binning
paid_df["review_bin"]

In [None]:
# Average estimated revenue within each review bin; observed=True keeps only
# bins that actually contain games
revenue_by_bin = paid_df.groupby("review_bin", observed=True)["estimated_revenue"]
trend = revenue_by_bin.mean().reset_index()

# x-position for plotting each bin: the mid value of its interval
bin_midpoints = []
for review_interval in trend["review_bin"]:
    bin_midpoints.append(review_interval.mid)
trend["midpoint"] = bin_midpoints

In [None]:
# Display the per-bin averages and midpoints that feed the trend line
trend

In [None]:
# Raw points plus a smoothed trend line (mean estimated revenue per review bin)
fig, ax = plt.subplots(figsize=(10, 6))

# Individual games in light gray, kept in the background for context
ax.scatter(
    paid_df["user_reviews"],
    paid_df["estimated_revenue"],
    alpha=0.3,
    color='lightgray',
    s=10,
)
# Binned average drawn on top as the trend line
ax.plot(trend["midpoint"], trend["estimated_revenue"], color='steelblue', linewidth=3)

ax.set_xscale("log")
ax.set_yscale("log")
ax.set_title("Smoothed Relationship: User Reviews vs Estimated Revenue")
ax.set_xlabel("User Reviews")
ax.set_ylabel("Estimated Revenue ($)")
ax.grid(True, linestyle="--", alpha=0.6)
plt.show()

In [None]:
# Display merged_df once more; the review_bin column was added to the paid_df
# copy only, so it does not appear here
merged_df