In [None]:
# One-off migration: copy tables from the PostgreSQL `games` database into the
# local SQLite file `revenue.sqlite`. The code is left commented out,
# presumably so the notebook can be re-run without a PostgreSQL server.
# SECURITY: the connection URL used to embed a real-looking password. It is
# redacted below; load credentials from an environment variable or a prompt
# instead of committing them, and rotate the password that was exposed.
# import pandas as pd
# from sqlalchemy import create_engine
# # PostgreSQL connection (fill in credentials at run time; never commit them)
# pg_engine = create_engine('postgresql+psycopg2://postgres:<PASSWORD>@localhost:5432/games')
# # SQLite connection
# sqlite_engine = create_engine('sqlite:///revenue.sqlite')
# # List of tables to migrate (change as needed)
# tables_to_migrate = ['revenue_data']
# for table in tables_to_migrate:
#     # Load table from PostgreSQL
#     df = pd.read_sql_table(table, pg_engine)
#     # Save to SQLite, replacing any existing table of the same name
#     df.to_sql(table, sqlite_engine, index=False, if_exists='replace')
#     print(f":white_check_mark: Migrated table: {table}")

:white_check_mark: Migrated table: revenue_data


In [1]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as pyo

# Load the full revenue_data table from the local SQLite file into `df`,
# then report the dtypes and the min/max of the price and revenue columns.
sqlite_file = 'revenue.sqlite'
query = "SELECT * FROM revenue_data"

# try/finally guarantees the connection is released even when the query
# raises (for example, when the table is missing).
conn = sqlite3.connect(sqlite_file)
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()

# print(df.head())
print(df.dtypes)

price_series = df['price']
min_price = price_series.min()
max_price = price_series.max()

revenue_series = df['revenue']
min_revenue = revenue_series.min()
max_revenue = revenue_series.max()

print(f"Minimum Price: {min_price}")
print(f"Maximum Price: {max_price}")
print(f"Price Range: {min_price} to {max_price}")
# The revenue lines were previously mislabeled as "Maximum Price" and
# "Price Range".
print(f"Minimum Revenue: {min_revenue}")
print(f"Maximum Revenue: {max_revenue}")
print(f"Revenue Range: {min_revenue} to {max_revenue}")

id                   int64
name                object
release_date        object
copies_sold         object
price              float64
revenue            float64
avg_playtime       float64
review_score         int64
publisher_class     object
publishers          object
developers          object
steam_id             int64
dtype: object
Minimum Price: 0.0
Maximum Price: 99.99
Price Range: 0.0 to 99.99
Minimum Revenue: 20674.0
Maximum Price: 837793356.0
Price Range: 20674.0 to 837793356.0


In [4]:
# Scatter plot of price vs. revenue. Used for analysis only; it is not shown
# on the dashboard.
# Coerce both plotted columns to float; values that cannot be parsed become
# NaN (errors='coerce'). The second line used to repeat the 'price'
# conversion verbatim, leaving 'revenue' unchecked.
df['price'] = pd.to_numeric(df['price'], errors='coerce').astype(float)
df['revenue'] = pd.to_numeric(df['revenue'], errors='coerce').astype(float)
fig1 = px.scatter(df, x='price', y='revenue', title='Price vs. Revenue',
                labels={'price': 'Price', 'revenue': 'Revenue'},
                hover_data=['name'])
fig1.show()

# pyo.plot(fig1, filename='archive/price_vs_revenue_plotly.html')

In [48]:
# Scatter plot: average playtime (x) against revenue (y), one point per row.
fig2 = px.scatter(
    data_frame=df,
    x='avg_playtime',
    y='revenue',
    labels=dict(avg_playtime='Average Playtime', revenue='Revenue'),
    title='Average Playtime vs. Revenue',
)
fig2.show()
# pyo.plot(fig2, filename='archive/avg_playtime_vs_revenue_plotly.html')

'archive/avg_playtime_vs_revenue_plotly.html'

In [5]:
# Bar chart: mean revenue per fixed-width bucket of average playtime.
bin_width = 20
max_playtime = int(df['avg_playtime'].max())
# Bins are half-open [lo, hi) because right=False, so the last edge must be
# strictly greater than the maximum. The previous stop of `max + 10` could
# leave the largest playtimes outside every bin (e.g. max=30 gave edges
# [0, 20]), which pd.cut turns into NaN and groupby then drops.
bins = list(range(0, max_playtime + bin_width + 1, bin_width))
labels = [f'{i}-{i + bin_width - 1}' for i in bins[:-1]]
df['playtime_bin'] = pd.cut(df['avg_playtime'], bins=bins, labels=labels, right=False)
# observed=False keeps empty bins on the axis and pins that behavior
# explicitly instead of relying on the pandas default for categorical keys.
grouped_df = df.groupby('playtime_bin', observed=False)['revenue'].mean().reset_index()

fig3 = px.bar(grouped_df, x='playtime_bin', y='revenue', title='Average Revenue by Average Playtime (Binned)',
            labels={'playtime_bin': 'Average Playtime (Bin)', 'revenue': 'Average Revenue'})
fig3.update_layout(xaxis_tickangle=-45)
fig3.show()
# pyo.plot(fig3, filename='archive/avg_playtime_vs_revenue_bar_plotly.html')





In [75]:
# Scatter plot of price vs. copies sold.
# copies_sold is loaded as object dtype. Parse it the same way as price so a
# missing or malformed entry becomes NaN instead of raising, as astype(int)
# would.
df['copies_sold'] = pd.to_numeric(df['copies_sold'], errors='coerce')
fig4 = px.scatter(df, x='price', y='copies_sold', title='Price vs. Copies Sold',
                labels={'price': 'Price', 'copies_sold': 'Copies Sold'},
                hover_data=['name'])
fig4.show()
# pyo.plot(fig4, filename='archive/price_vs_copies_sold_plotly.html')

'archive/price_vs_copies_sold_plotly.html'

In [54]:
# Top 10 games by copies sold, drawn as a horizontal bar chart.
# copies_sold is loaded as object dtype, so coerce it to numbers here rather
# than relying on another cell having converted it first: sorting the raw
# strings would rank them lexicographically. Rows whose count is missing or
# cannot be parsed are dropped before ranking.
df_filtered = (
    df.assign(copies_sold=pd.to_numeric(df['copies_sold'], errors='coerce'))
    .dropna(subset=['copies_sold'])
)
top10_sold = df_filtered.sort_values('copies_sold', ascending=False).head(10)

# Plot
fig5 = px.bar(top10_sold,x='copies_sold',y='name',orientation='h',
    title='Top 10 Games by Copies Sold',
    labels={'copies_sold': 'Copies Sold', 'name': 'Game Title'}
)
# Order the bars by value so the best seller is at the top.
fig5.update_layout(yaxis={'categoryorder': 'total ascending'})
fig5.show()
pyo.plot(fig5, filename='archive/top10_copies_sold_plotly.html')


'archive/top10_copies_sold_plotly.html'

In [57]:
# Total revenue grouped by release month (YYYY-MM).
# Build the month key as a separate Series instead of overwriting
# df['release_date'], so the full release dates stay available to other cells
# and this cell can be re-run safely. Dates that cannot be parsed become NaT
# (errors='coerce') and are excluded from the grouping.
release_month = (
    pd.to_datetime(df['release_date'], errors='coerce')
    .dt.strftime('%Y-%m')
    .rename('release_month')
)

# Sum revenue per month; groupby returns the YYYY-MM keys in sorted order.
grouped_df = df.groupby(release_month)['revenue'].sum().reset_index()

# Plot
fig6 = px.bar(grouped_df,x='release_month',y='revenue',
    title='Total Revenue by Release Month',
    labels={'release_month': 'Release Month (YYYY-MM)', 'revenue': 'Total Revenue'}
)
fig6.update_layout(xaxis_tickangle=-45)
fig6.show()

pyo.plot(fig6, filename='archive/revenue_by_release_month_plotly.html')

'archive/revenue_by_release_month_plotly.html'

In [63]:
# Horizontal bar chart of the ten highest-revenue games.
top10_revenue = df.sort_values(by='revenue', ascending=False).iloc[:10]
fig7 = px.bar(
    data_frame=top10_revenue,
    x='revenue',
    y='name',
    orientation='h',
    title='Top 10 Games by Revenue',
    labels=dict(revenue='Revenue', name='Game Title'),
)
# Order the bars by value so the top earner is drawn first from the top.
fig7.update_layout(yaxis_categoryorder='total ascending')
fig7.show()
pyo.plot(fig7, filename='archive/top_10_games_by_revenue.html')

'archive/top_10_games_by_revenue.html'