In [17]:
from pyspark.sql import SparkSession
import os
# Obtain the SparkSession for this notebook; getOrCreate() hands back an
# already-running session if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('weekly_spark')
    .getOrCreate()
)
# Bare trailing expression so the notebook renders the session summary.
spark

In [18]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, ArrayType
from pyspark.sql.functions import length, col, count, expr

# Explicit schema for the weekly reviews CSV: one row per review, keyed by
# the app it belongs to. All three columns are nullable; "Voted Up" is kept
# as a string because it is pivoted on its literal values later on.
reviews_schema = (
    StructType()
    .add("App ID", IntegerType(), True)
    .add("Review", StringType(), True)
    .add("Voted Up", StringType(), True)
)

In [19]:
# Locate and load the weekly reviews CSV into `most_daily_played`.
WEEKLY_DATA_PATH = r'../data/weekly_data/'
reviews_path = os.path.join(WEEKLY_DATA_PATH, 'reviews/')
FILE_DATE = None

# os.listdir() returns entries in arbitrary order, so sort to make the
# "first CSV" choice reproducible across runs and machines.
csv_files = sorted(f for f in os.listdir(reviews_path) if f.endswith('.csv'))

# Fail here, with a clear message, rather than printing and continuing:
# the following cells use `most_daily_played` unconditionally, so a swallowed
# error would only resurface later as a NameError (or, worse, silently reuse
# a stale DataFrame left in the kernel by an earlier run).
if not csv_files:
    raise FileNotFoundError(
        "No CSV files found in the 'reviews_path' directory: " + reviews_path
    )

csv_file = csv_files[0]
csv_file_path = os.path.join(reviews_path, csv_file)
# Read errors (missing/unreadable file) are intentionally left to propagate.
most_daily_played = spark.read.csv(csv_file_path, header=True, schema=reviews_schema)

In [21]:
# Keep only complete rows whose review text has at least two characters.
required_columns = ["Review", "Voted Up", "App ID"]
most_daily_played = (
    most_daily_played
    .dropna(subset=required_columns)
    .where(length(col("Review")) >= 2)
)

# One row per app with a "pos" and a "neg" column holding the number of
# reviews carrying that "Voted Up" value; apps lacking one of the two
# values get 0 instead of null.
counted_reviews = (
    most_daily_played
    .groupBy("App ID")
    .pivot("Voted Up", ["pos", "neg"])
    .agg(count("*").alias("count"))
    .na.fill(0)
)

In [22]:
# Print the pivoted table: one row per "App ID" with its "pos" and "neg" review counts.
counted_reviews.show()

+-------+---+---+
| App ID|pos|neg|
+-------+---+---+
|1272080| 15| 15|
|1716740| 17| 10|
|1085660| 17| 20|
|1599340| 15| 14|
|1627720| 17|  9|
|1971870| 18| 16|
|    730| 14| 15|
|1172470| 18| 16|
|1086940| 19| 15|
+-------+---+---+



In [23]:
# List the CSV file names inside the reviews directory.
# `reviews_path` is a string, so iterating over it directly walks its
# characters (never matching '.csv'); the directory has to be listed first.
csv_file = [f for f in os.listdir(reviews_path) if f.endswith('.csv')]

In [24]:
# Echo the reviews directory path to check which location is being scanned.
reviews_path

'../data/weekly_data/reviews/'

In [25]:
#Top_sellers
# Load the weekly top-sellers CSV into `top_sellers` and derive FILE_DATE
# from the file name (the part before the first '_' of the stem).
schema_top_sellers = StructType([
    StructField("Rank", IntegerType(), True),
    StructField("Game Name", StringType(), True),
    StructField("Free to Play", IntegerType(), True),
    StructField("App ID", IntegerType(), True),
    StructField("Collection Date", StringType(), True)
    ])
# NOTE(review): the rendered output of this cell shows "App ID" and
# "Collection Date" as null on every row, which suggests this schema does not
# line up with the columns actually present in the CSV — confirm against the
# file before relying on those two fields.

WEEKLY_TOP_SELLERS_PATH = WEEKLY_DATA_PATH + r'top_sellers/'
files = os.listdir(WEEKLY_TOP_SELLERS_PATH)

FILE_DATE = None
# Bound up front so later cells see None instead of a NameError (or a stale
# DataFrame from a previous run) when nothing could be loaded.
top_sellers = None

# Sorted because os.listdir() order is arbitrary; keeps the pick reproducible.
csv_file = sorted(f for f in files if f.endswith('.csv'))
if not csv_file:
    # Previously this case surfaced as an IndexError reported as a read error.
    print("No CSV files found in", WEEKLY_TOP_SELLERS_PATH)
else:
    file = csv_file[0]
    FILE_DATE = file.split('.')[0].split('_')[0]
    try:
        top_sellers = spark.read.csv(
            os.path.join(WEEKLY_TOP_SELLERS_PATH, file),
            header=True,
            schema=schema_top_sellers
        )
        top_sellers.cache()
        top_sellers.show()
    except Exception as e:
        print("An error occurred while reading the CSV file:", e)

+----+--------------------+------------+------+---------------+
|Rank|           Game Name|Free to Play|App ID|Collection Date|
+----+--------------------+------------+------+---------------+
|   1|CounterStrike Glo...|           1|  null|           null|
|   2|          Steam Deck|           0|  null|           null|
|   3|           Starfield|           0|  null|           null|
|   4|      Baldurs Gate 3|           0|  null|           null|
|   5|     Mortal Kombat 1|           0|  null|           null|
|   6|            PAYDAY 3|           0|  null|           null|
|   7|            Lost Ark|           1|  null|           null|
|   8|           Lies of P|           0|  null|           null|
|   9|           Destiny 2|           1|  null|           null|
|  10|        Apex Legends|           1|  null|           null|
|  11|      Cyberpunk 2077|           0|  null|           null|
|  12|        Call of Duty|           0|  null|           null|
|  13|Cyberpunk 2077 Ph...|           0|

In [26]:
#news
# Read every weekly news JSON file with a fixed schema and union them into
# `merged_df` (one row per file, holding the nested "appnews" struct).
weekly_top_news_schema = StructType([
    StructField("appnews", StructType([
        StructField("appid", IntegerType(), nullable=False),
        StructField("newsitems", ArrayType(StructType([
            StructField("gid", StringType(), nullable=False),
            StructField("title", StringType(), nullable=False),
            StructField("url", StringType(), nullable=False),
            StructField("is_external_url", BooleanType(), nullable=False),
            StructField("author", StringType(), nullable=False),
            StructField("contents", StringType(), nullable=False),
            StructField("feedlabel", StringType(), nullable=False),
            StructField("date", IntegerType(), nullable=False),
            StructField("feedname", StringType(), nullable=False),
            StructField("feed_type", IntegerType(), nullable=False),
            StructField("appid", IntegerType(), nullable=False),
            StructField("tags", ArrayType(StringType(), containsNull=False), nullable=True)
        ]), containsNull=True), nullable=False),
        StructField("count", IntegerType(), nullable=False)
    ]), nullable=True)
])

merged_df = None
WEEKLY_TOP_NEWS_PATH = WEEKLY_DATA_PATH + r'news/'
files = os.listdir(WEEKLY_TOP_NEWS_PATH)
json_files = [pos_json for pos_json in files if pos_json.endswith('.json')]

# Report the empty case explicitly; merged_df stays None in that situation.
if not json_files:
    print('No json files found')

try:
    for file in json_files:
        # `steam_game_news` is left bound to the last file read after the loop.
        steam_game_news = spark.read.json(
            WEEKLY_TOP_NEWS_PATH + file,
            multiLine=True,
            schema = weekly_top_news_schema
        )
        if merged_df is None:
            merged_df = steam_game_news
        else:
            merged_df = merged_df.union(steam_game_news)
except Exception as e:
    # Surface the real failure instead of claiming that no files were found;
    # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
    print("An error occurred while reading the JSON files:", e)


In [27]:
# Preview the first 5 rows of the unioned news DataFrame.
# merged_df is still None if the news cell read no JSON file, in which case this call fails.
merged_df.show(5)

+--------------------+
|             appnews|
+--------------------+
|{730, [{521916535...|
|{1675200, [{52191...|
|{1716740, [{52191...|
|{1086940, [{52191...|
|{1971870, [{52191...|
+--------------------+
only showing top 5 rows



In [28]:
# `steam_game_news` is the loop variable of the news cell, so this shows only
# the last JSON file that was read, not the merged result.
steam_game_news.show()

+--------------------+
|             appnews|
+--------------------+
|{1172470, [{54803...|
+--------------------+



In [29]:
#reviews
# Load the first '.txt' file found in the reviews directory as a
# one-column text DataFrame and display it.
WEEKLY_TOP_10_REVIEWS_PATH = WEEKLY_DATA_PATH + r'reviews/'
# List the same directory the files are read from below; listing
# WEEKLY_DATA_PATH (the parent) would yield names that do not exist under
# the reviews directory.
files = os.listdir(WEEKLY_TOP_10_REVIEWS_PATH)
# Bound up front so the name exists (as None) when no '.txt' file is present.
reviews = None
try:
    # Name kept for compatibility; it holds '.txt' file names, not CSVs.
    csv_file = [f for f in files if f.endswith('.txt')]
    # Only the first match is loaded.
    for file in csv_file[:1]:
        reviews = spark.read.text(
            WEEKLY_TOP_10_REVIEWS_PATH + file,
        )
        reviews.cache()
        reviews.show()
except Exception as e:
    print("An error occurred while reading the text file:", e)