In [1]:
from pyspark.sql import SparkSession
import os
spark = SparkSession.builder.appName('weekly_spark').getOrCreate()
#spark

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, ArrayType
from pyspark.sql.functions import length, col, count, expr, monotonically_increasing_id, lit

reviews_schema = StructType([
    StructField("App ID", IntegerType(), True),
    StructField("Review", StringType(), True),
    StructField("Voted Up", StringType(), True)
])


top_sellers_schema = StructType([
    StructField("Rank", IntegerType(), True),
    StructField("Game Name", StringType(), True),
    StructField("Free to Play", IntegerType(), True),
    ])

top_sellers_appids_schema = StructType([
    StructField("App ID", IntegerType(), True),
])

In [3]:
WEEKLY_DATA_PATH = r'../data/weekly_data/'
reviews_path = os.path.join(WEEKLY_DATA_PATH, 'reviews/')
FILE_DATE = None

try:
    csv_files = [f for f in os.listdir(reviews_path) if f.endswith('.csv')]
    if csv_files:
        csv_file = csv_files[0]
        FILE_DATE = csv_file[0].split('.')[0].split('_')[0]
        csv_file_path = os.path.join(reviews_path, csv_file)
        most_daily_played = spark.read.csv(csv_file_path, header=True, schema=reviews_schema)
    else:
        print("No CSV files found in the 'reviews_path' directory.")
except Exception as e:
    print("An error occurred while reading the CSV file:", e)

In [4]:
# Cleaning the data
most_daily_played = most_daily_played.na.drop(subset=["Review", "Voted Up", "App ID"])
most_daily_played = most_daily_played.filter(length(col("Review")) >= 2)

# Counting the number of positive and negative reviews
counted_reviews = most_daily_played.groupBy("App ID").pivot("Voted Up", ["pos", "neg"]).agg(count("*").alias("count"))

# Seprarating the positive and negative reviews
neg_reviews_df = most_daily_played.filter(most_daily_played["Voted Up"] == "neg")
pos_reviews_df = most_daily_played.filter(most_daily_played["Voted Up"] == "pos")

neg_reviews_df = neg_reviews_df.withColumn("FILE_DATE", lit(FILE_DATE))
pos_reviews_df = pos_reviews_df.withColumn("FILE_DATE", lit(FILE_DATE))
counted_reviews = counted_reviews.withColumn("FILE_DATE", lit(FILE_DATE))

In [5]:
#Top_sellers
WEEKLY_TOP_SELLERS_PATH = WEEKLY_DATA_PATH + r'top_sellers/'
files = os.listdir(WEEKLY_TOP_SELLERS_PATH)

FILE_DATE = None
try:
    csv_file1 = [f for f in files if f.endswith('weekly_top_sellers.csv')]
    FILE_DATE = csv_file1[0].split('.')[0].split('_')[0]
    top_sellers_games = spark.read.csv(
        WEEKLY_TOP_SELLERS_PATH + csv_file1[0],
        header=True,
        schema=top_sellers_schema  
    )
    csv_file2 = [f for f in files if f.endswith('weekly_top_sellers_appIds.csv')]
    FILE_DATE = csv_file2[0].split('.')[0].split('_')[0]
    top_sellers_appids = spark.read.csv(
        WEEKLY_TOP_SELLERS_PATH + csv_file2[0],
        header=True,
        schema=top_sellers_appids_schema  
    )
    top_sellers_appids = top_sellers_appids.withColumn(
    "Rank",(monotonically_increasing_id() + 1).cast("int"))
    top_sellers = top_sellers_games.join(top_sellers_appids, on=["Rank"], how="inner")
    top_sellers = top_sellers.withColumn("FILE_DATE", lit(FILE_DATE))
    #top_sellers.show()   

except Exception as e:
    print("An error occurred while reading the CSV file:", e)

In [6]:
#news
import pandas as pd
import json
import re

WEEKLY_NEWS_PATH = WEEKLY_DATA_PATH + r'news/'
files = os.listdir(WEEKLY_NEWS_PATH)
dfs = []

try:
    for file in files:
        with open(WEEKLY_NEWS_PATH + file, 'r') as json_file:
            json_data = json.load(json_file)

            df = pd.DataFrame() 
            for i in range(len(json_data['appnews']['newsitems'])):
                column_name = f"contents_{i}" 
                value = json_data['appnews']['newsitems'][i]['contents']
                if isinstance(value, (list, dict)):
                    value = str(value)
                df[column_name] = [value]
            df['App ID'] = json_data['appnews']['appid']
            dfs.append(df)

except Exception as e:
    print("An error occurred while reading the JSON file:", e)
    
news_df = pd.concat(dfs, ignore_index=True)


columns_to_iterate = news_df.columns[:-1]
for column in columns_to_iterate:
    for index, cell in enumerate(news_df[column]):
        if cell is not None:
            cleaned_text = re.sub(r'<strong>|</strong>|<a>|</a>|<a\s+href\s*=\s*".*?"\s*>\s*|https://.*|\{STEAM_CLAN_IMAGE\}.*', '', cell)
            news_df.at[index, column] = cleaned_text

news_df.to_csv(r'../data/weekly_data/news/spark_modified_news.csv', index=False, sep='\t')
news_path = os.path.join(WEEKLY_DATA_PATH, 'news/')

try:
    csv_files = [f for f in os.listdir(news_path) if f.endswith('.csv')]
    if csv_files:
        csv_file = csv_files[0]
        FILE_DATE = csv_file.split('.')[0].split('_')[0]  
        csv_file_path = os.path.join(news_path, csv_file)
        news_spark_df = spark.read.option("delimiter", "\t").csv(csv_file_path, header=True)
    else:
        print("No CSV files found in the 'news_path' directory.")
except Exception as e:
    print("An error occurred while reading the CSV file:", e)


An error occurred while reading the JSON file: 'charmap' codec can't decode byte 0x81 in position 15334: character maps to <undefined>


In [7]:
neg_reviews_path = r"../saved_data/weekly_data/neg_reviews"
pos_reviews_path = r"../saved_data/weekly_data/pos_reviews"
counted_reviews_path = r"../saved_data/weekly_data/counted_reviews"
top_sellers_path = r"../saved_data/weekly_data/top_sellers"
news_spark_path = r"../saved_data/weekly_data/news_spark_df"

# Save the DataFrame as CSV
neg_reviews_df.write.format("csv").mode("overwrite").option("header", "true").save(neg_reviews_path)
pos_reviews_df.write.format("csv").mode("overwrite").option("header", "true").save(pos_reviews_path)
counted_reviews.write.format("csv").mode("overwrite").option("header", "true").save(counted_reviews_path)
top_sellers.write.format("csv").mode("overwrite").option("header", "true").save(top_sellers_path)
news_spark_df.write.format("csv").mode("overwrite").option("header", "true").save(news_spark_path)

AttributeError: 'str' object has no attribute 'write'