In [1]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session: getOrCreate() hands back an already
# running session when one exists, otherwise it starts one named 'TestSession'.
spark = (
    SparkSession.builder
    .appName('TestSession')
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

In [3]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, BooleanType, MapType, ArrayType

# Explicit schema for the traffic JSON, built with the fluent StructType.add()
# API. Every field is declared nullable (third argument True).
# Field names are runtime keys and are reproduced exactly as declared
# (including the "Engagments" spelling and the space in "Paid Referrals").
schema = (
    StructType()
    .add("Version", IntegerType(), True)
    .add("SiteName", StringType(), True)
    .add("Description", StringType(), True)
    # Per-country share entries: numeric country id, country code, share value.
    .add(
        "TopCountryShares",
        ArrayType(
            StructType()
            .add("Country", IntegerType(), True)
            .add("CountryCode", StringType(), True)
            .add("Value", DoubleType(), True)
        ),
        True,
    )
    .add("Title", StringType(), True)
    # All engagement metrics are declared as strings in this schema.
    .add(
        "Engagments",
        StructType()
        .add("BounceRate", StringType(), True)
        .add("Month", StringType(), True)
        .add("Year", StringType(), True)
        .add("PagePerVisit", StringType(), True)
        .add("Visits", StringType(), True)
        .add("TimeOnSite", StringType(), True),
        True,
    )
    # Map of string key -> integer visit count.
    # NOTE(review): IntegerType is 32-bit; counts above ~2.1e9 would not fit.
    # Consider LongType if larger sites are ever loaded -- confirm with data.
    .add("EstimatedMonthlyVisits", MapType(StringType(), IntegerType()), True)
    .add(
        "GlobalRank",
        StructType().add("Rank", IntegerType(), True),
        True,
    )
    .add(
        "CountryRank",
        StructType()
        .add("Country", StringType(), True)
        .add("CountryCode", StringType(), True)
        .add("Rank", IntegerType(), True),
        True,
    )
    .add(
        "CategoryRank",
        StructType()
        .add("Rank", IntegerType(), True)
        .add("Category", StringType(), True),
        True,
    )
    .add("IsSmall", BooleanType(), True)
    .add("Policy", IntegerType(), True)
    # One double-valued field per traffic channel.
    .add(
        "TrafficSources",
        StructType()
        .add("Social", DoubleType(), True)
        .add("Paid Referrals", DoubleType(), True)
        .add("Mail", DoubleType(), True)
        .add("Referrals", DoubleType(), True)
        .add("Search", DoubleType(), True)
        .add("Direct", DoubleType(), True),
        True,
    )
    .add("Category", StringType(), True)
    .add("LargeScreenshot", StringType(), True)
    .add("IsDataFromGa", BooleanType(), True)
    .add(
        "Countries",
        ArrayType(
            StructType()
            .add("Code", StringType(), True)
            .add("UrlCode", StringType(), True)
            .add("Name", StringType(), True)
        ),
        True,
    )
    # Competitors wraps a list of structs that each carry a single Domain string.
    .add(
        "Competitors",
        StructType().add(
            "TopSimilarityCompetitors",
            ArrayType(StructType().add("Domain", StringType(), True)),
            True,
        ),
        True,
    )
)


In [4]:
# Location of the JSON snapshot to load. Held in one named constant so the
# path can be adjusted in a single place without touching the read logic.
# NOTE(review): this is an absolute, machine-specific path; consider deriving
# it from a configurable data directory.
SIMILARWEB_JSON_PATH = r'C:\Users\sbhuv\Downloads\Steam-Data-Engineering-Project\Steam-Data-Engineering-Project\data\monthly_data\2023-08-04_similarweb_data.json'

try:
    # multiLine=True is passed so a JSON document spread over several lines is
    # parsed as one record; the explicit schema avoids schema inference.
    steam_traffic = spark.read.json(
        SIMILARWEB_JSON_PATH,
        multiLine=True,
        schema=schema,
    )
    steam_traffic.cache()
    steam_traffic.show()
except Exception as e:
    print("An error occurred while reading the JSON file:", e)
    # Re-raise instead of swallowing: if the load failed, `steam_traffic` is
    # missing or unusable, and letting the notebook continue would only
    # surface later as a confusing NameError in the cells that depend on it.
    raise


+-------+--------------------+--------------------+--------------------+----------------+--------------------+----------------------+----------+------------------+------------+-------+------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+
|Version|            SiteName|         Description|    TopCountryShares|           Title|          Engagments|EstimatedMonthlyVisits|GlobalRank|       CountryRank|CategoryRank|IsSmall|Policy|      TrafficSources|            Category|     LargeScreenshot|IsDataFromGa|           Countries|         Competitors|
+-------+--------------------+--------------------+--------------------+----------------+--------------------+----------------------+----------+------------------+------------+-------+------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+
|      1|store.steampowere...|steam is the ulti...|[{840, US, 0.1445..

In [7]:
# Pull each metric out of the nested "Engagments" struct into its own
# top-level column, in the order listed below.
engagements_data = steam_traffic.select(
    *(
        f"Engagments.{metric}"
        for metric in (
            "BounceRate",
            "Month",
            "Year",
            "PagePerVisit",
            "Visits",
            "TimeOnSite",
        )
    )
)

# Render the flattened engagement metrics.
engagements_data.show()

+-------------------+-----+----+-----------------+------------------+------------------+
|         BounceRate|Month|Year|     PagePerVisit|            Visits|        TimeOnSite|
+-------------------+-----+----+-----------------+------------------+------------------+
|0.48607075874857913|    6|2023|4.539245279963454|172451708.64032623|202.83973499447066|
+-------------------+-----+----+-----------------+------------------+------------------+



In [8]:
# Narrow the DataFrame down to the single map column of monthly visit estimates.
estimated_monthly_visits = steam_traffic.select(
    steam_traffic["EstimatedMonthlyVisits"]
)

# truncate=False prints the complete map rather than an abbreviated cell.
estimated_monthly_visits.show(truncate=False)

+---------------------------------------------------------------------------+
|EstimatedMonthlyVisits                                                     |
+---------------------------------------------------------------------------+
|{2023-04-01 -> 172299056, 2023-05-01 -> 165019977, 2023-06-01 -> 172451708}|
+---------------------------------------------------------------------------+



In [9]:
# Isolate the nested TrafficSources struct as a one-column DataFrame and
# print it without abbreviating the cell contents.
traffic_sources = steam_traffic.select(["TrafficSources"])
traffic_sources.show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------+
|TrafficSources                                                                                                                 |
+-------------------------------------------------------------------------------------------------------------------------------+
|{0.06221286220003415, 0.0040070853258729906, 0.02070590728941937, 0.09180929101643757, 0.45005046953609584, 0.3712143846321399}|
+-------------------------------------------------------------------------------------------------------------------------------+

